From f07b575c2f8efb11d4ad1618661125d01a20ed29 Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Fri, 18 Nov 2022 19:17:19 +0100 Subject: [PATCH 01/24] initial commit --- mpi/examples/Makefile.am | 4 + mpi/examples/mpi_redux/mpi_redux_autowrapup.c | 234 ++++++++++++++++++ mpi/src/starpu_mpi.c | 2 + mpi/src/starpu_mpi_task_insert.c | 56 +++++ mpi/src/starpu_mpi_task_insert.h | 1 + 5 files changed, 297 insertions(+) create mode 100644 mpi/examples/mpi_redux/mpi_redux_autowrapup.c diff --git a/mpi/examples/Makefile.am b/mpi/examples/Makefile.am index deb188c254..57426efefc 100644 --- a/mpi/examples/Makefile.am +++ b/mpi/examples/Makefile.am @@ -293,15 +293,19 @@ endif examplebin_PROGRAMS += \ mpi_redux/mpi_redux \ + mpi_redux/mpi_redux_autowrapup \ mpi_redux/mpi_redux_tree mpi_redux_mpi_redux_LDADD = \ -lm +mpi_redux_mpi_redux_autowrapup_LDADD = \ + -lm mpi_redux_mpi_redux_tree_LDADD = \ -lm if !STARPU_SIMGRID starpu_mpi_EXAMPLES += \ mpi_redux/mpi_redux \ + mpi_redux/mpi_redux_autowrapup \ mpi_redux/mpi_redux_tree endif diff --git a/mpi/examples/mpi_redux/mpi_redux_autowrapup.c b/mpi/examples/mpi_redux/mpi_redux_autowrapup.c new file mode 100644 index 0000000000..643fef7777 --- /dev/null +++ b/mpi/examples/mpi_redux/mpi_redux_autowrapup.c @@ -0,0 +1,234 @@ +/* StarPU --- Runtime system for heterogeneous multicore architectures. + * + * Copyright (C) 2016-2022 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria + * + * StarPU is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + * + * StarPU is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * See the GNU Lesser General Public License in COPYING.LGPL for more details. + */ + +/* + * This example illustrates how to use the STARPU_MPI_REDUX mode + * and compare it with the standard STARPU_REDUX. + * + * In order to make this comparison salliant, the init codelet is not + * a task that set the handle to a neutral element but rather depends + * on the working node. + * This is not a proper way to use a reduction pattern however it + * can be analogous to the cost/weight of each contribution. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <assert.h> +#include <math.h> +#include <starpu.h> +#include <starpu_mpi.h> +#include "helper.h" +#include <unistd.h> + +static void cl_cpu_read(void *handles[], void*arg) +{ + (void) arg; + (void) handles; +} + +static struct starpu_codelet read_cl = +{ + .cpu_funcs = { cl_cpu_read }, + .nbuffers = 1, + .modes = { STARPU_R }, + .name = "task_read" +}; +static void cl_cpu_work(void *handles[], void*arg) +{ + (void)arg; + double *a = (double *)STARPU_VARIABLE_GET_PTR(handles[0]); + double *b = (double *)STARPU_VARIABLE_GET_PTR(handles[1]); + starpu_sleep(0.01); + printf("work_cl (rank:%d,worker:%d) %f =>",starpu_mpi_world_rank(), starpu_worker_get_id(), *a); + *a = 3.0 + *a + *b; + printf("%f\n",*a); +} + +static struct starpu_codelet work_cl = +{ + .cpu_funcs = { cl_cpu_work }, + .nbuffers = 2, + .modes = { STARPU_REDUX, STARPU_R }, + .name = "task_init" +}; + +static struct starpu_codelet mpi_work_cl = +{ + .cpu_funcs = { cl_cpu_work }, + .nbuffers = 2, + .modes = { STARPU_RW | STARPU_COMMUTE, STARPU_R }, + .name = "task_init-mpi" +}; + +static void cl_cpu_task_init(void *handles[], void*arg) +{ + (void) arg; + double *a = (double *)STARPU_VARIABLE_GET_PTR(handles[0]); + starpu_sleep(0.005); + printf("init_cl (rank:%d,worker:%d) %d (was %f)\n", starpu_mpi_world_rank(), starpu_worker_get_id(), starpu_mpi_world_rank(), +#ifdef STARPU_HAVE_VALGRIND_H + RUNNING_ON_VALGRIND ? 0. : +#endif + *a); + *a = starpu_mpi_world_rank(); +} + +static struct starpu_codelet task_init_cl = +{ + .cpu_funcs = { cl_cpu_task_init }, + .nbuffers = 1, + .modes = { STARPU_W }, + .name = "task_init" +}; + +static void cl_cpu_task_red(void *handles[], void*arg) +{ + (void) arg; + double *ad = (double *)STARPU_VARIABLE_GET_PTR(handles[0]); + double *as = (double *)STARPU_VARIABLE_GET_PTR(handles[1]); + starpu_sleep(0.01); + printf("red_cl (rank:%d,worker:%d) %f ; %f --> %f\n", starpu_mpi_world_rank(), starpu_worker_get_id(), *as, *ad, *as+*ad); + *ad = *ad + *as; +} + +static struct starpu_codelet task_red_cl = +{ + .cpu_funcs = { cl_cpu_task_red }, + .nbuffers = 2, + .modes = { STARPU_RW|STARPU_COMMUTE, STARPU_R }, + .name = "task_red" +}; + +int main(int argc, char *argv[]) +{ + int comm_rank, comm_size; + /* Initializes STarPU and the StarPU-MPI layer */ + starpu_fxt_autostart_profiling(0); + int ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL); + STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_ini_conft"); + + int nworkers = starpu_cpu_worker_get_count(); + if (nworkers < 2) + { + FPRINTF(stderr, "We need at least 2 CPU worker per node.\n"); + starpu_mpi_shutdown(); + return STARPU_TEST_SKIPPED; + } + printf("there are %d workers\n", nworkers); + starpu_mpi_comm_size(MPI_COMM_WORLD, &comm_size); + if (comm_size < 2) + { + FPRINTF(stderr, "We need at least 2 nodes.\n"); + starpu_mpi_shutdown(); + return STARPU_TEST_SKIPPED; + } + starpu_mpi_comm_rank(MPI_COMM_WORLD, &comm_rank); + + double a, b[comm_size]; + starpu_data_handle_t a_h, b_h[comm_size]; + double work_coef = 2; + enum starpu_data_access_mode task_mode; + int wrapup,i,j,work_node; + starpu_mpi_tag_t tag = 0; + for (wrapup = 0; wrapup <= 2; wrapup ++) + { + for (i = 0 ; i < 2 ; i++) + { + starpu_mpi_barrier(MPI_COMM_WORLD); + if (i==0) + task_mode = STARPU_MPI_REDUX; + else + task_mode = STARPU_REDUX; + if (comm_rank == 0) + { + a = 1.0; + printf("init a = %f\n", a); + starpu_variable_data_register(&a_h, STARPU_MAIN_RAM, (uintptr_t)&a, sizeof(double)); + for (j=0;j<comm_size;j++) + starpu_variable_data_register(&b_h[j], -1, 0, sizeof(double)); + } + else + { + b[comm_rank] = 1.0 / (comm_rank + 1.0); + printf("init b_%d = %f\n", comm_rank, b[comm_rank]); + starpu_variable_data_register(&a_h, -1, 0, sizeof(double)); + for (j=0;j<comm_size;j++) + { + if (j == comm_rank) + starpu_variable_data_register(&b_h[j], STARPU_MAIN_RAM, (uintptr_t)&b[j], sizeof(double)); + else + starpu_variable_data_register(&b_h[j], -1, 0, sizeof(double)); + } + } + starpu_mpi_data_register(a_h, tag++, 0); + for (j=0;j<comm_size;j++) + starpu_mpi_data_register(b_h[j], tag++, j); + + starpu_data_set_reduction_methods(a_h, &task_red_cl, &task_init_cl); + starpu_fxt_start_profiling(); + for (work_node=1; work_node < comm_size;work_node++) + { + for (j=1;j<=work_coef*nworkers;j++) + { + if (i == 0) + starpu_mpi_task_insert(MPI_COMM_WORLD, + &mpi_work_cl, + task_mode, a_h, + STARPU_R, b_h[work_node], + STARPU_EXECUTE_ON_NODE, work_node, + 0); + else + starpu_mpi_task_insert(MPI_COMM_WORLD, + &work_cl, + task_mode, a_h, + STARPU_R, b_h[work_node], + STARPU_EXECUTE_ON_NODE, work_node, + 0); + } + } + if (wrapup == 0) + { + ret = starpu_mpi_redux_data(MPI_COMM_WORLD, a_h); + STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_redux_data"); + } + else if (wrapup == 1) + { + starpu_mpi_task_insert(MPI_COMM_WORLD, + &read_cl, STARPU_R, a_h, 0); + } + starpu_mpi_wait_for_all(MPI_COMM_WORLD); + starpu_mpi_barrier(MPI_COMM_WORLD); + if (comm_rank == 0) + { + double tmp1 = 0.0; + double tmp2 = 0.0; + for (work_node = 1; work_node < comm_size ; work_node++) { + tmp1 += 1.0 / (work_node + 1.0); + tmp2 += (nworkers - 1.0)*work_node*i; + } + printf("computed result ---> %f expected %f\n", a, + 1.0 + (comm_size - 1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1)*3.0 + tmp1) + tmp2); + } + starpu_data_unregister(a_h); + for (work_node=0; work_node < comm_size;work_node++) + starpu_data_unregister(b_h[work_node]); + starpu_mpi_barrier(MPI_COMM_WORLD); + } + } + starpu_mpi_shutdown(); + return 0; +} diff --git a/mpi/src/starpu_mpi.c b/mpi/src/starpu_mpi.c index 487dcdea97..0eaa3b7ca0 100644 --- a/mpi/src/starpu_mpi.c +++ b/mpi/src/starpu_mpi.c @@ -22,6 +22,7 @@ #include <starpu_mpi_private.h> #include <starpu_mpi_cache.h> #include <starpu_profiling.h> +#include <starpu_mpi_task_insert.h> #include <starpu_mpi_stats.h> #include <starpu_mpi_cache.h> #include <starpu_mpi_select_node.h> @@ -645,6 +646,7 @@ void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t data, int new_r int starpu_mpi_wait_for_all(MPI_Comm comm) { + _starpu_mpi_redux_wrapup_datas(); return _mpi_backend._starpu_mpi_backend_wait_for_all(comm); } diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c index 521e3bea72..33f095f70c 100644 --- a/mpi/src/starpu_mpi_task_insert.c +++ b/mpi/src/starpu_mpi_task_insert.c @@ -21,6 +21,7 @@ #include <starpu.h> #include <starpu_data.h> #include <common/utils.h> +#include <common/uthash.h> #include <util/starpu_task_insert_utils.h> #include <datawizard/coherency.h> #include <core/task.h> @@ -42,6 +43,17 @@ static void (*pre_submit_hook)(struct starpu_task *task) = NULL; +/* reduction wrap-up */ +// entry in the table +struct _starpu_redux_data_entry +{ + UT_hash_handle hh; + starpu_data_handle_t data_handle; +}; +// the table +static struct _starpu_redux_data_entry *_redux_data = NULL; +void _starpu_mpi_redux_wrapup_data(starpu_data_handle_t data_handle); + int starpu_mpi_pre_submit_hook_register(void (*f)(struct starpu_task *)) { if (pre_submit_hook) @@ -651,6 +663,11 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru /* Send and receive data as requested */ for(i=0 ; i<nb_data ; i++) { + struct _starpu_mpi_data *mpi_data = (struct _starpu_mpi_data *) descrs[i].handle->mpi_data; + if (mpi_data->redux_map != NULL && descrs[i].mode & STARPU_R && ( descrs[i].mode & ~ STARPU_REDUX || descrs[i].mode & ~ STARPU_MPI_REDUX )) + { + _starpu_mpi_redux_wrapup_data(descrs[i].handle); + } _starpu_mpi_exchange_data_before_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm); } @@ -690,6 +707,15 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struc _STARPU_CALLOC(mpi_data->redux_map, size, sizeof(mpi_data->redux_map[0])); mpi_data->redux_map [xrank] = 1; mpi_data->redux_map [rrank] = 1; + struct _starpu_redux_data_entry *entry; + HASH_FIND_PTR(_redux_data, &descrs[i].handle, entry); + if (entry == NULL) + { + _STARPU_MPI_MALLOC(entry, sizeof(*entry)); + starpu_data_handle_t data_handle = descrs[i].handle; + entry->data_handle = data_handle; + HASH_ADD_PTR(_redux_data, data_handle, entry); + } } _starpu_mpi_exchange_data_after_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm); _starpu_mpi_clear_data_after_execution(descrs[i].handle, descrs[i].mode, me, do_execute); @@ -968,6 +994,7 @@ int starpu_mpi_redux_data_prio_tree(MPI_Comm comm, starpu_data_handle_t data_han { arity = nb_contrib; } + arity = STARPU_MIN(arity,nb_contrib); _STARPU_MPI_DEBUG(5, "There is %d contributors\n", nb_contrib); int contributors[nb_contrib]; int reducing_node; @@ -1074,6 +1101,14 @@ int starpu_mpi_redux_data_prio_tree(MPI_Comm comm, starpu_data_handle_t data_han current_level++; #endif } + + struct _starpu_redux_data_entry *entry; + HASH_FIND_PTR(_redux_data, &data_handle, entry); + if (entry != NULL) + { + HASH_DEL(_redux_data, entry); + free(entry); + } return 0; } @@ -1112,3 +1147,24 @@ int starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle, } return starpu_mpi_redux_data_prio_tree(comm, data_handle, prio, nb_contrib); } + +void _starpu_mpi_redux_wrapup_data(starpu_data_handle_t data_handle) { + size_t data_size = starpu_data_get_size(data_handle); + // Small data => flat tree | binary tree + int _starpu_mpi_redux_tree_size = starpu_getenv_number_default("STARPU_MPI_REDUXTREE_SIZE", data_size < 1024 ? STARPU_MAXNODES : 2); + struct _starpu_mpi_data *mpi_data = data_handle->mpi_data; + starpu_mpi_redux_data_tree(mpi_data->node_tag.node.comm,data_handle,_starpu_mpi_redux_tree_size); + return; +} + +void _starpu_mpi_redux_wrapup_datas() { + struct _starpu_redux_data_entry *entry = NULL, *tmp = NULL; + HASH_ITER(hh, _redux_data, entry, tmp) + { + _starpu_mpi_redux_wrapup_data(entry->data_handle); + HASH_DEL(_redux_data, entry); + free(entry); + } + return; +} + diff --git a/mpi/src/starpu_mpi_task_insert.h b/mpi/src/starpu_mpi_task_insert.h index 9453c84f34..cbfda7cdfe 100644 --- a/mpi/src/starpu_mpi_task_insert.h +++ b/mpi/src/starpu_mpi_task_insert.h @@ -27,6 +27,7 @@ extern "C" int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int *do_execute, int *inconsistent_execute, int *xrank); int _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, int prio, MPI_Comm comm); int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struct starpu_data_descr *descrs, int nb_data, int prio); +void _starpu_mpi_redux_wrapup_datas(); #ifdef __cplusplus } -- GitLab From 790344f05ae797765b90c4136f2577ebb4b7cfe6 Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Sun, 20 Nov 2022 15:14:35 +0100 Subject: [PATCH 02/24] free/del_entry is clearer ; fix mpi_reduction test to actually use all nodes based on ownership of handles --- mpi/src/starpu_mpi.c | 5 ++++- mpi/src/starpu_mpi_task_insert.c | 29 +++++++++++++++++------------ mpi/tests/mpi_reduction.c | 1 + 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/mpi/src/starpu_mpi.c b/mpi/src/starpu_mpi.c index 0eaa3b7ca0..3608fd4d54 100644 --- a/mpi/src/starpu_mpi.c +++ b/mpi/src/starpu_mpi.c @@ -453,7 +453,10 @@ void _starpu_mpi_data_clear(starpu_data_handle_t data_handle) _mpi_backend._starpu_mpi_backend_data_clear(data_handle); _starpu_mpi_cache_data_clear(data_handle); _starpu_spin_destroy(&data->coop_lock); - free(data->redux_map); + if (data->redux_map != NULL) + { + free(data->redux_map); + } data->redux_map = NULL; free(data); } diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c index 33f095f70c..7aaa22f37d 100644 --- a/mpi/src/starpu_mpi_task_insert.c +++ b/mpi/src/starpu_mpi_task_insert.c @@ -703,19 +703,17 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struc int rrank = starpu_mpi_data_get_rank(descrs[i].handle); int size; starpu_mpi_comm_size(comm, &size); - if (mpi_data->redux_map == NULL) - _STARPU_CALLOC(mpi_data->redux_map, size, sizeof(mpi_data->redux_map[0])); - mpi_data->redux_map [xrank] = 1; - mpi_data->redux_map [rrank] = 1; - struct _starpu_redux_data_entry *entry; - HASH_FIND_PTR(_redux_data, &descrs[i].handle, entry); - if (entry == NULL) + if (mpi_data->redux_map == NULL) { + _STARPU_CALLOC(mpi_data->redux_map, size, sizeof(mpi_data->redux_map[0])); + struct _starpu_redux_data_entry *entry; _STARPU_MPI_MALLOC(entry, sizeof(*entry)); starpu_data_handle_t data_handle = descrs[i].handle; entry->data_handle = data_handle; HASH_ADD_PTR(_redux_data, data_handle, entry); } + mpi_data->redux_map [xrank] = 1; + mpi_data->redux_map [rrank] = 1; } _starpu_mpi_exchange_data_after_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm); _starpu_mpi_clear_data_after_execution(descrs[i].handle, descrs[i].mode, me, do_execute); @@ -943,8 +941,6 @@ void _starpu_mpi_redux_fill_post_sync_jobid(const void * const redux_data_args, *post_sync_jobid = ((const struct _starpu_mpi_redux_data_args *) redux_data_args)->taskC_jobid; } -/* TODO: this should rather be implicitly called by starpu_mpi_task_insert when - * * a data previously accessed in (MPI_)REDUX mode gets accessed in R mode. */ int starpu_mpi_redux_data_prio_tree(MPI_Comm comm, starpu_data_handle_t data_handle, int prio, int arity) { int me, rank, nb_nodes; @@ -1109,6 +1105,11 @@ int starpu_mpi_redux_data_prio_tree(MPI_Comm comm, starpu_data_handle_t data_han HASH_DEL(_redux_data, entry); free(entry); } + if (mpi_data->redux_map != NULL) // it should be allocated at this point anyway + { + free(mpi_data->redux_map); + } + mpi_data->redux_map = NULL; return 0; } @@ -1153,7 +1154,13 @@ void _starpu_mpi_redux_wrapup_data(starpu_data_handle_t data_handle) { // Small data => flat tree | binary tree int _starpu_mpi_redux_tree_size = starpu_getenv_number_default("STARPU_MPI_REDUXTREE_SIZE", data_size < 1024 ? STARPU_MAXNODES : 2); struct _starpu_mpi_data *mpi_data = data_handle->mpi_data; - starpu_mpi_redux_data_tree(mpi_data->node_tag.node.comm,data_handle,_starpu_mpi_redux_tree_size); + struct _starpu_redux_data_entry *entry; + + HASH_FIND_PTR(_redux_data, &data_handle, entry); + if (entry != NULL) + { + starpu_mpi_redux_data_tree(mpi_data->node_tag.node.comm,data_handle,_starpu_mpi_redux_tree_size); + } return; } @@ -1162,8 +1169,6 @@ void _starpu_mpi_redux_wrapup_datas() { HASH_ITER(hh, _redux_data, entry, tmp) { _starpu_mpi_redux_wrapup_data(entry->data_handle); - HASH_DEL(_redux_data, entry); - free(entry); } return; } diff --git a/mpi/tests/mpi_reduction.c b/mpi/tests/mpi_reduction.c index fb422a9ece..0fed07352f 100644 --- a/mpi/tests/mpi_reduction.c +++ b/mpi/tests/mpi_reduction.c @@ -164,6 +164,7 @@ int main(int argc, char **argv) &dot_codelet, STARPU_R, handles[x], STARPU_REDUX, dot_handle, + STARPU_EXECUTE_ON_DATA, handles[x], 0); } ret = starpu_mpi_redux_data(MPI_COMM_WORLD, dot_handle); -- GitLab From 01a32877a95134d9a971f0b9fc7e6445148ee284 Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Sun, 20 Nov 2022 18:50:25 +0100 Subject: [PATCH 03/24] get_redux_map accounts for NULL handle --- mpi/src/starpu_mpi.c | 8 ++++++-- mpi/src/starpu_mpi_private.h | 2 +- mpi/src/starpu_mpi_task_insert.c | 4 ++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/mpi/src/starpu_mpi.c b/mpi/src/starpu_mpi.c index 3608fd4d54..d9552458af 100644 --- a/mpi/src/starpu_mpi.c +++ b/mpi/src/starpu_mpi.c @@ -526,8 +526,12 @@ starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t data) char* starpu_mpi_data_get_redux_map(starpu_data_handle_t data) { - STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data); - return ((struct _starpu_mpi_data *)(data->mpi_data))->redux_map; + if (data == NULL) + { + return NULL; + } + struct _starpu_mpi_data *mpi_data = _starpu_mpi_data_get(data); + return mpi_data->redux_map; } int starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg) diff --git a/mpi/src/starpu_mpi_private.h b/mpi/src/starpu_mpi_private.h index e43f848395..4e4f3768cc 100644 --- a/mpi/src/starpu_mpi_private.h +++ b/mpi/src/starpu_mpi_private.h @@ -221,7 +221,7 @@ struct _starpu_mpi_data unsigned int modified:1; // Whether the data has been modified since the registration. /** Array used to store the contributing nodes to this data - * when it is accessed in REDUX mode. */ + * when it is accessed in (MPI_)REDUX mode. */ char* redux_map; /** Rendez-vous data for opportunistic cooperative sends, diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c index 7aaa22f37d..8350352de6 100644 --- a/mpi/src/starpu_mpi_task_insert.c +++ b/mpi/src/starpu_mpi_task_insert.c @@ -663,8 +663,8 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru /* Send and receive data as requested */ for(i=0 ; i<nb_data ; i++) { - struct _starpu_mpi_data *mpi_data = (struct _starpu_mpi_data *) descrs[i].handle->mpi_data; - if (mpi_data->redux_map != NULL && descrs[i].mode & STARPU_R && ( descrs[i].mode & ~ STARPU_REDUX || descrs[i].mode & ~ STARPU_MPI_REDUX )) + char* redux_map = starpu_mpi_data_get_redux_map(descrs[i].handle); + if (redux_map != NULL && descrs[i].mode & STARPU_R && ( descrs[i].mode & ~ STARPU_REDUX || descrs[i].mode & ~ STARPU_MPI_REDUX )) { _starpu_mpi_redux_wrapup_data(descrs[i].handle); } -- GitLab From 096ea55f30852fbb70704a1896bca714a8385779 Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Mon, 21 Nov 2022 17:07:16 +0100 Subject: [PATCH 04/24] fix brackets ; fix count in mpi_redux and preambule in mpi_redux_autowrap --- mpi/examples/mpi_redux/mpi_redux.c | 11 ++++++++--- mpi/examples/mpi_redux/mpi_redux_autowrapup.c | 15 +++++++-------- mpi/src/starpu_mpi_task_insert.c | 6 ++++-- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/mpi/examples/mpi_redux/mpi_redux.c b/mpi/examples/mpi_redux/mpi_redux.c index c8f8b54c10..d203931bd3 100644 --- a/mpi/examples/mpi_redux/mpi_redux.c +++ b/mpi/examples/mpi_redux/mpi_redux.c @@ -190,10 +190,15 @@ int main(int argc, char *argv[]) starpu_mpi_barrier(MPI_COMM_WORLD); if (comm_rank == 0) { - double tmp = 0.0; + double tmp1 = 0.0; + double tmp2 = 0.0; for (work_node = 1; work_node < comm_size ; work_node++) - tmp += 1.0 / (work_node + 1.0); - printf("computed result ---> %f expected %f\n", a, 1.0 + (comm_size - 1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1)*3.0 + tmp)); + { + tmp1 += 1.0 / (work_node + 1.0); + tmp2 += (nworkers - 1.0)*work_node*i; + } + printf("computed result ---> %f expected %f\n", a, + 1.0 + (comm_size - 1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1)*3.0 + tmp1) + tmp2); } starpu_data_unregister(a_h); for (work_node=0; work_node < comm_size;work_node++) diff --git a/mpi/examples/mpi_redux/mpi_redux_autowrapup.c b/mpi/examples/mpi_redux/mpi_redux_autowrapup.c index 643fef7777..814725aea6 100644 --- a/mpi/examples/mpi_redux/mpi_redux_autowrapup.c +++ b/mpi/examples/mpi_redux/mpi_redux_autowrapup.c @@ -15,14 +15,12 @@ */ /* - * This example illustrates how to use the STARPU_MPI_REDUX mode - * and compare it with the standard STARPU_REDUX. + * This example is similar to mpi_redux.c * - * In order to make this comparison salliant, the init codelet is not - * a task that set the handle to a neutral element but rather depends - * on the working node. - * This is not a proper way to use a reduction pattern however it - * can be analogous to the cost/weight of each contribution. + * It iterates over multiple ways to wrap-up reduction patterns : either by + * - waiting for all mpi + tasks + * - calling mpi_redux yourself + * - inserting a reading task on the handle to reduce */ #include <stdlib.h> @@ -216,7 +214,8 @@ int main(int argc, char *argv[]) { double tmp1 = 0.0; double tmp2 = 0.0; - for (work_node = 1; work_node < comm_size ; work_node++) { + for (work_node = 1; work_node < comm_size ; work_node++) + { tmp1 += 1.0 / (work_node + 1.0); tmp2 += (nworkers - 1.0)*work_node*i; } diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c index 8350352de6..1090bb708c 100644 --- a/mpi/src/starpu_mpi_task_insert.c +++ b/mpi/src/starpu_mpi_task_insert.c @@ -1149,7 +1149,8 @@ int starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle, return starpu_mpi_redux_data_prio_tree(comm, data_handle, prio, nb_contrib); } -void _starpu_mpi_redux_wrapup_data(starpu_data_handle_t data_handle) { +void _starpu_mpi_redux_wrapup_data(starpu_data_handle_t data_handle) +{ size_t data_size = starpu_data_get_size(data_handle); // Small data => flat tree | binary tree int _starpu_mpi_redux_tree_size = starpu_getenv_number_default("STARPU_MPI_REDUXTREE_SIZE", data_size < 1024 ? STARPU_MAXNODES : 2); @@ -1164,7 +1165,8 @@ void _starpu_mpi_redux_wrapup_data(starpu_data_handle_t data_handle) { return; } -void _starpu_mpi_redux_wrapup_datas() { +void _starpu_mpi_redux_wrapup_datas() +{ struct _starpu_redux_data_entry *entry = NULL, *tmp = NULL; HASH_ITER(hh, _redux_data, entry, tmp) { -- GitLab From bfa3c42c9bab0dcae1c62a4bc1d4215c2de35105 Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Wed, 23 Nov 2022 15:22:52 +0100 Subject: [PATCH 05/24] doc --- .../chapters/starpu_basics/data_management.doxy | 9 +++++++++ .../starpu_installation/environment_variables.doxy | 10 ++++++++++ 2 files changed, 19 insertions(+) diff --git a/doc/doxygen/chapters/starpu_basics/data_management.doxy b/doc/doxygen/chapters/starpu_basics/data_management.doxy index 31aa035271..8e5ddbf7c9 100644 --- a/doc/doxygen/chapters/starpu_basics/data_management.doxy +++ b/doc/doxygen/chapters/starpu_basics/data_management.doxy @@ -490,6 +490,15 @@ for (i = 0; i < 100; i++) } \endcode +starpu_mpi_redux_data() is called automatically in various cases, including +when a task reading the reduced handle is inserted. The previous example could +avoid calling starpu_mpi_redux_data. Default priority (0) is used. The +reduction tree arity is decided based on the size of the data to reduce: a flat tree +is used with a small data (less than 1024 bytes), a binary tree otherwise. If +the environment variable STARPU_MPI_REDUX_TREESIZE is setup, then this value is +read to decide the arity of the reduction tree. Distributed-memory reduction +patterns are wrapped-up at the end of an application when calling starpu_mpi_wait_for_all(). + \section DataCommute Commute Data Access By default, the implicit dependencies computed from data access use the diff --git a/doc/doxygen/chapters/starpu_installation/environment_variables.doxy b/doc/doxygen/chapters/starpu_installation/environment_variables.doxy index d40c7d0685..2958bc79d7 100644 --- a/doc/doxygen/chapters/starpu_installation/environment_variables.doxy +++ b/doc/doxygen/chapters/starpu_installation/environment_variables.doxy @@ -656,6 +656,16 @@ This variable allows to enable (1) MPI GPUDirect support or not (0). The default 1, StarPU-MPI will warn if MPI does not provide the GPUDirect support. </dd> +<dt>STARPU_MPI_REDUXTREE_SIZE</dt> +<dd> +\anchor STARPU_MPI_REDUXTREE_SIZE +\addindex __env__STARPU_MPI_REDUXTREE_SIZE +This variable allows to tune the arity of the distributed-memory reduction trees +detected by StarPU. If not set, the arity of the tree depends on the size of the +data to reduce: a flat tree is used if the data is smaller than 1024 bytes, a binary +tree is used otherwise. +</dd> + </dl> -- GitLab From bf9071df0469c0dfb72f7b2a9715c8813a602624 Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Wed, 23 Nov 2022 18:20:38 +0100 Subject: [PATCH 06/24] howto setup --- .../chapters/starpu_installation/environment_variables.doxy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/doxygen/chapters/starpu_installation/environment_variables.doxy b/doc/doxygen/chapters/starpu_installation/environment_variables.doxy index 2958bc79d7..789985f81f 100644 --- a/doc/doxygen/chapters/starpu_installation/environment_variables.doxy +++ b/doc/doxygen/chapters/starpu_installation/environment_variables.doxy @@ -663,7 +663,7 @@ This variable allows to enable (1) MPI GPUDirect support or not (0). The default This variable allows to tune the arity of the distributed-memory reduction trees detected by StarPU. If not set, the arity of the tree depends on the size of the data to reduce: a flat tree is used if the data is smaller than 1024 bytes, a binary -tree is used otherwise. +tree is used otherwise. When setting this variable, the same reasoning can be applied. </dd> </dl> -- GitLab From 6ea5c9905d8af77a829228ba700dfa66cdc1a525 Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Thu, 24 Nov 2022 13:54:28 +0100 Subject: [PATCH 07/24] updated automatic arity selection --- .../environment_variables.doxy | 20 +++++++++++-------- mpi/src/starpu_mpi_task_insert.c | 7 ++++++- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/doc/doxygen/chapters/starpu_installation/environment_variables.doxy b/doc/doxygen/chapters/starpu_installation/environment_variables.doxy index 789985f81f..1ca8b51bd2 100644 --- a/doc/doxygen/chapters/starpu_installation/environment_variables.doxy +++ b/doc/doxygen/chapters/starpu_installation/environment_variables.doxy @@ -656,14 +656,18 @@ This variable allows to enable (1) MPI GPUDirect support or not (0). The default 1, StarPU-MPI will warn if MPI does not provide the GPUDirect support. </dd> -<dt>STARPU_MPI_REDUXTREE_SIZE</dt> -<dd> -\anchor STARPU_MPI_REDUXTREE_SIZE -\addindex __env__STARPU_MPI_REDUXTREE_SIZE -This variable allows to tune the arity of the distributed-memory reduction trees -detected by StarPU. If not set, the arity of the tree depends on the size of the -data to reduce: a flat tree is used if the data is smaller than 1024 bytes, a binary -tree is used otherwise. When setting this variable, the same reasoning can be applied. +<dt>STARPU_MPI_REDUX_ARITY_THRESHOLD</dt> +<dd> +\anchor STARPU_MPI_REDUX_ARITY_THRESHOLD +\addindex __env__STARPU_MPI_REDUX_ARITY_THRESHOLD +The arity of the automatically detected reduction trees follows the following +rule: when the data to be reduced is of small size a flat tree is unrolled +i.e. all the contributing nodes send their contribution to the root of +the reduction. When the data to be reduced is of big size, a binary tree is used instead. +The default threshold between flat and binary tree is 1024 bytes. By setting this +value to be negative, all the automatically detected reduction trees will use flat trees. +If this value is set to 0, then binary trees will always be selected. Otherwise, +the setup value replaces the default 1024. </dd> </dl> diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c index 1090bb708c..dd311101a7 100644 --- a/mpi/src/starpu_mpi_task_insert.c +++ b/mpi/src/starpu_mpi_task_insert.c @@ -1153,7 +1153,12 @@ void _starpu_mpi_redux_wrapup_data(starpu_data_handle_t data_handle) { size_t data_size = starpu_data_get_size(data_handle); // Small data => flat tree | binary tree - int _starpu_mpi_redux_tree_size = starpu_getenv_number_default("STARPU_MPI_REDUXTREE_SIZE", data_size < 1024 ? STARPU_MAXNODES : 2); + int _starpu_mpi_redux_threshold = starpu_getenv_number_default("STARPU_MPI_REDUX_ARITY_THRESHOLD", 1024); + int _starpu_mpi_redux_tree_size = 2; + if (_starpu_mpi_redux_threshold < 0 || (_starpu_mpi_redux_threshold > 0 && data_size < _starpu_mpi_redux_threshold)) + { + _starpu_mpi_redux_tree_size = STARPU_MAXNODES; + } struct _starpu_mpi_data *mpi_data = data_handle->mpi_data; struct _starpu_redux_data_entry *entry; -- GitLab From fb88b4920ef6d77a2f497e5d36a462a2aa983167 Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Fri, 25 Nov 2022 00:07:57 +0100 Subject: [PATCH 08/24] env variable doc --- .../chapters/starpu_basics/data_management.doxy | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/doc/doxygen/chapters/starpu_basics/data_management.doxy b/doc/doxygen/chapters/starpu_basics/data_management.doxy index 8e5ddbf7c9..b4f25bd8f4 100644 --- a/doc/doxygen/chapters/starpu_basics/data_management.doxy +++ b/doc/doxygen/chapters/starpu_basics/data_management.doxy @@ -492,12 +492,15 @@ for (i = 0; i < 100; i++) starpu_mpi_redux_data() is called automatically in various cases, including when a task reading the reduced handle is inserted. The previous example could -avoid calling starpu_mpi_redux_data. Default priority (0) is used. The +avoid calling starpu_mpi_redux_data(). Default priority (0) is used. The reduction tree arity is decided based on the size of the data to reduce: a flat tree -is used with a small data (less than 1024 bytes), a binary tree otherwise. If -the environment variable STARPU_MPI_REDUX_TREESIZE is setup, then this value is -read to decide the arity of the reduction tree. Distributed-memory reduction -patterns are wrapped-up at the end of an application when calling starpu_mpi_wait_for_all(). +is used with a small data (default to less than 1024 bytes), a binary tree otherwise. If +the environment variable \ref STARPU_MPI_REDUX_ARITY_THRESHOLD is setup, the threshold +between the size of a small data and a bigger data is modified. If the value is setup +to be negative, flat trees will always be used. If the value is setup to 0, binary +trees are used. Otherwise, the size of the data is compared to the size in the +environment variable. Remaining distributed-memory reduction patterns are wrapped-up +at the end of an application when calling starpu_mpi_wait_for_all(). \section DataCommute Commute Data Access -- GitLab From 9314cf25c9e3edade62b0b336345c1a3bcc6be8d Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Fri, 18 Nov 2022 19:17:19 +0100 Subject: [PATCH 09/24] initial commit --- mpi/examples/mpi_redux/mpi_redux_autowrapup.c | 28 ++++----- mpi/src/starpu_mpi_task_insert.c | 57 +++++++++++-------- 2 files changed, 47 insertions(+), 38 deletions(-) diff --git a/mpi/examples/mpi_redux/mpi_redux_autowrapup.c b/mpi/examples/mpi_redux/mpi_redux_autowrapup.c index 814725aea6..5dee0344e5 100644 --- a/mpi/examples/mpi_redux/mpi_redux_autowrapup.c +++ b/mpi/examples/mpi_redux/mpi_redux_autowrapup.c @@ -183,22 +183,22 @@ int main(int argc, char *argv[]) for (j=1;j<=work_coef*nworkers;j++) { if (i == 0) - starpu_mpi_task_insert(MPI_COMM_WORLD, - &mpi_work_cl, - task_mode, a_h, - STARPU_R, b_h[work_node], - STARPU_EXECUTE_ON_NODE, work_node, - 0); + starpu_mpi_task_insert(MPI_COMM_WORLD, + &mpi_work_cl, + task_mode, a_h, + STARPU_R, b_h[work_node], + STARPU_EXECUTE_ON_NODE, work_node, + 0); else - starpu_mpi_task_insert(MPI_COMM_WORLD, - &work_cl, - task_mode, a_h, - STARPU_R, b_h[work_node], - STARPU_EXECUTE_ON_NODE, work_node, - 0); + starpu_mpi_task_insert(MPI_COMM_WORLD, + &work_cl, + task_mode, a_h, + STARPU_R, b_h[work_node], + STARPU_EXECUTE_ON_NODE, work_node, + 0); } } - if (wrapup == 0) + if (wrapup == 0) { ret = starpu_mpi_redux_data(MPI_COMM_WORLD, a_h); STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_redux_data"); @@ -206,7 +206,7 @@ int main(int argc, char *argv[]) else if (wrapup == 1) { starpu_mpi_task_insert(MPI_COMM_WORLD, - &read_cl, STARPU_R, a_h, 0); + &read_cl, STARPU_R, a_h, 0); } starpu_mpi_wait_for_all(MPI_COMM_WORLD); starpu_mpi_barrier(MPI_COMM_WORLD); diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c index dd311101a7..71052dffa9 100644 --- a/mpi/src/starpu_mpi_task_insert.c +++ b/mpi/src/starpu_mpi_task_insert.c @@ -620,6 +620,30 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru if (ret < 0) return ret; + _STARPU_TRACE_TASK_MPI_PRE_START(); + /* Send and receive data as requested */ + for(i=0 ; i<nb_data ; i++) + { + struct _starpu_mpi_data *mpi_data = (struct _starpu_mpi_data *) descrs[i].handle->mpi_data; + if (mpi_data->redux_map != NULL && descrs[i].mode & STARPU_R && ( descrs[i].mode & ~ STARPU_REDUX || descrs[i].mode & ~ STARPU_MPI_REDUX )) + { + _starpu_mpi_redux_wrapup_data(descrs[i].handle); + } + _starpu_mpi_exchange_data_before_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm); + } + + if (xrank_p) + *xrank_p = xrank; + if (nb_data_p) + *nb_data_p = nb_data; + if (prio_p) + *prio_p = prio; + + if (descrs_p) + *descrs_p = descrs; + else + free(descrs); + if (do_execute == 1) { va_list varg_list_copy; @@ -659,30 +683,6 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru } } - _STARPU_TRACE_TASK_MPI_PRE_START(); - /* Send and receive data as requested */ - for(i=0 ; i<nb_data ; i++) - { - char* redux_map = starpu_mpi_data_get_redux_map(descrs[i].handle); - if (redux_map != NULL && descrs[i].mode & STARPU_R && ( descrs[i].mode & ~ STARPU_REDUX || descrs[i].mode & ~ STARPU_MPI_REDUX )) - { - _starpu_mpi_redux_wrapup_data(descrs[i].handle); - } - _starpu_mpi_exchange_data_before_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm); - } - - if (xrank_p) - *xrank_p = xrank; - if (nb_data_p) - *nb_data_p = nb_data; - if (prio_p) - *prio_p = prio; - - if (descrs_p) - *descrs_p = descrs; - else - free(descrs); - _STARPU_TRACE_TASK_MPI_PRE_END(); return do_execute; @@ -714,6 +714,15 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struc } mpi_data->redux_map [xrank] = 1; mpi_data->redux_map [rrank] = 1; + struct _starpu_redux_data_entry *entry; + HASH_FIND_PTR(_redux_data, &descrs[i].handle, entry); + if (entry == NULL) + { + _STARPU_MPI_MALLOC(entry, sizeof(*entry)); + starpu_data_handle_t data_handle = descrs[i].handle; + entry->data_handle = data_handle; + HASH_ADD_PTR(_redux_data, data_handle, entry); + } } _starpu_mpi_exchange_data_after_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm); _starpu_mpi_clear_data_after_execution(descrs[i].handle, descrs[i].mode, me, do_execute); -- GitLab From 9022a65507a3de551ad2538aa731baae495e911e Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Sun, 20 Nov 2022 15:14:35 +0100 Subject: [PATCH 10/24] free/del_entry is clearer ; fix mpi_reduction test to actually use all nodes based on ownership of handles --- mpi/src/starpu_mpi_task_insert.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c index 71052dffa9..fd828f91b5 100644 --- a/mpi/src/starpu_mpi_task_insert.c +++ b/mpi/src/starpu_mpi_task_insert.c @@ -718,11 +718,15 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struc HASH_FIND_PTR(_redux_data, &descrs[i].handle, entry); if (entry == NULL) { + _STARPU_CALLOC(mpi_data->redux_map, size, sizeof(mpi_data->redux_map[0])); + struct _starpu_redux_data_entry *entry; _STARPU_MPI_MALLOC(entry, sizeof(*entry)); starpu_data_handle_t data_handle = descrs[i].handle; entry->data_handle = data_handle; HASH_ADD_PTR(_redux_data, data_handle, entry); } + mpi_data->redux_map [xrank] = 1; + mpi_data->redux_map [rrank] = 1; } _starpu_mpi_exchange_data_after_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm); _starpu_mpi_clear_data_after_execution(descrs[i].handle, descrs[i].mode, me, do_execute); -- GitLab From 0fa4485518a0794266437828ba1804ab9b9dec71 Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Sun, 20 Nov 2022 18:50:25 +0100 Subject: [PATCH 11/24] get_redux_map accounts for NULL handle --- mpi/src/starpu_mpi_task_insert.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c index fd828f91b5..e31317b1c3 100644 --- a/mpi/src/starpu_mpi_task_insert.c +++ b/mpi/src/starpu_mpi_task_insert.c @@ -624,8 +624,8 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru /* Send and receive data as requested */ for(i=0 ; i<nb_data ; i++) { - struct _starpu_mpi_data *mpi_data = (struct _starpu_mpi_data *) descrs[i].handle->mpi_data; - if (mpi_data->redux_map != NULL && descrs[i].mode & STARPU_R && ( descrs[i].mode & ~ STARPU_REDUX || descrs[i].mode & ~ STARPU_MPI_REDUX )) + char* redux_map = starpu_mpi_data_get_redux_map(descrs[i].handle); + if (redux_map != NULL && descrs[i].mode & STARPU_R && ( descrs[i].mode & ~ STARPU_REDUX || descrs[i].mode & ~ STARPU_MPI_REDUX )) { _starpu_mpi_redux_wrapup_data(descrs[i].handle); } -- GitLab From cbd223e14455fd2b4a867971822f9606304a66da Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Sat, 26 Nov 2022 15:56:22 +0100 Subject: [PATCH 12/24] int -> size_t relevant --- mpi/src/starpu_mpi_task_insert.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c index e31317b1c3..b8b1e3d321 100644 --- a/mpi/src/starpu_mpi_task_insert.c +++ b/mpi/src/starpu_mpi_task_insert.c @@ -1166,7 +1166,7 @@ void _starpu_mpi_redux_wrapup_data(starpu_data_handle_t data_handle) { size_t data_size = starpu_data_get_size(data_handle); // Small data => flat tree | binary tree - int _starpu_mpi_redux_threshold = starpu_getenv_number_default("STARPU_MPI_REDUX_ARITY_THRESHOLD", 1024); + size_t _starpu_mpi_redux_threshold = starpu_getenv_number_default("STARPU_MPI_REDUX_ARITY_THRESHOLD", 1024); int _starpu_mpi_redux_tree_size = 2; if (_starpu_mpi_redux_threshold < 0 || (_starpu_mpi_redux_threshold > 0 && data_size < _starpu_mpi_redux_threshold)) { -- GitLab From 766d67554d8d4763b917c98fd06052ae3cac17cd Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Thu, 1 Dec 2022 16:43:29 +0100 Subject: [PATCH 13/24] updated wrt samuel's review --- .../starpu_basics/data_management.doxy | 32 ++++++++++--------- .../starpu_extensions/mpi_support.doxy | 7 ++-- mpi/src/starpu_mpi.c | 8 ++--- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/doc/doxygen/chapters/starpu_basics/data_management.doxy b/doc/doxygen/chapters/starpu_basics/data_management.doxy index b4f25bd8f4..dfc0038417 100644 --- a/doc/doxygen/chapters/starpu_basics/data_management.doxy +++ b/doc/doxygen/chapters/starpu_basics/data_management.doxy @@ -473,12 +473,14 @@ leading to yet more relaxed dependencies and more parallelism. ::STARPU_REDUX can also be passed to starpu_mpi_task_insert() in the MPI case. This will however not produce any MPI communication, but just pass -::STARPU_REDUX to the underlying starpu_task_insert(). It is up to the -application to call starpu_mpi_redux_data(), which posts tasks which will -reduce the partial results among MPI nodes into the MPI node which owns the -data. For instance, some hypothetical application which collects partial results +::STARPU_REDUX to the underlying starpu_task_insert(). starpu_mpi_redux_data() +posts tasks which will reduce the partial results among MPI nodes into the MPI +node which owns the data. The function can be called by the user to benefit from +fine-tuning such as priority setting. If the user does not call this function, +StarPU wraps up reduction patterns automatically. The following example +shows a hypothetical application which collects partial results into data <c>res</c>, then uses it for other computation, before looping again -with a new reduction: +with a new reduction where the wrap-up of the reduction pattern is explicit: \code{.c} for (i = 0; i < 100; i++) @@ -491,16 +493,16 @@ for (i = 0; i < 100; i++) \endcode starpu_mpi_redux_data() is called automatically in various cases, including -when a task reading the reduced handle is inserted. The previous example could -avoid calling starpu_mpi_redux_data(). Default priority (0) is used. The -reduction tree arity is decided based on the size of the data to reduce: a flat tree -is used with a small data (default to less than 1024 bytes), a binary tree otherwise. If -the environment variable \ref STARPU_MPI_REDUX_ARITY_THRESHOLD is setup, the threshold -between the size of a small data and a bigger data is modified. If the value is setup -to be negative, flat trees will always be used. If the value is setup to 0, binary -trees are used. Otherwise, the size of the data is compared to the size in the -environment variable. Remaining distributed-memory reduction patterns are wrapped-up -at the end of an application when calling starpu_mpi_wait_for_all(). +when a task reading the reduced handle is inserted through starpu_mpi_task_insert(). +The previous example could avoid calling starpu_mpi_redux_data(). Default priority (0) +is used. The reduction tree arity is decided based on the size of the data to reduce: a +flat tree is used with a small data (default to less than 1024 bytes), a binary tree +otherwise. If the environment variable \ref STARPU_MPI_REDUX_ARITY_THRESHOLD is setup, the +threshold between the size of a small data and a bigger data is modified. If the value is +setup to be negative, flat trees will always be used. If the value is setup to 0, binary +trees are used. Otherwise, the size of the data is compared to the size in the environment +variable. Remaining distributed-memory reduction patterns are wrapped-up at the end of an +application when calling starpu_mpi_wait_for_all(). \section DataCommute Commute Data Access diff --git a/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy b/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy index 5181e291ba..d33fe10009 100644 --- a/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy +++ b/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy @@ -824,9 +824,10 @@ tells StarPU to spawn only one contribution per contributing node. The setup and use of \c STARPU_MPI_REDUX is similar to \c STARPU_REDUX : the initialization and reduction codelets should be declared through starpu_data_set_reduction_methods() in the -same fashion as \c STARPU_REDUX. The function starpu_mpi_redux_data() has to be called -when all tasks contributing to the reduction have been inserted so that the reduction tree -can be submitted. Tasks contributing to the inter-node reduction should be registered as +same fashion as \c STARPU_REDUX. The function starpu_mpi_redux_data() is automatically called +either when a task reading the reduced handle is inserted through the MPI layer of StarPU through +starpu_mpi_insert_task() or when the user waits for all communications and tasks to be executed through starpu_mpi_wait_for_all(). +Tasks contributing to the inter-node reduction should be registered as accessing the contribution through \c STARPU_RW|STARPU_COMMUTE mode, as for the \c STARPU_REDUX mode, as in the following example. diff --git a/mpi/src/starpu_mpi.c b/mpi/src/starpu_mpi.c index d9552458af..91bdc0aec7 100644 --- a/mpi/src/starpu_mpi.c +++ b/mpi/src/starpu_mpi.c @@ -453,10 +453,7 @@ void _starpu_mpi_data_clear(starpu_data_handle_t data_handle) _mpi_backend._starpu_mpi_backend_data_clear(data_handle); _starpu_mpi_cache_data_clear(data_handle); _starpu_spin_destroy(&data->coop_lock); - if (data->redux_map != NULL) - { - free(data->redux_map); - } + free(data->redux_map); data->redux_map = NULL; free(data); } @@ -530,6 +527,7 @@ char* starpu_mpi_data_get_redux_map(starpu_data_handle_t data) { return NULL; } + STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data); struct _starpu_mpi_data *mpi_data = _starpu_mpi_data_get(data); return mpi_data->redux_map; } @@ -653,6 +651,8 @@ void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t data, int new_r int starpu_mpi_wait_for_all(MPI_Comm comm) { + /* If the user forgets to call mpi_redux_data or insert R tasks on the reduced handles */ + /* then, we wrap reduction patterns for them. This is typical of benchmarks */ _starpu_mpi_redux_wrapup_datas(); return _mpi_backend._starpu_mpi_backend_wait_for_all(comm); } -- GitLab From e300dbc80b639623c3265388b8dd9b0d5bc0b712 Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Thu, 1 Dec 2022 18:45:10 +0100 Subject: [PATCH 14/24] fix redux_map use --- mpi/src/starpu_mpi.c | 3 +-- mpi/src/starpu_mpi_task_insert.c | 11 +---------- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/mpi/src/starpu_mpi.c b/mpi/src/starpu_mpi.c index 91bdc0aec7..300f79a2a3 100644 --- a/mpi/src/starpu_mpi.c +++ b/mpi/src/starpu_mpi.c @@ -523,11 +523,10 @@ starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t data) char* starpu_mpi_data_get_redux_map(starpu_data_handle_t data) { - if (data == NULL) + if (data == NULL || data->mpi_data == NULL) { return NULL; } - STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data); struct _starpu_mpi_data *mpi_data = _starpu_mpi_data_get(data); return mpi_data->redux_map; } diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c index b8b1e3d321..2ef727fbaf 100644 --- a/mpi/src/starpu_mpi_task_insert.c +++ b/mpi/src/starpu_mpi_task_insert.c @@ -624,11 +624,6 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru /* Send and receive data as requested */ for(i=0 ; i<nb_data ; i++) { - char* redux_map = starpu_mpi_data_get_redux_map(descrs[i].handle); - if (redux_map != NULL && descrs[i].mode & STARPU_R && ( descrs[i].mode & ~ STARPU_REDUX || descrs[i].mode & ~ STARPU_MPI_REDUX )) - { - _starpu_mpi_redux_wrapup_data(descrs[i].handle); - } _starpu_mpi_exchange_data_before_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm); } @@ -719,7 +714,6 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struc if (entry == NULL) { _STARPU_CALLOC(mpi_data->redux_map, size, sizeof(mpi_data->redux_map[0])); - struct _starpu_redux_data_entry *entry; _STARPU_MPI_MALLOC(entry, sizeof(*entry)); starpu_data_handle_t data_handle = descrs[i].handle; entry->data_handle = data_handle; @@ -1118,10 +1112,7 @@ int starpu_mpi_redux_data_prio_tree(MPI_Comm comm, starpu_data_handle_t data_han HASH_DEL(_redux_data, entry); free(entry); } - if (mpi_data->redux_map != NULL) // it should be allocated at this point anyway - { - free(mpi_data->redux_map); - } + free(mpi_data->redux_map); mpi_data->redux_map = NULL; return 0; } -- GitLab From 9b2a19ef385ac1b8b5156dc7baf36124a3661aba Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Thu, 1 Dec 2022 19:13:30 +0100 Subject: [PATCH 15/24] fix redux before exchange data --- mpi/src/starpu_mpi_task_insert.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c index 2ef727fbaf..7001addd35 100644 --- a/mpi/src/starpu_mpi_task_insert.c +++ b/mpi/src/starpu_mpi_task_insert.c @@ -624,9 +624,15 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru /* Send and receive data as requested */ for(i=0 ; i<nb_data ; i++) { + char* redux_map = starpu_mpi_data_get_redux_map(descrs[i].handle); + if (redux_map != NULL && descrs[i].mode & STARPU_R && ( descrs[i].mode & ~ STARPU_REDUX || descrs[i].mode & ~ STARPU_MPI_REDUX )) + { + _starpu_mpi_redux_wrapup_data(descrs[i].handle); + } _starpu_mpi_exchange_data_before_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm); } + if (xrank_p) *xrank_p = xrank; if (nb_data_p) -- GitLab From 2ba8df21e16fe3581aa459f5e69d2c63443d3a11 Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Thu, 1 Dec 2022 19:15:46 +0100 Subject: [PATCH 16/24] size_t -> int but cast if int is unsigned --- mpi/src/starpu_mpi_task_insert.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c index 7001addd35..2221659215 100644 --- a/mpi/src/starpu_mpi_task_insert.c +++ b/mpi/src/starpu_mpi_task_insert.c @@ -1163,9 +1163,9 @@ void _starpu_mpi_redux_wrapup_data(starpu_data_handle_t data_handle) { size_t data_size = starpu_data_get_size(data_handle); // Small data => flat tree | binary tree - size_t _starpu_mpi_redux_threshold = starpu_getenv_number_default("STARPU_MPI_REDUX_ARITY_THRESHOLD", 1024); + int _starpu_mpi_redux_threshold = starpu_getenv_number_default("STARPU_MPI_REDUX_ARITY_THRESHOLD", 1024); int _starpu_mpi_redux_tree_size = 2; - if (_starpu_mpi_redux_threshold < 0 || (_starpu_mpi_redux_threshold > 0 && data_size < _starpu_mpi_redux_threshold)) + if (_starpu_mpi_redux_threshold < 0 || (_starpu_mpi_redux_threshold > 0 && data_size < (size_t) _starpu_mpi_redux_threshold)) { _starpu_mpi_redux_tree_size = STARPU_MAXNODES; } -- GitLab From 6b97adfa1337c980ee2b201a64f89c77d4e86d0b Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Thu, 1 Dec 2022 23:05:51 +0100 Subject: [PATCH 17/24] exchange before, before free descrs --- mpi/src/starpu_mpi.c | 8 ++------ mpi/src/starpu_mpi_task_insert.c | 10 ++++++---- mpi/tests/policy_register.c | 1 - 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/mpi/src/starpu_mpi.c b/mpi/src/starpu_mpi.c index 300f79a2a3..5bb0e5e060 100644 --- a/mpi/src/starpu_mpi.c +++ b/mpi/src/starpu_mpi.c @@ -523,12 +523,8 @@ starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t data) char* starpu_mpi_data_get_redux_map(starpu_data_handle_t data) { - if (data == NULL || data->mpi_data == NULL) - { - return NULL; - } - struct _starpu_mpi_data *mpi_data = _starpu_mpi_data_get(data); - return mpi_data->redux_map; + STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data); + return ((struct _starpu_mpi_data *)(data->mpi_data))->redux_map; } int starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg) diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c index 2221659215..12a6b16d52 100644 --- a/mpi/src/starpu_mpi_task_insert.c +++ b/mpi/src/starpu_mpi_task_insert.c @@ -624,15 +624,17 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru /* Send and receive data as requested */ for(i=0 ; i<nb_data ; i++) { - char* redux_map = starpu_mpi_data_get_redux_map(descrs[i].handle); - if (redux_map != NULL && descrs[i].mode & STARPU_R && ( descrs[i].mode & ~ STARPU_REDUX || descrs[i].mode & ~ STARPU_MPI_REDUX )) + if (descrs[i].handle && descrs[i].handle->mpi_data) { - _starpu_mpi_redux_wrapup_data(descrs[i].handle); + char* redux_map = starpu_mpi_data_get_redux_map(descrs[i].handle); + if (redux_map != NULL && descrs[i].mode & STARPU_R && ( descrs[i].mode & ~ STARPU_REDUX || descrs[i].mode & ~ STARPU_MPI_REDUX )) + { + _starpu_mpi_redux_wrapup_data(descrs[i].handle); + } } _starpu_mpi_exchange_data_before_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm); } - if (xrank_p) *xrank_p = xrank; if (nb_data_p) diff --git a/mpi/tests/policy_register.c b/mpi/tests/policy_register.c index 6b7af3c187..6549f74555 100644 --- a/mpi/tests/policy_register.c +++ b/mpi/tests/policy_register.c @@ -92,7 +92,6 @@ int main(int argc, char **argv) policy = starpu_mpi_node_selection_register_policy(starpu_mpi_select_node_my_policy_1); starpu_mpi_node_selection_set_current_policy(policy); - task = starpu_mpi_task_build(MPI_COMM_WORLD, &mycodelet, STARPU_W, handles[0], STARPU_W, handles[1], 0); -- GitLab From 9f9799968eddb4e37795c650cef61575ac19e348 Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Fri, 2 Dec 2022 10:47:47 +0100 Subject: [PATCH 18/24] automatic/user precision --- doc/doxygen/chapters/starpu_extensions/mpi_support.doxy | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy b/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy index d33fe10009..9ee4dd511e 100644 --- a/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy +++ b/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy @@ -826,7 +826,9 @@ The setup and use of \c STARPU_MPI_REDUX is similar to \c STARPU_REDUX : the ini reduction codelets should be declared through starpu_data_set_reduction_methods() in the same fashion as \c STARPU_REDUX. The function starpu_mpi_redux_data() is automatically called either when a task reading the reduced handle is inserted through the MPI layer of StarPU through -starpu_mpi_insert_task() or when the user waits for all communications and tasks to be executed through starpu_mpi_wait_for_all(). +starpu_mpi_insert_task() or when the user waits for all communications and tasks to be executed +through starpu_mpi_wait_for_all(). The function can be called by the user to fine-tune arguments +such as the priority of the reduction tasks. Tasks contributing to the inter-node reduction should be registered as accessing the contribution through \c STARPU_RW|STARPU_COMMUTE mode, as for the \c STARPU_REDUX mode, as in the following example. -- GitLab From 84d9efc1d19bcb47b826fcb6fed01683e8493c56 Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Mon, 5 Dec 2022 14:06:02 +0100 Subject: [PATCH 19/24] fix link + revert policy_register --- doc/doxygen/chapters/starpu_extensions/mpi_support.doxy | 2 +- mpi/tests/policy_register.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy b/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy index 9ee4dd511e..a2dad06d7e 100644 --- a/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy +++ b/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy @@ -822,7 +822,7 @@ core on contributing nodes spawns their own copy to work with. In the case that required reductions are too numerous and expensive, the access mode \c STARPU_MPI_REDUX tells StarPU to spawn only one contribution per contributing node. -The setup and use of \c STARPU_MPI_REDUX is similar to \c STARPU_REDUX : the initialization and +The setup and use of :: STARPU_MPI_REDUX is similar to :: STARPU_REDUX : the initialization and reduction codelets should be declared through starpu_data_set_reduction_methods() in the same fashion as \c STARPU_REDUX. The function starpu_mpi_redux_data() is automatically called either when a task reading the reduced handle is inserted through the MPI layer of StarPU through diff --git a/mpi/tests/policy_register.c b/mpi/tests/policy_register.c index 6549f74555..6b7af3c187 100644 --- a/mpi/tests/policy_register.c +++ b/mpi/tests/policy_register.c @@ -92,6 +92,7 @@ int main(int argc, char **argv) policy = starpu_mpi_node_selection_register_policy(starpu_mpi_select_node_my_policy_1); starpu_mpi_node_selection_set_current_policy(policy); + task = starpu_mpi_task_build(MPI_COMM_WORLD, &mycodelet, STARPU_W, handles[0], STARPU_W, handles[1], 0); -- GitLab From 6e54c9ae736a223d23e1ca0c7fba716df4173c1a Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Mon, 5 Dec 2022 16:23:00 +0100 Subject: [PATCH 20/24] ':: '->'::' --- doc/doxygen/chapters/starpu_extensions/mpi_support.doxy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy b/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy index a2dad06d7e..21805e7ed1 100644 --- a/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy +++ b/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy @@ -822,7 +822,7 @@ core on contributing nodes spawns their own copy to work with. In the case that required reductions are too numerous and expensive, the access mode \c STARPU_MPI_REDUX tells StarPU to spawn only one contribution per contributing node. -The setup and use of :: STARPU_MPI_REDUX is similar to :: STARPU_REDUX : the initialization and +The setup and use of ::STARPU_MPI_REDUX is similar to ::STARPU_REDUX : the initialization and reduction codelets should be declared through starpu_data_set_reduction_methods() in the same fashion as \c STARPU_REDUX. The function starpu_mpi_redux_data() is automatically called either when a task reading the reduced handle is inserted through the MPI layer of StarPU through -- GitLab From b734b0768147ea73f71616ba12f32c80482026ff Mon Sep 17 00:00:00 2001 From: Nathalie Furmento <nathalie.furmento@labri.fr> Date: Mon, 5 Dec 2022 16:29:59 +0100 Subject: [PATCH 21/24] doc: small typos --- .../chapters/starpu_extensions/mpi_support.doxy | 14 +++++++------- .../starpu_installation/environment_variables.doxy | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy b/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy index 21805e7ed1..eb50d92d15 100644 --- a/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy +++ b/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy @@ -817,21 +817,21 @@ The data can then be used just like per-node above. \section MPIMpiRedux Inter-node reduction One might want to leverage a reduction pattern across several nodes. -Using \c STARPU_REDUX (see \ref DataReduction), one can obtain such patterns where each +Using ::STARPU_REDUX (see \ref DataReduction), one can obtain such patterns where each core on contributing nodes spawns their own copy to work with. In the case that the -required reductions are too numerous and expensive, the access mode \c STARPU_MPI_REDUX +required reductions are too numerous and expensive, the access mode ::STARPU_MPI_REDUX tells StarPU to spawn only one contribution per contributing node. The setup and use of ::STARPU_MPI_REDUX is similar to ::STARPU_REDUX : the initialization and reduction codelets should be declared through starpu_data_set_reduction_methods() in the -same fashion as \c STARPU_REDUX. The function starpu_mpi_redux_data() is automatically called +same fashion as ::STARPU_REDUX. The function starpu_mpi_redux_data() is automatically called either when a task reading the reduced handle is inserted through the MPI layer of StarPU through -starpu_mpi_insert_task() or when the user waits for all communications and tasks to be executed +starpu_mpi_insert_task() or when the user waits for all communications and tasks to be executed through starpu_mpi_wait_for_all(). The function can be called by the user to fine-tune arguments such as the priority of the reduction tasks. Tasks contributing to the inter-node reduction should be registered as -accessing the contribution through \c STARPU_RW|STARPU_COMMUTE mode, as for the -\c STARPU_REDUX mode, as in the following example. +accessing the contribution through ::STARPU_RW|::STARPU_COMMUTE mode, as for the +::STARPU_REDUX mode, as in the following example. \code{.c} static struct starpu_codelet contrib_cl = @@ -853,7 +853,7 @@ starpu_mpi_task_insert(MPI_COMM_WORLD, &contrib_cl, STARPU_MPI_REDUX, data, STAR Note that if the specified node is set to \c -1, the option is ignored. -More examples are available at mpi/examples/mpi_redux/mpi_redux.c and mpi/examples/mpi_redux/mpi_redux_tree.c. +More examples are available at \c mpi/examples/mpi_redux/mpi_redux.c and \c mpi/examples/mpi_redux/mpi_redux_tree.c. \section MPIPriorities Priorities diff --git a/doc/doxygen/chapters/starpu_installation/environment_variables.doxy b/doc/doxygen/chapters/starpu_installation/environment_variables.doxy index 1ca8b51bd2..b234925dd7 100644 --- a/doc/doxygen/chapters/starpu_installation/environment_variables.doxy +++ b/doc/doxygen/chapters/starpu_installation/environment_variables.doxy @@ -660,14 +660,14 @@ This variable allows to enable (1) MPI GPUDirect support or not (0). The default <dd> \anchor STARPU_MPI_REDUX_ARITY_THRESHOLD \addindex __env__STARPU_MPI_REDUX_ARITY_THRESHOLD -The arity of the automatically detected reduction trees follows the following +The arity of the automatically-detected reduction trees follows the following rule: when the data to be reduced is of small size a flat tree is unrolled i.e. all the contributing nodes send their contribution to the root of the reduction. When the data to be reduced is of big size, a binary tree is used instead. -The default threshold between flat and binary tree is 1024 bytes. By setting this -value to be negative, all the automatically detected reduction trees will use flat trees. +The default threshold between flat and binary tree is 1024 bytes. By setting the environment +variable with a negative value, all the automatically detected reduction trees will use flat trees. If this value is set to 0, then binary trees will always be selected. Otherwise, -the setup value replaces the default 1024. +the setup value replaces the default 1024. </dd> </dl> -- GitLab From 633ef8e0ab51f28f104308eeebe00692eab87c1c Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Mon, 5 Dec 2022 16:46:37 +0100 Subject: [PATCH 22/24] changelog: small change _ automatic wrapup / small feature _ env var ARITY THRESHOLD --- ChangeLog | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ChangeLog b/ChangeLog index 68972eaf94..4580c1b26b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -130,6 +130,8 @@ Small features: * Add STARPU_SIMGRID_TASK_PUSH_COST environment variable. * Add starpu_memory_nodes_get_count_by_kind and starpu_memory_node_get_ids_by_type. + * Add STARPU_MPI_REDUX_ARITY_THRESHOLD to tune the type of three used in + distributed-memory reduction patterns that are automatically detected. Changes: * The redux codelet should expose the STARPU_COMMUTE flag, since StarPU @@ -161,6 +163,8 @@ Small changes: unsigned to int, to explicit that it may be -1. * Value 0 for STARPU_MPI_NDETACHED_SEND and STARPU_MPI_NREADY_PROCESS will now disable their behaviour. + * Distrbuted-memory reduction patterns are automatically wrapped-up if the user + do not call starpu_mpi_redux_data StarPU 1.3.10 ==================================================================== -- GitLab From 1571eef57554dd0de2edc0a1b086a706269ef4c7 Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Mon, 5 Dec 2022 16:50:08 +0100 Subject: [PATCH 23/24] typo --- ChangeLog | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index 4580c1b26b..7bba22233e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -130,7 +130,7 @@ Small features: * Add STARPU_SIMGRID_TASK_PUSH_COST environment variable. * Add starpu_memory_nodes_get_count_by_kind and starpu_memory_node_get_ids_by_type. - * Add STARPU_MPI_REDUX_ARITY_THRESHOLD to tune the type of three used in + * Add STARPU_MPI_REDUX_ARITY_THRESHOLD to tune the type of tree used in distributed-memory reduction patterns that are automatically detected. Changes: @@ -163,8 +163,8 @@ Small changes: unsigned to int, to explicit that it may be -1. * Value 0 for STARPU_MPI_NDETACHED_SEND and STARPU_MPI_NREADY_PROCESS will now disable their behaviour. - * Distrbuted-memory reduction patterns are automatically wrapped-up if the user - do not call starpu_mpi_redux_data + * Distributed-memory reduction patterns are automatically wrapped-up if the user + do not call starpu_mpi_redux_data() StarPU 1.3.10 ==================================================================== -- GitLab From dd047ad65b6935a6e1580d40bfe55a2889761f74 Mon Sep 17 00:00:00 2001 From: Nathalie Furmento <nathalie.furmento@labri.fr> Date: Wed, 4 Jan 2023 12:47:09 +0100 Subject: [PATCH 24/24] mpi/examples/mpi_redux/: fix printing --- mpi/examples/mpi_redux/mpi_redux.c | 16 ++++++------- mpi/examples/mpi_redux/mpi_redux_autowrapup.c | 24 +++++++++---------- mpi/examples/mpi_redux/mpi_redux_tree.c | 16 ++++++------- 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/mpi/examples/mpi_redux/mpi_redux.c b/mpi/examples/mpi_redux/mpi_redux.c index d203931bd3..bb445dff67 100644 --- a/mpi/examples/mpi_redux/mpi_redux.c +++ b/mpi/examples/mpi_redux/mpi_redux.c @@ -1,6 +1,6 @@ /* StarPU --- Runtime system for heterogeneous multicore architectures. * - * Copyright (C) 2016-2022 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria + * Copyright (C) 2016-2023 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria * * StarPU is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by @@ -40,9 +40,9 @@ static void cl_cpu_work(void *handles[], void*arg) double *a = (double *)STARPU_VARIABLE_GET_PTR(handles[0]); double *b = (double *)STARPU_VARIABLE_GET_PTR(handles[1]); starpu_sleep(0.01); - printf("work_cl (rank:%d,worker:%d) %f =>",starpu_mpi_world_rank(), starpu_worker_get_id(), *a); + FPRINTF(stderr, "work_cl (rank:%d,worker:%d) %f =>",starpu_mpi_world_rank(), starpu_worker_get_id(), *a); *a = 3.0 + *a + *b; - printf("%f\n",*a); + FPRINTF(stderr, "%f\n",*a); } static struct starpu_codelet work_cl = @@ -66,7 +66,7 @@ static void cl_cpu_task_init(void *handles[], void*arg) (void) arg; double *a = (double *)STARPU_VARIABLE_GET_PTR(handles[0]); starpu_sleep(0.005); - printf("init_cl (rank:%d,worker:%d) %d (was %f)\n", starpu_mpi_world_rank(), starpu_worker_get_id(), starpu_mpi_world_rank(), + FPRINTF(stderr, "init_cl (rank:%d,worker:%d) %d (was %f)\n", starpu_mpi_world_rank(), starpu_worker_get_id(), starpu_mpi_world_rank(), #ifdef STARPU_HAVE_VALGRIND_H RUNNING_ON_VALGRIND ? 0. : #endif @@ -88,7 +88,7 @@ static void cl_cpu_task_red(void *handles[], void*arg) double *ad = (double *)STARPU_VARIABLE_GET_PTR(handles[0]); double *as = (double *)STARPU_VARIABLE_GET_PTR(handles[1]); starpu_sleep(0.01); - printf("red_cl (rank:%d,worker:%d) %f ; %f --> %f\n", starpu_mpi_world_rank(), starpu_worker_get_id(), *as, *ad, *as+*ad); + FPRINTF(stderr, "red_cl (rank:%d,worker:%d) %f ; %f --> %f\n", starpu_mpi_world_rank(), starpu_worker_get_id(), *as, *ad, *as+*ad); *ad = *ad + *as; } @@ -140,7 +140,7 @@ int main(int argc, char *argv[]) if (comm_rank == 0) { a = 1.0; - printf("init a = %f\n", a); + FPRINTF(stderr, "init a = %f\n", a); starpu_variable_data_register(&a_h, STARPU_MAIN_RAM, (uintptr_t)&a, sizeof(double)); for (j=0;j<comm_size;j++) starpu_variable_data_register(&b_h[j], -1, 0, sizeof(double)); @@ -148,7 +148,7 @@ int main(int argc, char *argv[]) else { b[comm_rank] = 1.0 / (comm_rank + 1.0); - printf("init b_%d = %f\n", comm_rank, b[comm_rank]); + FPRINTF(stderr, "init b_%d = %f\n", comm_rank, b[comm_rank]); starpu_variable_data_register(&a_h, -1, 0, sizeof(double)); for (j=0;j<comm_size;j++) { @@ -197,7 +197,7 @@ int main(int argc, char *argv[]) tmp1 += 1.0 / (work_node + 1.0); tmp2 += (nworkers - 1.0)*work_node*i; } - printf("computed result ---> %f expected %f\n", a, + FPRINTF(stderr, "computed result ---> %f expected %f\n", a, 1.0 + (comm_size - 1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1)*3.0 + tmp1) + tmp2); } starpu_data_unregister(a_h); diff --git a/mpi/examples/mpi_redux/mpi_redux_autowrapup.c b/mpi/examples/mpi_redux/mpi_redux_autowrapup.c index 5dee0344e5..386c892c1f 100644 --- a/mpi/examples/mpi_redux/mpi_redux_autowrapup.c +++ b/mpi/examples/mpi_redux/mpi_redux_autowrapup.c @@ -1,6 +1,6 @@ /* StarPU --- Runtime system for heterogeneous multicore architectures. * - * Copyright (C) 2016-2022 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria + * Copyright (C) 2016-2023 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria * * StarPU is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by @@ -20,7 +20,7 @@ * It iterates over multiple ways to wrap-up reduction patterns : either by * - waiting for all mpi + tasks * - calling mpi_redux yourself - * - inserting a reading task on the handle to reduce + * - inserting a reading task on the handle to reduce */ #include <stdlib.h> @@ -51,9 +51,9 @@ static void cl_cpu_work(void *handles[], void*arg) double *a = (double *)STARPU_VARIABLE_GET_PTR(handles[0]); double *b = (double *)STARPU_VARIABLE_GET_PTR(handles[1]); starpu_sleep(0.01); - printf("work_cl (rank:%d,worker:%d) %f =>",starpu_mpi_world_rank(), starpu_worker_get_id(), *a); + FPRINTF(stderr, "work_cl (rank:%d,worker:%d) %f =>",starpu_mpi_world_rank(), starpu_worker_get_id(), *a); *a = 3.0 + *a + *b; - printf("%f\n",*a); + FPRINTF(stderr, "%f\n",*a); } static struct starpu_codelet work_cl = @@ -77,7 +77,7 @@ static void cl_cpu_task_init(void *handles[], void*arg) (void) arg; double *a = (double *)STARPU_VARIABLE_GET_PTR(handles[0]); starpu_sleep(0.005); - printf("init_cl (rank:%d,worker:%d) %d (was %f)\n", starpu_mpi_world_rank(), starpu_worker_get_id(), starpu_mpi_world_rank(), + FPRINTF(stderr, "init_cl (rank:%d,worker:%d) %d (was %f)\n", starpu_mpi_world_rank(), starpu_worker_get_id(), starpu_mpi_world_rank(), #ifdef STARPU_HAVE_VALGRIND_H RUNNING_ON_VALGRIND ? 0. : #endif @@ -99,7 +99,7 @@ static void cl_cpu_task_red(void *handles[], void*arg) double *ad = (double *)STARPU_VARIABLE_GET_PTR(handles[0]); double *as = (double *)STARPU_VARIABLE_GET_PTR(handles[1]); starpu_sleep(0.01); - printf("red_cl (rank:%d,worker:%d) %f ; %f --> %f\n", starpu_mpi_world_rank(), starpu_worker_get_id(), *as, *ad, *as+*ad); + FPRINTF(stderr, "red_cl (rank:%d,worker:%d) %f ; %f --> %f\n", starpu_mpi_world_rank(), starpu_worker_get_id(), *as, *ad, *as+*ad); *ad = *ad + *as; } @@ -126,7 +126,7 @@ int main(int argc, char *argv[]) starpu_mpi_shutdown(); return STARPU_TEST_SKIPPED; } - printf("there are %d workers\n", nworkers); + FPRINTF(stderr, "there are %d workers\n", nworkers); starpu_mpi_comm_size(MPI_COMM_WORLD, &comm_size); if (comm_size < 2) { @@ -154,7 +154,7 @@ int main(int argc, char *argv[]) if (comm_rank == 0) { a = 1.0; - printf("init a = %f\n", a); + FPRINTF(stderr, "init a = %f\n", a); starpu_variable_data_register(&a_h, STARPU_MAIN_RAM, (uintptr_t)&a, sizeof(double)); for (j=0;j<comm_size;j++) starpu_variable_data_register(&b_h[j], -1, 0, sizeof(double)); @@ -162,7 +162,7 @@ int main(int argc, char *argv[]) else { b[comm_rank] = 1.0 / (comm_rank + 1.0); - printf("init b_%d = %f\n", comm_rank, b[comm_rank]); + FPRINTF(stderr, "init b_%d = %f\n", comm_rank, b[comm_rank]); starpu_variable_data_register(&a_h, -1, 0, sizeof(double)); for (j=0;j<comm_size;j++) { @@ -198,7 +198,7 @@ int main(int argc, char *argv[]) 0); } } - if (wrapup == 0) + if (wrapup == 0) { ret = starpu_mpi_redux_data(MPI_COMM_WORLD, a_h); STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_redux_data"); @@ -214,12 +214,12 @@ int main(int argc, char *argv[]) { double tmp1 = 0.0; double tmp2 = 0.0; - for (work_node = 1; work_node < comm_size ; work_node++) + for (work_node = 1; work_node < comm_size ; work_node++) { tmp1 += 1.0 / (work_node + 1.0); tmp2 += (nworkers - 1.0)*work_node*i; } - printf("computed result ---> %f expected %f\n", a, + FPRINTF(stderr, "computed result ---> %f expected %f\n", a, 1.0 + (comm_size - 1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1)*3.0 + tmp1) + tmp2); } starpu_data_unregister(a_h); diff --git a/mpi/examples/mpi_redux/mpi_redux_tree.c b/mpi/examples/mpi_redux/mpi_redux_tree.c index 0dc301f3ae..7588d97a85 100644 --- a/mpi/examples/mpi_redux/mpi_redux_tree.c +++ b/mpi/examples/mpi_redux/mpi_redux_tree.c @@ -1,6 +1,6 @@ /* StarPU --- Runtime system for heterogeneous multicore architectures. * - * Copyright (C) 2016-2022 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria + * Copyright (C) 2016-2023 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria * * StarPU is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by @@ -40,9 +40,9 @@ static void cl_cpu_work(void *handles[], void*arg) double *a = (double *)STARPU_VARIABLE_GET_PTR(handles[0]); double *b = (double *)STARPU_VARIABLE_GET_PTR(handles[1]); starpu_sleep(0.01); - printf("work_cl (rank:%d,worker:%d) %f =>",starpu_mpi_world_rank(), starpu_worker_get_id(), *a); + FPRINTF(stderr, "work_cl (rank:%d,worker:%d) %f =>",starpu_mpi_world_rank(), starpu_worker_get_id(), *a); *a = 3.0 + *a + *b; - printf("%f\n",*a); + FPRINTF(stderr, "%f\n",*a); } static struct starpu_codelet work_cl = @@ -66,7 +66,7 @@ static void cl_cpu_task_init(void *handles[], void*arg) (void) arg; double *a = (double *)STARPU_VARIABLE_GET_PTR(handles[0]); starpu_sleep(0.005); - printf("init_cl (rank:%d,worker:%d) %d (was %f)\n", starpu_mpi_world_rank(), starpu_worker_get_id(), starpu_mpi_world_rank(), + FPRINTF(stderr, "init_cl (rank:%d,worker:%d) %d (was %f)\n", starpu_mpi_world_rank(), starpu_worker_get_id(), starpu_mpi_world_rank(), #ifdef STARPU_HAVE_VALGRIND_H RUNNING_ON_VALGRIND ? 0. : #endif @@ -88,7 +88,7 @@ static void cl_cpu_task_red(void *handles[], void*arg) double *ad = (double *)STARPU_VARIABLE_GET_PTR(handles[0]); double *as = (double *)STARPU_VARIABLE_GET_PTR(handles[1]); starpu_sleep(0.01); - printf("red_cl (rank:%d,worker:%d) %f ; %f --> %f\n", starpu_mpi_world_rank(), starpu_worker_get_id(), *as, *ad, *as+*ad); + FPRINTF(stderr, "red_cl (rank:%d,worker:%d) %f ; %f --> %f\n", starpu_mpi_world_rank(), starpu_worker_get_id(), *as, *ad, *as+*ad); *ad = *ad + *as; } @@ -131,7 +131,7 @@ int main(int argc, char *argv[]) if (comm_rank == 0) { a = 1.0; - printf("init a = %f\n", a); + FPRINTF(stderr, "init a = %f\n", a); starpu_variable_data_register(&a_h, STARPU_MAIN_RAM, (uintptr_t)&a, sizeof(double)); for (j=0;j<comm_size;j++) starpu_variable_data_register(&b_h[j], -1, 0, sizeof(double)); @@ -139,7 +139,7 @@ int main(int argc, char *argv[]) else { b[comm_rank] = 1.0 / (comm_rank + 1.0); - printf("init b_%d = %f\n", comm_rank, b[comm_rank]); + FPRINTF(stderr, "init b_%d = %f\n", comm_rank, b[comm_rank]); starpu_variable_data_register(&a_h, -1, 0, sizeof(double)); for (j=0;j<comm_size;j++) { @@ -176,7 +176,7 @@ int main(int argc, char *argv[]) double tmp = 0.0; for (work_node = 1; work_node < comm_size ; work_node++) tmp += 1.0 / (work_node + 1.0); - printf("computed result ---> %f expected %f\n", a, 1.0 + (comm_size - 1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1)*3.0 + tmp)); + FPRINTF(stderr, "computed result ---> %f expected %f\n", a, 1.0 + (comm_size - 1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1)*3.0 + tmp)); } starpu_data_unregister(a_h); for (work_node=0; work_node < comm_size;work_node++) -- GitLab