From f07b575c2f8efb11d4ad1618661125d01a20ed29 Mon Sep 17 00:00:00 2001
From: Antoine JEGO <antoine.jego@etu.enseeiht.fr>
Date: Fri, 18 Nov 2022 19:17:19 +0100
Subject: [PATCH 01/24] initial commit

---
 mpi/examples/Makefile.am                      |   4 +
 mpi/examples/mpi_redux/mpi_redux_autowrapup.c | 234 ++++++++++++++++++
 mpi/src/starpu_mpi.c                          |   2 +
 mpi/src/starpu_mpi_task_insert.c              |  56 +++++
 mpi/src/starpu_mpi_task_insert.h              |   1 +
 5 files changed, 297 insertions(+)
 create mode 100644 mpi/examples/mpi_redux/mpi_redux_autowrapup.c

diff --git a/mpi/examples/Makefile.am b/mpi/examples/Makefile.am
index deb188c254..57426efefc 100644
--- a/mpi/examples/Makefile.am
+++ b/mpi/examples/Makefile.am
@@ -293,15 +293,19 @@ endif
 
 examplebin_PROGRAMS +=				\
 	mpi_redux/mpi_redux     		\
+	mpi_redux/mpi_redux_autowrapup  	\
 	mpi_redux/mpi_redux_tree
 
 mpi_redux_mpi_redux_LDADD =			\
 	-lm
+mpi_redux_mpi_redux_autowrapup_LDADD =			\
+	-lm
 mpi_redux_mpi_redux_tree_LDADD =		\
 	-lm
 if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES +=				\
 	mpi_redux/mpi_redux                     \
+	mpi_redux/mpi_redux_autowrapup  	\
 	mpi_redux/mpi_redux_tree
 endif
 
diff --git a/mpi/examples/mpi_redux/mpi_redux_autowrapup.c b/mpi/examples/mpi_redux/mpi_redux_autowrapup.c
new file mode 100644
index 0000000000..643fef7777
--- /dev/null
+++ b/mpi/examples/mpi_redux/mpi_redux_autowrapup.c
@@ -0,0 +1,234 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2016-2022  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example illustrates how to use the STARPU_MPI_REDUX mode
+ * and compare it with the standard STARPU_REDUX.
+ *
+ * In order to make this comparison salliant, the init codelet is not
+ * a task that set the handle to a neutral element but rather depends
+ * on the working node.
+ * This is not a proper way to use a reduction pattern however it
+ * can be analogous to the cost/weight of each contribution.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <math.h>
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include "helper.h"
+#include <unistd.h>
+
+static void cl_cpu_read(void *handles[], void*arg)
+{
+	(void) arg;
+	(void) handles;
+}
+
+static struct starpu_codelet read_cl =
+{
+	.cpu_funcs = { cl_cpu_read },
+	.nbuffers = 1,
+	.modes = { STARPU_R },
+	.name = "task_read"
+};
+static void cl_cpu_work(void *handles[], void*arg)
+{
+	(void)arg;
+	double *a = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
+	double *b = (double *)STARPU_VARIABLE_GET_PTR(handles[1]);
+	starpu_sleep(0.01);
+	printf("work_cl (rank:%d,worker:%d) %f =>",starpu_mpi_world_rank(), starpu_worker_get_id(), *a);
+	*a = 3.0 + *a + *b;
+	printf("%f\n",*a);
+}
+
+static struct starpu_codelet work_cl =
+{
+	.cpu_funcs = { cl_cpu_work },
+	.nbuffers = 2,
+	.modes = { STARPU_REDUX, STARPU_R },
+	.name = "task_init"
+};
+
+static struct starpu_codelet mpi_work_cl =
+{
+	.cpu_funcs = { cl_cpu_work },
+	.nbuffers = 2,
+	.modes = { STARPU_RW | STARPU_COMMUTE, STARPU_R },
+	.name = "task_init-mpi"
+};
+
+static void cl_cpu_task_init(void *handles[], void*arg)
+{
+	(void) arg;
+	double *a = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
+	starpu_sleep(0.005);
+	printf("init_cl (rank:%d,worker:%d) %d (was %f)\n", starpu_mpi_world_rank(), starpu_worker_get_id(), starpu_mpi_world_rank(),
+#ifdef STARPU_HAVE_VALGRIND_H
+			RUNNING_ON_VALGRIND ? 0. :
+#endif
+			*a);
+	*a = starpu_mpi_world_rank();
+}
+
+static struct starpu_codelet task_init_cl =
+{
+	.cpu_funcs = { cl_cpu_task_init },
+	.nbuffers = 1,
+	.modes = { STARPU_W },
+	.name = "task_init"
+};
+
+static void cl_cpu_task_red(void *handles[], void*arg)
+{
+	(void) arg;
+	double *ad = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
+	double *as = (double *)STARPU_VARIABLE_GET_PTR(handles[1]);
+	starpu_sleep(0.01);
+	printf("red_cl (rank:%d,worker:%d) %f ; %f --> %f\n", starpu_mpi_world_rank(), starpu_worker_get_id(), *as, *ad, *as+*ad);
+	*ad = *ad + *as;
+}
+
+static struct starpu_codelet task_red_cl =
+{
+	.cpu_funcs = { cl_cpu_task_red },
+	.nbuffers = 2,
+	.modes = { STARPU_RW|STARPU_COMMUTE, STARPU_R },
+	.name = "task_red"
+};
+
+int main(int argc, char *argv[])
+{
+	int comm_rank, comm_size;
+	/* Initializes STarPU and the StarPU-MPI layer */
+	starpu_fxt_autostart_profiling(0);
+	int ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_ini_conft");
+
+	int nworkers = starpu_cpu_worker_get_count();
+	if (nworkers < 2)
+	{
+		FPRINTF(stderr, "We need at least 2 CPU worker per node.\n");
+		starpu_mpi_shutdown();
+		return STARPU_TEST_SKIPPED;
+	}
+	printf("there are %d workers\n", nworkers);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &comm_size);
+	if (comm_size < 2)
+	{
+		FPRINTF(stderr, "We need at least 2 nodes.\n");
+		starpu_mpi_shutdown();
+		return STARPU_TEST_SKIPPED;
+	}
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &comm_rank);
+
+	double a, b[comm_size];
+	starpu_data_handle_t a_h, b_h[comm_size];
+	double work_coef = 2;
+	enum starpu_data_access_mode task_mode;
+	int wrapup,i,j,work_node;
+	starpu_mpi_tag_t tag = 0;
+	for (wrapup = 0; wrapup <= 2; wrapup ++)
+	{
+		for (i = 0 ; i < 2 ; i++)
+		{
+			starpu_mpi_barrier(MPI_COMM_WORLD);
+			if (i==0)
+				task_mode = STARPU_MPI_REDUX;
+			else
+				task_mode = STARPU_REDUX;
+			if (comm_rank == 0)
+			{
+				a = 1.0;
+				printf("init a = %f\n", a);
+				starpu_variable_data_register(&a_h, STARPU_MAIN_RAM, (uintptr_t)&a, sizeof(double));
+				for (j=0;j<comm_size;j++)
+					starpu_variable_data_register(&b_h[j], -1, 0, sizeof(double));
+			}
+			else
+			{
+				b[comm_rank] = 1.0 / (comm_rank + 1.0);
+				printf("init b_%d = %f\n", comm_rank, b[comm_rank]);
+				starpu_variable_data_register(&a_h, -1, 0, sizeof(double));
+				for (j=0;j<comm_size;j++)
+				{
+					if (j == comm_rank)
+						starpu_variable_data_register(&b_h[j], STARPU_MAIN_RAM, (uintptr_t)&b[j], sizeof(double));
+					else
+						starpu_variable_data_register(&b_h[j], -1, 0, sizeof(double));
+				}
+			}
+			starpu_mpi_data_register(a_h, tag++, 0);
+			for (j=0;j<comm_size;j++)
+				starpu_mpi_data_register(b_h[j], tag++, j);
+
+			starpu_data_set_reduction_methods(a_h, &task_red_cl, &task_init_cl);
+			starpu_fxt_start_profiling();
+			for (work_node=1; work_node < comm_size;work_node++)
+			{
+				for (j=1;j<=work_coef*nworkers;j++)
+				{
+					if (i == 0)
+					    starpu_mpi_task_insert(MPI_COMM_WORLD,
+						&mpi_work_cl,
+						task_mode, a_h,
+						STARPU_R, b_h[work_node],
+						STARPU_EXECUTE_ON_NODE, work_node,
+						0);
+					else
+					    starpu_mpi_task_insert(MPI_COMM_WORLD,
+						&work_cl,
+						task_mode, a_h,
+						STARPU_R, b_h[work_node],
+						STARPU_EXECUTE_ON_NODE, work_node,
+						0);
+				}
+			}
+  			if (wrapup == 0) 
+			{
+				ret = starpu_mpi_redux_data(MPI_COMM_WORLD, a_h);
+				STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_redux_data");
+			}
+			else if (wrapup == 1)
+			{
+				starpu_mpi_task_insert(MPI_COMM_WORLD,
+				    &read_cl, STARPU_R, a_h, 0);
+			}
+			starpu_mpi_wait_for_all(MPI_COMM_WORLD);
+			starpu_mpi_barrier(MPI_COMM_WORLD);
+			if (comm_rank == 0)
+			{
+				double tmp1 = 0.0;
+				double tmp2 = 0.0;
+				for (work_node = 1; work_node < comm_size ; work_node++) {
+					tmp1 += 1.0 / (work_node + 1.0);
+					tmp2 += (nworkers - 1.0)*work_node*i;
+				}
+				printf("computed result ---> %f expected %f\n", a, 
+					1.0 + (comm_size - 1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1)*3.0 + tmp1) + tmp2);
+			}
+			starpu_data_unregister(a_h);
+			for (work_node=0; work_node < comm_size;work_node++)
+				starpu_data_unregister(b_h[work_node]);
+			starpu_mpi_barrier(MPI_COMM_WORLD);
+		}
+	}
+	starpu_mpi_shutdown();
+	return 0;
+}
diff --git a/mpi/src/starpu_mpi.c b/mpi/src/starpu_mpi.c
index 487dcdea97..0eaa3b7ca0 100644
--- a/mpi/src/starpu_mpi.c
+++ b/mpi/src/starpu_mpi.c
@@ -22,6 +22,7 @@
 #include <starpu_mpi_private.h>
 #include <starpu_mpi_cache.h>
 #include <starpu_profiling.h>
+#include <starpu_mpi_task_insert.h>
 #include <starpu_mpi_stats.h>
 #include <starpu_mpi_cache.h>
 #include <starpu_mpi_select_node.h>
@@ -645,6 +646,7 @@ void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t data, int new_r
 
 int starpu_mpi_wait_for_all(MPI_Comm comm)
 {
+	_starpu_mpi_redux_wrapup_datas();
 	return _mpi_backend._starpu_mpi_backend_wait_for_all(comm);
 }
 
diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c
index 521e3bea72..33f095f70c 100644
--- a/mpi/src/starpu_mpi_task_insert.c
+++ b/mpi/src/starpu_mpi_task_insert.c
@@ -21,6 +21,7 @@
 #include <starpu.h>
 #include <starpu_data.h>
 #include <common/utils.h>
+#include <common/uthash.h>
 #include <util/starpu_task_insert_utils.h>
 #include <datawizard/coherency.h>
 #include <core/task.h>
@@ -42,6 +43,17 @@
 
 static void (*pre_submit_hook)(struct starpu_task *task) = NULL;
 
+/* reduction wrap-up */
+// entry in the table
+struct _starpu_redux_data_entry
+{
+	UT_hash_handle hh;
+	starpu_data_handle_t data_handle;
+};
+// the table
+static struct _starpu_redux_data_entry *_redux_data = NULL;
+void _starpu_mpi_redux_wrapup_data(starpu_data_handle_t data_handle);
+
 int starpu_mpi_pre_submit_hook_register(void (*f)(struct starpu_task *))
 {
 	if (pre_submit_hook)
@@ -651,6 +663,11 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
 	/* Send and receive data as requested */
 	for(i=0 ; i<nb_data ; i++)
 	{
+		struct _starpu_mpi_data *mpi_data = (struct _starpu_mpi_data *) descrs[i].handle->mpi_data;
+		if (mpi_data->redux_map != NULL  && descrs[i].mode & STARPU_R && ( descrs[i].mode & ~ STARPU_REDUX || descrs[i].mode & ~ STARPU_MPI_REDUX ))
+		{
+			_starpu_mpi_redux_wrapup_data(descrs[i].handle);			
+		}
 		_starpu_mpi_exchange_data_before_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm);
 	}
 
@@ -690,6 +707,15 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struc
 				_STARPU_CALLOC(mpi_data->redux_map, size, sizeof(mpi_data->redux_map[0]));
 			mpi_data->redux_map [xrank] = 1;
 			mpi_data->redux_map [rrank] = 1;
+			struct _starpu_redux_data_entry *entry;
+			HASH_FIND_PTR(_redux_data, &descrs[i].handle, entry);
+			if (entry == NULL)
+			{
+				_STARPU_MPI_MALLOC(entry, sizeof(*entry));
+				starpu_data_handle_t data_handle = descrs[i].handle;
+				entry->data_handle = data_handle;
+				HASH_ADD_PTR(_redux_data, data_handle, entry);
+			}
 		}
 		_starpu_mpi_exchange_data_after_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm);
 		_starpu_mpi_clear_data_after_execution(descrs[i].handle, descrs[i].mode, me, do_execute);
@@ -968,6 +994,7 @@ int starpu_mpi_redux_data_prio_tree(MPI_Comm comm, starpu_data_handle_t data_han
 	{
 		arity = nb_contrib;
 	}
+	arity = STARPU_MIN(arity,nb_contrib);
 	_STARPU_MPI_DEBUG(5, "There is %d contributors\n", nb_contrib);
 	int contributors[nb_contrib];
 	int reducing_node;
@@ -1074,6 +1101,14 @@ int starpu_mpi_redux_data_prio_tree(MPI_Comm comm, starpu_data_handle_t data_han
 		current_level++;
 #endif
 	}
+ 	
+	struct _starpu_redux_data_entry *entry;
+	HASH_FIND_PTR(_redux_data, &data_handle, entry);
+	if (entry != NULL) 
+	{
+		HASH_DEL(_redux_data, entry);
+		free(entry);
+	}
 	return 0;
 }
 
@@ -1112,3 +1147,24 @@ int starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 	}
 	return starpu_mpi_redux_data_prio_tree(comm, data_handle, prio, nb_contrib);
 }
+
+void _starpu_mpi_redux_wrapup_data(starpu_data_handle_t data_handle) {
+	size_t data_size = starpu_data_get_size(data_handle);
+ 	// Small data => flat tree | binary tree
+	int _starpu_mpi_redux_tree_size = starpu_getenv_number_default("STARPU_MPI_REDUXTREE_SIZE", data_size < 1024 ? STARPU_MAXNODES : 2);
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
+	starpu_mpi_redux_data_tree(mpi_data->node_tag.node.comm,data_handle,_starpu_mpi_redux_tree_size);
+	return;
+}
+
+void _starpu_mpi_redux_wrapup_datas() {
+ 	struct _starpu_redux_data_entry *entry = NULL, *tmp = NULL;
+ 	HASH_ITER(hh, _redux_data, entry, tmp)
+	{
+		_starpu_mpi_redux_wrapup_data(entry->data_handle);
+		HASH_DEL(_redux_data, entry);
+		free(entry);
+	}
+ 	return;
+}
+
diff --git a/mpi/src/starpu_mpi_task_insert.h b/mpi/src/starpu_mpi_task_insert.h
index 9453c84f34..cbfda7cdfe 100644
--- a/mpi/src/starpu_mpi_task_insert.h
+++ b/mpi/src/starpu_mpi_task_insert.h
@@ -27,6 +27,7 @@ extern "C"
 int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int *do_execute, int *inconsistent_execute, int *xrank);
 int _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, int prio, MPI_Comm comm);
 int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struct starpu_data_descr *descrs, int nb_data, int prio);
+void _starpu_mpi_redux_wrapup_datas();
 
 #ifdef __cplusplus
 }
-- 
GitLab


From 790344f05ae797765b90c4136f2577ebb4b7cfe6 Mon Sep 17 00:00:00 2001
From: Antoine JEGO <antoine.jego@etu.enseeiht.fr>
Date: Sun, 20 Nov 2022 15:14:35 +0100
Subject: [PATCH 02/24] free/del_entry is clearer ; fix mpi_reduction test to
 actually use all nodes based on ownership of handles

---
 mpi/src/starpu_mpi.c             |  5 ++++-
 mpi/src/starpu_mpi_task_insert.c | 29 +++++++++++++++++------------
 mpi/tests/mpi_reduction.c        |  1 +
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/mpi/src/starpu_mpi.c b/mpi/src/starpu_mpi.c
index 0eaa3b7ca0..3608fd4d54 100644
--- a/mpi/src/starpu_mpi.c
+++ b/mpi/src/starpu_mpi.c
@@ -453,7 +453,10 @@ void _starpu_mpi_data_clear(starpu_data_handle_t data_handle)
 	_mpi_backend._starpu_mpi_backend_data_clear(data_handle);
 	_starpu_mpi_cache_data_clear(data_handle);
 	_starpu_spin_destroy(&data->coop_lock);
-	free(data->redux_map);
+	if (data->redux_map != NULL)
+	{
+		free(data->redux_map);
+	}
 	data->redux_map = NULL;
 	free(data);
 }
diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c
index 33f095f70c..7aaa22f37d 100644
--- a/mpi/src/starpu_mpi_task_insert.c
+++ b/mpi/src/starpu_mpi_task_insert.c
@@ -703,19 +703,17 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struc
 			int rrank = starpu_mpi_data_get_rank(descrs[i].handle);
 			int size;
 			starpu_mpi_comm_size(comm, &size);
-			if (mpi_data->redux_map == NULL)
-				_STARPU_CALLOC(mpi_data->redux_map, size, sizeof(mpi_data->redux_map[0]));
-			mpi_data->redux_map [xrank] = 1;
-			mpi_data->redux_map [rrank] = 1;
-			struct _starpu_redux_data_entry *entry;
-			HASH_FIND_PTR(_redux_data, &descrs[i].handle, entry);
-			if (entry == NULL)
+			if (mpi_data->redux_map == NULL) 
 			{
+				_STARPU_CALLOC(mpi_data->redux_map, size, sizeof(mpi_data->redux_map[0]));
+				struct _starpu_redux_data_entry *entry;
 				_STARPU_MPI_MALLOC(entry, sizeof(*entry));
 				starpu_data_handle_t data_handle = descrs[i].handle;
 				entry->data_handle = data_handle;
 				HASH_ADD_PTR(_redux_data, data_handle, entry);
 			}
+			mpi_data->redux_map [xrank] = 1;
+			mpi_data->redux_map [rrank] = 1;
 		}
 		_starpu_mpi_exchange_data_after_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm);
 		_starpu_mpi_clear_data_after_execution(descrs[i].handle, descrs[i].mode, me, do_execute);
@@ -943,8 +941,6 @@ void _starpu_mpi_redux_fill_post_sync_jobid(const void * const redux_data_args,
 	*post_sync_jobid = ((const struct _starpu_mpi_redux_data_args *) redux_data_args)->taskC_jobid;
 }
 
-/* TODO: this should rather be implicitly called by starpu_mpi_task_insert when
- *  * a data previously accessed in (MPI_)REDUX mode gets accessed in R mode. */
 int starpu_mpi_redux_data_prio_tree(MPI_Comm comm, starpu_data_handle_t data_handle, int prio, int arity)
 {
 	int me, rank, nb_nodes;
@@ -1109,6 +1105,11 @@ int starpu_mpi_redux_data_prio_tree(MPI_Comm comm, starpu_data_handle_t data_han
 		HASH_DEL(_redux_data, entry);
 		free(entry);
 	}
+        if (mpi_data->redux_map != NULL) // it should be allocated at this point anyway
+	{
+	        free(mpi_data->redux_map);
+	}
+	mpi_data->redux_map = NULL;
 	return 0;
 }
 
@@ -1153,7 +1154,13 @@ void _starpu_mpi_redux_wrapup_data(starpu_data_handle_t data_handle) {
  	// Small data => flat tree | binary tree
 	int _starpu_mpi_redux_tree_size = starpu_getenv_number_default("STARPU_MPI_REDUXTREE_SIZE", data_size < 1024 ? STARPU_MAXNODES : 2);
 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
-	starpu_mpi_redux_data_tree(mpi_data->node_tag.node.comm,data_handle,_starpu_mpi_redux_tree_size);
+	struct _starpu_redux_data_entry *entry;
+
+	HASH_FIND_PTR(_redux_data, &data_handle, entry);
+	if (entry != NULL)
+	{
+		starpu_mpi_redux_data_tree(mpi_data->node_tag.node.comm,data_handle,_starpu_mpi_redux_tree_size);
+	}
 	return;
 }
 
@@ -1162,8 +1169,6 @@ void _starpu_mpi_redux_wrapup_datas() {
  	HASH_ITER(hh, _redux_data, entry, tmp)
 	{
 		_starpu_mpi_redux_wrapup_data(entry->data_handle);
-		HASH_DEL(_redux_data, entry);
-		free(entry);
 	}
  	return;
 }
diff --git a/mpi/tests/mpi_reduction.c b/mpi/tests/mpi_reduction.c
index fb422a9ece..0fed07352f 100644
--- a/mpi/tests/mpi_reduction.c
+++ b/mpi/tests/mpi_reduction.c
@@ -164,6 +164,7 @@ int main(int argc, char **argv)
 					       &dot_codelet,
 					       STARPU_R, handles[x],
 					       STARPU_REDUX, dot_handle,
+					       STARPU_EXECUTE_ON_DATA, handles[x],
 					       0);
 		}
 		ret = starpu_mpi_redux_data(MPI_COMM_WORLD, dot_handle);
-- 
GitLab


From 01a32877a95134d9a971f0b9fc7e6445148ee284 Mon Sep 17 00:00:00 2001
From: Antoine JEGO <antoine.jego@etu.enseeiht.fr>
Date: Sun, 20 Nov 2022 18:50:25 +0100
Subject: [PATCH 03/24] get_redux_map accounts for NULL handle

---
 mpi/src/starpu_mpi.c             | 8 ++++++--
 mpi/src/starpu_mpi_private.h     | 2 +-
 mpi/src/starpu_mpi_task_insert.c | 4 ++--
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/mpi/src/starpu_mpi.c b/mpi/src/starpu_mpi.c
index 3608fd4d54..d9552458af 100644
--- a/mpi/src/starpu_mpi.c
+++ b/mpi/src/starpu_mpi.c
@@ -526,8 +526,12 @@ starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t data)
 
 char* starpu_mpi_data_get_redux_map(starpu_data_handle_t data)
 {
-	STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
-	return ((struct _starpu_mpi_data *)(data->mpi_data))->redux_map;
+	if (data == NULL)
+	{
+		return NULL;
+	}
+	struct _starpu_mpi_data *mpi_data = _starpu_mpi_data_get(data);
+	return mpi_data->redux_map;
 }
 
 int starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
diff --git a/mpi/src/starpu_mpi_private.h b/mpi/src/starpu_mpi_private.h
index e43f848395..4e4f3768cc 100644
--- a/mpi/src/starpu_mpi_private.h
+++ b/mpi/src/starpu_mpi_private.h
@@ -221,7 +221,7 @@ struct _starpu_mpi_data
 	unsigned int modified:1; // Whether the data has been modified since the registration.
 
 	/** Array used to store the contributing nodes to this data
-	  * when it is accessed in REDUX mode. */
+	  * when it is accessed in (MPI_)REDUX mode. */
 	char* redux_map;
 
 	/** Rendez-vous data for opportunistic cooperative sends,
diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c
index 7aaa22f37d..8350352de6 100644
--- a/mpi/src/starpu_mpi_task_insert.c
+++ b/mpi/src/starpu_mpi_task_insert.c
@@ -663,8 +663,8 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
 	/* Send and receive data as requested */
 	for(i=0 ; i<nb_data ; i++)
 	{
-		struct _starpu_mpi_data *mpi_data = (struct _starpu_mpi_data *) descrs[i].handle->mpi_data;
-		if (mpi_data->redux_map != NULL  && descrs[i].mode & STARPU_R && ( descrs[i].mode & ~ STARPU_REDUX || descrs[i].mode & ~ STARPU_MPI_REDUX ))
+		char* redux_map = starpu_mpi_data_get_redux_map(descrs[i].handle);
+		if (redux_map != NULL  && descrs[i].mode & STARPU_R && ( descrs[i].mode & ~ STARPU_REDUX || descrs[i].mode & ~ STARPU_MPI_REDUX ))
 		{
 			_starpu_mpi_redux_wrapup_data(descrs[i].handle);			
 		}
-- 
GitLab


From 096ea55f30852fbb70704a1896bca714a8385779 Mon Sep 17 00:00:00 2001
From: Antoine JEGO <antoine.jego@etu.enseeiht.fr>
Date: Mon, 21 Nov 2022 17:07:16 +0100
Subject: [PATCH 04/24] fix brackets ; fix count in mpi_redux and preamble in
 mpi_redux_autowrap

---
 mpi/examples/mpi_redux/mpi_redux.c            | 11 ++++++++---
 mpi/examples/mpi_redux/mpi_redux_autowrapup.c | 15 +++++++--------
 mpi/src/starpu_mpi_task_insert.c              |  6 ++++--
 3 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/mpi/examples/mpi_redux/mpi_redux.c b/mpi/examples/mpi_redux/mpi_redux.c
index c8f8b54c10..d203931bd3 100644
--- a/mpi/examples/mpi_redux/mpi_redux.c
+++ b/mpi/examples/mpi_redux/mpi_redux.c
@@ -190,10 +190,15 @@ int main(int argc, char *argv[])
 		starpu_mpi_barrier(MPI_COMM_WORLD);
 		if (comm_rank == 0)
 		{
-			double tmp = 0.0;
+			double tmp1 = 0.0;
+			double tmp2 = 0.0;
 			for (work_node = 1; work_node < comm_size ; work_node++)
-				tmp += 1.0 / (work_node + 1.0);
-			printf("computed result ---> %f expected %f\n", a, 1.0 + (comm_size - 1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1)*3.0 + tmp));
+			{
+				tmp1 += 1.0 / (work_node + 1.0);
+				tmp2 += (nworkers - 1.0)*work_node*i;
+			}
+			printf("computed result ---> %f expected %f\n", a, 
+				1.0 + (comm_size - 1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1)*3.0 + tmp1) + tmp2);
 		}
 		starpu_data_unregister(a_h);
 		for (work_node=0; work_node < comm_size;work_node++)
diff --git a/mpi/examples/mpi_redux/mpi_redux_autowrapup.c b/mpi/examples/mpi_redux/mpi_redux_autowrapup.c
index 643fef7777..814725aea6 100644
--- a/mpi/examples/mpi_redux/mpi_redux_autowrapup.c
+++ b/mpi/examples/mpi_redux/mpi_redux_autowrapup.c
@@ -15,14 +15,12 @@
  */
 
 /*
- * This example illustrates how to use the STARPU_MPI_REDUX mode
- * and compare it with the standard STARPU_REDUX.
+ * This example is similar to mpi_redux.c
  *
- * In order to make this comparison salliant, the init codelet is not
- * a task that set the handle to a neutral element but rather depends
- * on the working node.
- * This is not a proper way to use a reduction pattern however it
- * can be analogous to the cost/weight of each contribution.
+ * It iterates over multiple ways to wrap-up reduction patterns : either by
+ * - waiting for all mpi + tasks
+ * - calling mpi_redux yourself
+ * - inserting a reading task on the handle to reduce 
  */
 
 #include <stdlib.h>
@@ -216,7 +214,8 @@ int main(int argc, char *argv[])
 			{
 				double tmp1 = 0.0;
 				double tmp2 = 0.0;
-				for (work_node = 1; work_node < comm_size ; work_node++) {
+				for (work_node = 1; work_node < comm_size ; work_node++) 
+				{
 					tmp1 += 1.0 / (work_node + 1.0);
 					tmp2 += (nworkers - 1.0)*work_node*i;
 				}
diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c
index 8350352de6..1090bb708c 100644
--- a/mpi/src/starpu_mpi_task_insert.c
+++ b/mpi/src/starpu_mpi_task_insert.c
@@ -1149,7 +1149,8 @@ int starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 	return starpu_mpi_redux_data_prio_tree(comm, data_handle, prio, nb_contrib);
 }
 
-void _starpu_mpi_redux_wrapup_data(starpu_data_handle_t data_handle) {
+void _starpu_mpi_redux_wrapup_data(starpu_data_handle_t data_handle) 
+{
 	size_t data_size = starpu_data_get_size(data_handle);
  	// Small data => flat tree | binary tree
 	int _starpu_mpi_redux_tree_size = starpu_getenv_number_default("STARPU_MPI_REDUXTREE_SIZE", data_size < 1024 ? STARPU_MAXNODES : 2);
@@ -1164,7 +1165,8 @@ void _starpu_mpi_redux_wrapup_data(starpu_data_handle_t data_handle) {
 	return;
 }
 
-void _starpu_mpi_redux_wrapup_datas() {
+void _starpu_mpi_redux_wrapup_datas() 
+{
  	struct _starpu_redux_data_entry *entry = NULL, *tmp = NULL;
  	HASH_ITER(hh, _redux_data, entry, tmp)
 	{
-- 
GitLab


From bfa3c42c9bab0dcae1c62a4bc1d4215c2de35105 Mon Sep 17 00:00:00 2001
From: Antoine JEGO <antoine.jego@etu.enseeiht.fr>
Date: Wed, 23 Nov 2022 15:22:52 +0100
Subject: [PATCH 05/24] doc

---
 .../chapters/starpu_basics/data_management.doxy        |  9 +++++++++
 .../starpu_installation/environment_variables.doxy     | 10 ++++++++++
 2 files changed, 19 insertions(+)

diff --git a/doc/doxygen/chapters/starpu_basics/data_management.doxy b/doc/doxygen/chapters/starpu_basics/data_management.doxy
index 31aa035271..8e5ddbf7c9 100644
--- a/doc/doxygen/chapters/starpu_basics/data_management.doxy
+++ b/doc/doxygen/chapters/starpu_basics/data_management.doxy
@@ -490,6 +490,15 @@ for (i = 0; i < 100; i++)
 }
 \endcode
 
+starpu_mpi_redux_data() is called automatically in various cases, including
+when a task reading the reduced handle is inserted. The previous example could
+avoid calling starpu_mpi_redux_data. Default priority (0) is used. The
+reduction tree arity is decided based on the size of the data to reduce: a flat tree
+is used with a small data (less than 1024 bytes), a binary tree otherwise. If
+the environment variable STARPU_MPI_REDUX_TREESIZE is setup, then this value is
+read to decide the arity of the reduction tree. Distributed-memory reduction
+patterns are wrapped-up at the end of an application when calling starpu_mpi_wait_for_all().
+
 \section DataCommute Commute Data Access
 
 By default, the implicit dependencies computed from data access use the
diff --git a/doc/doxygen/chapters/starpu_installation/environment_variables.doxy b/doc/doxygen/chapters/starpu_installation/environment_variables.doxy
index d40c7d0685..2958bc79d7 100644
--- a/doc/doxygen/chapters/starpu_installation/environment_variables.doxy
+++ b/doc/doxygen/chapters/starpu_installation/environment_variables.doxy
@@ -656,6 +656,16 @@ This variable allows to enable (1) MPI GPUDirect support or not (0). The default
 1, StarPU-MPI will warn if MPI does not provide the GPUDirect support.
 </dd>
 
+<dt>STARPU_MPI_REDUXTREE_SIZE</dt>
+<dd>
+\anchor STARPU_MPI_REDUXTREE_SIZE
+\addindex __env__STARPU_MPI_REDUXTREE_SIZE
+This variable allows to tune the arity of the distributed-memory reduction trees 
+detected by StarPU. If not set, the arity of the tree depends on the size of the
+data to reduce: a flat tree is used if the data is smaller than 1024 bytes, a binary
+tree is used otherwise. 
+</dd>
+
 </dl>
 
 
-- 
GitLab


From bf9071df0469c0dfb72f7b2a9715c8813a602624 Mon Sep 17 00:00:00 2001
From: Antoine JEGO <antoine.jego@etu.enseeiht.fr>
Date: Wed, 23 Nov 2022 18:20:38 +0100
Subject: [PATCH 06/24] howto setup

---
 .../chapters/starpu_installation/environment_variables.doxy     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/doxygen/chapters/starpu_installation/environment_variables.doxy b/doc/doxygen/chapters/starpu_installation/environment_variables.doxy
index 2958bc79d7..789985f81f 100644
--- a/doc/doxygen/chapters/starpu_installation/environment_variables.doxy
+++ b/doc/doxygen/chapters/starpu_installation/environment_variables.doxy
@@ -663,7 +663,7 @@ This variable allows to enable (1) MPI GPUDirect support or not (0). The default
 This variable allows to tune the arity of the distributed-memory reduction trees 
 detected by StarPU. If not set, the arity of the tree depends on the size of the
 data to reduce: a flat tree is used if the data is smaller than 1024 bytes, a binary
-tree is used otherwise. 
+tree is used otherwise. When setting this variable, the same reasoning can be applied. 
 </dd>
 
 </dl>
-- 
GitLab


From 6ea5c9905d8af77a829228ba700dfa66cdc1a525 Mon Sep 17 00:00:00 2001
From: Antoine JEGO <antoine.jego@etu.enseeiht.fr>
Date: Thu, 24 Nov 2022 13:54:28 +0100
Subject: [PATCH 07/24] updated automatic arity selection

---
 .../environment_variables.doxy                | 20 +++++++++++--------
 mpi/src/starpu_mpi_task_insert.c              |  7 ++++++-
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/doc/doxygen/chapters/starpu_installation/environment_variables.doxy b/doc/doxygen/chapters/starpu_installation/environment_variables.doxy
index 789985f81f..1ca8b51bd2 100644
--- a/doc/doxygen/chapters/starpu_installation/environment_variables.doxy
+++ b/doc/doxygen/chapters/starpu_installation/environment_variables.doxy
@@ -656,14 +656,18 @@ This variable allows to enable (1) MPI GPUDirect support or not (0). The default
 1, StarPU-MPI will warn if MPI does not provide the GPUDirect support.
 </dd>
 
-<dt>STARPU_MPI_REDUXTREE_SIZE</dt>
-<dd>
-\anchor STARPU_MPI_REDUXTREE_SIZE
-\addindex __env__STARPU_MPI_REDUXTREE_SIZE
-This variable allows to tune the arity of the distributed-memory reduction trees 
-detected by StarPU. If not set, the arity of the tree depends on the size of the
-data to reduce: a flat tree is used if the data is smaller than 1024 bytes, a binary
-tree is used otherwise. When setting this variable, the same reasoning can be applied. 
+<dt>STARPU_MPI_REDUX_ARITY_THRESHOLD</dt>
+<dd>
+\anchor STARPU_MPI_REDUX_ARITY_THRESHOLD
+\addindex __env__STARPU_MPI_REDUX_ARITY_THRESHOLD
+The arity of the automatically detected reduction trees follows the following
+rule: when the data to be reduced is of small size a flat tree is unrolled
+i.e. all the contributing nodes send their contribution to the root of
+the reduction. When the data to be reduced is of big size, a binary tree is used instead.
+The default threshold between flat and binary tree is 1024 bytes. By setting this
+value to be negative, all the automatically detected reduction trees will use flat trees.
+If this value is set to 0, then binary trees will always be selected. Otherwise,
+the setup value replaces the default 1024.  
 </dd>
 
 </dl>
diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c
index 1090bb708c..dd311101a7 100644
--- a/mpi/src/starpu_mpi_task_insert.c
+++ b/mpi/src/starpu_mpi_task_insert.c
@@ -1153,7 +1153,12 @@ void _starpu_mpi_redux_wrapup_data(starpu_data_handle_t data_handle)
 {
 	size_t data_size = starpu_data_get_size(data_handle);
  	// Small data => flat tree | binary tree
-	int _starpu_mpi_redux_tree_size = starpu_getenv_number_default("STARPU_MPI_REDUXTREE_SIZE", data_size < 1024 ? STARPU_MAXNODES : 2);
+	int _starpu_mpi_redux_threshold = starpu_getenv_number_default("STARPU_MPI_REDUX_ARITY_THRESHOLD", 1024);
+	int _starpu_mpi_redux_tree_size = 2;
+	if (_starpu_mpi_redux_threshold < 0 || (_starpu_mpi_redux_threshold > 0 && data_size < _starpu_mpi_redux_threshold))
+	{
+		_starpu_mpi_redux_tree_size = STARPU_MAXNODES;
+	}
 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 	struct _starpu_redux_data_entry *entry;
 
-- 
GitLab


From fb88b4920ef6d77a2f497e5d36a462a2aa983167 Mon Sep 17 00:00:00 2001
From: Antoine JEGO <antoine.jego@etu.enseeiht.fr>
Date: Fri, 25 Nov 2022 00:07:57 +0100
Subject: [PATCH 08/24] env variable doc

---
 .../chapters/starpu_basics/data_management.doxy     | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/doc/doxygen/chapters/starpu_basics/data_management.doxy b/doc/doxygen/chapters/starpu_basics/data_management.doxy
index 8e5ddbf7c9..b4f25bd8f4 100644
--- a/doc/doxygen/chapters/starpu_basics/data_management.doxy
+++ b/doc/doxygen/chapters/starpu_basics/data_management.doxy
@@ -492,12 +492,15 @@ for (i = 0; i < 100; i++)
 
 starpu_mpi_redux_data() is called automatically in various cases, including
 when a task reading the reduced handle is inserted. The previous example could
-avoid calling starpu_mpi_redux_data. Default priority (0) is used. The
+avoid calling starpu_mpi_redux_data(). Default priority (0) is used. The
 reduction tree arity is decided based on the size of the data to reduce: a flat tree
-is used with a small data (less than 1024 bytes), a binary tree otherwise. If
-the environment variable STARPU_MPI_REDUX_TREESIZE is setup, then this value is
-read to decide the arity of the reduction tree. Distributed-memory reduction
-patterns are wrapped-up at the end of an application when calling starpu_mpi_wait_for_all().
+is used with a small data (default to less than 1024 bytes), a binary tree otherwise. If
+the environment variable \ref STARPU_MPI_REDUX_ARITY_THRESHOLD is setup, the threshold
+between the size of a small data and a bigger data is modified. If the value is setup
+to be negative, flat trees will always be used. If the value is setup to 0, binary
+trees are used. Otherwise, the size of the data is compared to the size in the
+environment variable. Remaining distributed-memory reduction patterns are wrapped-up 
+at the end of an application when calling starpu_mpi_wait_for_all().
 
 \section DataCommute Commute Data Access
 
-- 
GitLab


From 9314cf25c9e3edade62b0b336345c1a3bcc6be8d Mon Sep 17 00:00:00 2001
From: Antoine JEGO <antoine.jego@etu.enseeiht.fr>
Date: Fri, 18 Nov 2022 19:17:19 +0100
Subject: [PATCH 09/24] initial commit

---
 mpi/examples/mpi_redux/mpi_redux_autowrapup.c | 28 ++++-----
 mpi/src/starpu_mpi_task_insert.c              | 57 +++++++++++--------
 2 files changed, 47 insertions(+), 38 deletions(-)

diff --git a/mpi/examples/mpi_redux/mpi_redux_autowrapup.c b/mpi/examples/mpi_redux/mpi_redux_autowrapup.c
index 814725aea6..5dee0344e5 100644
--- a/mpi/examples/mpi_redux/mpi_redux_autowrapup.c
+++ b/mpi/examples/mpi_redux/mpi_redux_autowrapup.c
@@ -183,22 +183,22 @@ int main(int argc, char *argv[])
 				for (j=1;j<=work_coef*nworkers;j++)
 				{
 					if (i == 0)
-					    starpu_mpi_task_insert(MPI_COMM_WORLD,
-						&mpi_work_cl,
-						task_mode, a_h,
-						STARPU_R, b_h[work_node],
-						STARPU_EXECUTE_ON_NODE, work_node,
-						0);
+						starpu_mpi_task_insert(MPI_COMM_WORLD,
+								&mpi_work_cl,
+								task_mode, a_h,
+								STARPU_R, b_h[work_node],
+								STARPU_EXECUTE_ON_NODE, work_node,
+								0);
 					else
-					    starpu_mpi_task_insert(MPI_COMM_WORLD,
-						&work_cl,
-						task_mode, a_h,
-						STARPU_R, b_h[work_node],
-						STARPU_EXECUTE_ON_NODE, work_node,
-						0);
+						starpu_mpi_task_insert(MPI_COMM_WORLD,
+								&work_cl,
+								task_mode, a_h,
+								STARPU_R, b_h[work_node],
+								STARPU_EXECUTE_ON_NODE, work_node,
+								0);
 				}
 			}
-  			if (wrapup == 0) 
+			if (wrapup == 0) 
 			{
 				ret = starpu_mpi_redux_data(MPI_COMM_WORLD, a_h);
 				STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_redux_data");
@@ -206,7 +206,7 @@ int main(int argc, char *argv[])
 			else if (wrapup == 1)
 			{
 				starpu_mpi_task_insert(MPI_COMM_WORLD,
-				    &read_cl, STARPU_R, a_h, 0);
+						&read_cl, STARPU_R, a_h, 0);
 			}
 			starpu_mpi_wait_for_all(MPI_COMM_WORLD);
 			starpu_mpi_barrier(MPI_COMM_WORLD);
diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c
index dd311101a7..71052dffa9 100644
--- a/mpi/src/starpu_mpi_task_insert.c
+++ b/mpi/src/starpu_mpi_task_insert.c
@@ -620,6 +620,30 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
 	if (ret < 0)
 		return ret;
 
+	_STARPU_TRACE_TASK_MPI_PRE_START();
+	/* Send and receive data as requested */
+	for(i=0 ; i<nb_data ; i++)
+	{
+		struct _starpu_mpi_data *mpi_data = (struct _starpu_mpi_data *) descrs[i].handle->mpi_data;
+		if (mpi_data->redux_map != NULL  && descrs[i].mode & STARPU_R && ( descrs[i].mode & ~ STARPU_REDUX || descrs[i].mode & ~ STARPU_MPI_REDUX ))
+		{
+			_starpu_mpi_redux_wrapup_data(descrs[i].handle);			
+		}
+		_starpu_mpi_exchange_data_before_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm);
+	}
+
+	if (xrank_p)
+		*xrank_p = xrank;
+	if (nb_data_p)
+		*nb_data_p = nb_data;
+	if (prio_p)
+		*prio_p = prio;
+
+	if (descrs_p)
+		*descrs_p = descrs;
+	else
+		free(descrs);
+
 	if (do_execute == 1)
 	{
 		va_list varg_list_copy;
@@ -659,30 +683,6 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
 		}
 	}
 
-	_STARPU_TRACE_TASK_MPI_PRE_START();
-	/* Send and receive data as requested */
-	for(i=0 ; i<nb_data ; i++)
-	{
-		char* redux_map = starpu_mpi_data_get_redux_map(descrs[i].handle);
-		if (redux_map != NULL  && descrs[i].mode & STARPU_R && ( descrs[i].mode & ~ STARPU_REDUX || descrs[i].mode & ~ STARPU_MPI_REDUX ))
-		{
-			_starpu_mpi_redux_wrapup_data(descrs[i].handle);			
-		}
-		_starpu_mpi_exchange_data_before_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm);
-	}
-
-	if (xrank_p)
-		*xrank_p = xrank;
-	if (nb_data_p)
-		*nb_data_p = nb_data;
-	if (prio_p)
-		*prio_p = prio;
-
-	if (descrs_p)
-		*descrs_p = descrs;
-	else
-		free(descrs);
-
 	_STARPU_TRACE_TASK_MPI_PRE_END();
 
 	return do_execute;
@@ -714,6 +714,15 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struc
 			}
 			mpi_data->redux_map [xrank] = 1;
 			mpi_data->redux_map [rrank] = 1;
+			struct _starpu_redux_data_entry *entry;
+			HASH_FIND_PTR(_redux_data, &descrs[i].handle, entry);
+			if (entry == NULL)
+			{
+				_STARPU_MPI_MALLOC(entry, sizeof(*entry));
+				starpu_data_handle_t data_handle = descrs[i].handle;
+				entry->data_handle = data_handle;
+				HASH_ADD_PTR(_redux_data, data_handle, entry);
+			}
 		}
 		_starpu_mpi_exchange_data_after_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm);
 		_starpu_mpi_clear_data_after_execution(descrs[i].handle, descrs[i].mode, me, do_execute);
-- 
GitLab


From 9022a65507a3de551ad2538aa731baae495e911e Mon Sep 17 00:00:00 2001
From: Antoine JEGO <antoine.jego@etu.enseeiht.fr>
Date: Sun, 20 Nov 2022 15:14:35 +0100
Subject: [PATCH 10/24] free/del_entry is clearer ; fix mpi_reduction test to
 actually use all nodes based on ownership of handles

---
 mpi/src/starpu_mpi_task_insert.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c
index 71052dffa9..fd828f91b5 100644
--- a/mpi/src/starpu_mpi_task_insert.c
+++ b/mpi/src/starpu_mpi_task_insert.c
@@ -718,11 +718,15 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struc
 			HASH_FIND_PTR(_redux_data, &descrs[i].handle, entry);
 			if (entry == NULL)
 			{
+				_STARPU_CALLOC(mpi_data->redux_map, size, sizeof(mpi_data->redux_map[0]));
+				struct _starpu_redux_data_entry *entry;
 				_STARPU_MPI_MALLOC(entry, sizeof(*entry));
 				starpu_data_handle_t data_handle = descrs[i].handle;
 				entry->data_handle = data_handle;
 				HASH_ADD_PTR(_redux_data, data_handle, entry);
 			}
+			mpi_data->redux_map [xrank] = 1;
+			mpi_data->redux_map [rrank] = 1;
 		}
 		_starpu_mpi_exchange_data_after_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm);
 		_starpu_mpi_clear_data_after_execution(descrs[i].handle, descrs[i].mode, me, do_execute);
-- 
GitLab


From 0fa4485518a0794266437828ba1804ab9b9dec71 Mon Sep 17 00:00:00 2001
From: Antoine JEGO <antoine.jego@etu.enseeiht.fr>
Date: Sun, 20 Nov 2022 18:50:25 +0100
Subject: [PATCH 11/24] get_redux_map accounts for NULL handle

---
 mpi/src/starpu_mpi_task_insert.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c
index fd828f91b5..e31317b1c3 100644
--- a/mpi/src/starpu_mpi_task_insert.c
+++ b/mpi/src/starpu_mpi_task_insert.c
@@ -624,8 +624,8 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
 	/* Send and receive data as requested */
 	for(i=0 ; i<nb_data ; i++)
 	{
-		struct _starpu_mpi_data *mpi_data = (struct _starpu_mpi_data *) descrs[i].handle->mpi_data;
-		if (mpi_data->redux_map != NULL  && descrs[i].mode & STARPU_R && ( descrs[i].mode & ~ STARPU_REDUX || descrs[i].mode & ~ STARPU_MPI_REDUX ))
+		char* redux_map = starpu_mpi_data_get_redux_map(descrs[i].handle);
+		if (redux_map != NULL  && descrs[i].mode & STARPU_R && ( descrs[i].mode & ~ STARPU_REDUX || descrs[i].mode & ~ STARPU_MPI_REDUX ))
 		{
 			_starpu_mpi_redux_wrapup_data(descrs[i].handle);			
 		}
-- 
GitLab


From cbd223e14455fd2b4a867971822f9606304a66da Mon Sep 17 00:00:00 2001
From: Antoine JEGO <antoine.jego@etu.enseeiht.fr>
Date: Sat, 26 Nov 2022 15:56:22 +0100
Subject: [PATCH 12/24] int -> size_t relevant

---
 mpi/src/starpu_mpi_task_insert.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c
index e31317b1c3..b8b1e3d321 100644
--- a/mpi/src/starpu_mpi_task_insert.c
+++ b/mpi/src/starpu_mpi_task_insert.c
@@ -1166,7 +1166,7 @@ void _starpu_mpi_redux_wrapup_data(starpu_data_handle_t data_handle)
 {
 	size_t data_size = starpu_data_get_size(data_handle);
  	// Small data => flat tree | binary tree
-	int _starpu_mpi_redux_threshold = starpu_getenv_number_default("STARPU_MPI_REDUX_ARITY_THRESHOLD", 1024);
+	size_t _starpu_mpi_redux_threshold = starpu_getenv_number_default("STARPU_MPI_REDUX_ARITY_THRESHOLD", 1024);
 	int _starpu_mpi_redux_tree_size = 2;
 	if (_starpu_mpi_redux_threshold < 0 || (_starpu_mpi_redux_threshold > 0 && data_size < _starpu_mpi_redux_threshold))
 	{
-- 
GitLab


From 766d67554d8d4763b917c98fd06052ae3cac17cd Mon Sep 17 00:00:00 2001
From: Antoine JEGO <antoine.jego@etu.enseeiht.fr>
Date: Thu, 1 Dec 2022 16:43:29 +0100
Subject: [PATCH 13/24] updated wrt samuel's review

---
 .../starpu_basics/data_management.doxy        | 32 ++++++++++---------
 .../starpu_extensions/mpi_support.doxy        |  7 ++--
 mpi/src/starpu_mpi.c                          |  8 ++---
 3 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/doc/doxygen/chapters/starpu_basics/data_management.doxy b/doc/doxygen/chapters/starpu_basics/data_management.doxy
index b4f25bd8f4..dfc0038417 100644
--- a/doc/doxygen/chapters/starpu_basics/data_management.doxy
+++ b/doc/doxygen/chapters/starpu_basics/data_management.doxy
@@ -473,12 +473,14 @@ leading to yet more relaxed dependencies and more parallelism.
 
 ::STARPU_REDUX can also be passed to starpu_mpi_task_insert() in the MPI
 case. This will however not produce any MPI communication, but just pass
-::STARPU_REDUX to the underlying starpu_task_insert(). It is up to the
-application to call starpu_mpi_redux_data(), which posts tasks which will
-reduce the partial results among MPI nodes into the MPI node which owns the
-data. For instance, some hypothetical application which collects partial results
+::STARPU_REDUX to the underlying starpu_task_insert(). starpu_mpi_redux_data()
+posts tasks which will reduce the partial results among MPI nodes into the MPI 
+node which owns the data. The function can be called by the user to benefit from
+fine-tuning such as priority setting. If the user does not call this function, 
+StarPU wraps up reduction patterns automatically. The following example
+shows a hypothetical application which collects partial results
 into data <c>res</c>, then uses it for other computation, before looping again
-with a new reduction:
+with a new reduction where the wrap-up of the reduction pattern is explicit:
 
 \code{.c}
 for (i = 0; i < 100; i++)
@@ -491,16 +493,16 @@ for (i = 0; i < 100; i++)
 \endcode
 
 starpu_mpi_redux_data() is called automatically in various cases, including
-when a task reading the reduced handle is inserted. The previous example could
-avoid calling starpu_mpi_redux_data(). Default priority (0) is used. The
-reduction tree arity is decided based on the size of the data to reduce: a flat tree
-is used with a small data (default to less than 1024 bytes), a binary tree otherwise. If
-the environment variable \ref STARPU_MPI_REDUX_ARITY_THRESHOLD is setup, the threshold
-between the size of a small data and a bigger data is modified. If the value is setup
-to be negative, flat trees will always be used. If the value is setup to 0, binary
-trees are used. Otherwise, the size of the data is compared to the size in the
-environment variable. Remaining distributed-memory reduction patterns are wrapped-up 
-at the end of an application when calling starpu_mpi_wait_for_all().
+when a task reading the reduced handle is inserted through starpu_mpi_task_insert(). 
+The previous example could avoid calling starpu_mpi_redux_data(). Default priority (0) 
+is used. The reduction tree arity is decided based on the size of the data to reduce: a 
+flat tree is used with a small data (default to less than 1024 bytes), a binary tree 
+otherwise. If the environment variable \ref STARPU_MPI_REDUX_ARITY_THRESHOLD is setup, the 
+threshold between the size of a small data and a bigger data is modified. If the value is
+setup to be negative, flat trees will always be used. If the value is setup to 0, binary 
+trees are used. Otherwise, the size of the data is compared to the size in the environment 
+variable. Remaining distributed-memory reduction patterns are wrapped-up at the end of an 
+application when calling starpu_mpi_wait_for_all().
 
 \section DataCommute Commute Data Access
 
diff --git a/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy b/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy
index 5181e291ba..d33fe10009 100644
--- a/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy
+++ b/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy
@@ -824,9 +824,10 @@ tells StarPU to spawn only one contribution per contributing node.
 
 The setup and use of \c STARPU_MPI_REDUX is similar to \c STARPU_REDUX : the initialization and
 reduction codelets should be declared through starpu_data_set_reduction_methods() in the
-same fashion as \c STARPU_REDUX. The function starpu_mpi_redux_data() has to be called
-when all tasks contributing to the reduction have been inserted so that the reduction tree
-can be submitted. Tasks contributing to the inter-node reduction should be registered as
+same fashion as \c STARPU_REDUX. The function starpu_mpi_redux_data() is automatically called 
+either when a task reading the reduced handle is inserted through the MPI layer of StarPU through
+starpu_mpi_insert_task() or when the user waits for all communications and tasks to be executed through starpu_mpi_wait_for_all().
+Tasks contributing to the inter-node reduction should be registered as
 accessing the contribution through \c STARPU_RW|STARPU_COMMUTE mode, as for the
 \c STARPU_REDUX mode, as in the following example.
 
diff --git a/mpi/src/starpu_mpi.c b/mpi/src/starpu_mpi.c
index d9552458af..91bdc0aec7 100644
--- a/mpi/src/starpu_mpi.c
+++ b/mpi/src/starpu_mpi.c
@@ -453,10 +453,7 @@ void _starpu_mpi_data_clear(starpu_data_handle_t data_handle)
 	_mpi_backend._starpu_mpi_backend_data_clear(data_handle);
 	_starpu_mpi_cache_data_clear(data_handle);
 	_starpu_spin_destroy(&data->coop_lock);
-	if (data->redux_map != NULL)
-	{
-		free(data->redux_map);
-	}
+	free(data->redux_map);
 	data->redux_map = NULL;
 	free(data);
 }
@@ -530,6 +527,7 @@ char* starpu_mpi_data_get_redux_map(starpu_data_handle_t data)
 	{
 		return NULL;
 	}
+	STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
 	struct _starpu_mpi_data *mpi_data = _starpu_mpi_data_get(data);
 	return mpi_data->redux_map;
 }
@@ -653,6 +651,8 @@ void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t data, int new_r
 
 int starpu_mpi_wait_for_all(MPI_Comm comm)
 {
+	/* If the user forgets to call mpi_redux_data or insert R tasks on the reduced handles */
+	/* then, we wrap reduction patterns for them. This is typical of benchmarks */
 	_starpu_mpi_redux_wrapup_datas();
 	return _mpi_backend._starpu_mpi_backend_wait_for_all(comm);
 }
-- 
GitLab


From e300dbc80b639623c3265388b8dd9b0d5bc0b712 Mon Sep 17 00:00:00 2001
From: Antoine JEGO <antoine.jego@etu.enseeiht.fr>
Date: Thu, 1 Dec 2022 18:45:10 +0100
Subject: [PATCH 14/24] fix redux_map use

---
 mpi/src/starpu_mpi.c             |  3 +--
 mpi/src/starpu_mpi_task_insert.c | 11 +----------
 2 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/mpi/src/starpu_mpi.c b/mpi/src/starpu_mpi.c
index 91bdc0aec7..300f79a2a3 100644
--- a/mpi/src/starpu_mpi.c
+++ b/mpi/src/starpu_mpi.c
@@ -523,11 +523,10 @@ starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t data)
 
 char* starpu_mpi_data_get_redux_map(starpu_data_handle_t data)
 {
-	if (data == NULL)
+	if (data == NULL || data->mpi_data == NULL)
 	{
 		return NULL;
 	}
-	STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
 	struct _starpu_mpi_data *mpi_data = _starpu_mpi_data_get(data);
 	return mpi_data->redux_map;
 }
diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c
index b8b1e3d321..2ef727fbaf 100644
--- a/mpi/src/starpu_mpi_task_insert.c
+++ b/mpi/src/starpu_mpi_task_insert.c
@@ -624,11 +624,6 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
 	/* Send and receive data as requested */
 	for(i=0 ; i<nb_data ; i++)
 	{
-		char* redux_map = starpu_mpi_data_get_redux_map(descrs[i].handle);
-		if (redux_map != NULL  && descrs[i].mode & STARPU_R && ( descrs[i].mode & ~ STARPU_REDUX || descrs[i].mode & ~ STARPU_MPI_REDUX ))
-		{
-			_starpu_mpi_redux_wrapup_data(descrs[i].handle);			
-		}
 		_starpu_mpi_exchange_data_before_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm);
 	}
 
@@ -719,7 +714,6 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struc
 			if (entry == NULL)
 			{
 				_STARPU_CALLOC(mpi_data->redux_map, size, sizeof(mpi_data->redux_map[0]));
-				struct _starpu_redux_data_entry *entry;
 				_STARPU_MPI_MALLOC(entry, sizeof(*entry));
 				starpu_data_handle_t data_handle = descrs[i].handle;
 				entry->data_handle = data_handle;
@@ -1118,10 +1112,7 @@ int starpu_mpi_redux_data_prio_tree(MPI_Comm comm, starpu_data_handle_t data_han
 		HASH_DEL(_redux_data, entry);
 		free(entry);
 	}
-        if (mpi_data->redux_map != NULL) // it should be allocated at this point anyway
-	{
-	        free(mpi_data->redux_map);
-	}
+	free(mpi_data->redux_map);
 	mpi_data->redux_map = NULL;
 	return 0;
 }
-- 
GitLab


From 9b2a19ef385ac1b8b5156dc7baf36124a3661aba Mon Sep 17 00:00:00 2001
From: Antoine JEGO <antoine.jego@etu.enseeiht.fr>
Date: Thu, 1 Dec 2022 19:13:30 +0100
Subject: [PATCH 15/24] fix redux before exchange data

---
 mpi/src/starpu_mpi_task_insert.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c
index 2ef727fbaf..7001addd35 100644
--- a/mpi/src/starpu_mpi_task_insert.c
+++ b/mpi/src/starpu_mpi_task_insert.c
@@ -624,9 +624,15 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
 	/* Send and receive data as requested */
 	for(i=0 ; i<nb_data ; i++)
 	{
+		char* redux_map = starpu_mpi_data_get_redux_map(descrs[i].handle);
+		if (redux_map != NULL  && descrs[i].mode & STARPU_R && ( descrs[i].mode & ~ STARPU_REDUX || descrs[i].mode & ~ STARPU_MPI_REDUX ))
+		{
+			_starpu_mpi_redux_wrapup_data(descrs[i].handle);
+		}
 		_starpu_mpi_exchange_data_before_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm);
 	}
 
+
 	if (xrank_p)
 		*xrank_p = xrank;
 	if (nb_data_p)
-- 
GitLab


From 2ba8df21e16fe3581aa459f5e69d2c63443d3a11 Mon Sep 17 00:00:00 2001
From: Antoine JEGO <antoine.jego@etu.enseeiht.fr>
Date: Thu, 1 Dec 2022 19:15:46 +0100
Subject: [PATCH 16/24] size_t -> int but cast if int is unsigned

---
 mpi/src/starpu_mpi_task_insert.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c
index 7001addd35..2221659215 100644
--- a/mpi/src/starpu_mpi_task_insert.c
+++ b/mpi/src/starpu_mpi_task_insert.c
@@ -1163,9 +1163,9 @@ void _starpu_mpi_redux_wrapup_data(starpu_data_handle_t data_handle)
 {
 	size_t data_size = starpu_data_get_size(data_handle);
  	// Small data => flat tree | binary tree
-	size_t _starpu_mpi_redux_threshold = starpu_getenv_number_default("STARPU_MPI_REDUX_ARITY_THRESHOLD", 1024);
+	int _starpu_mpi_redux_threshold = starpu_getenv_number_default("STARPU_MPI_REDUX_ARITY_THRESHOLD", 1024);
 	int _starpu_mpi_redux_tree_size = 2;
-	if (_starpu_mpi_redux_threshold < 0 || (_starpu_mpi_redux_threshold > 0 && data_size < _starpu_mpi_redux_threshold))
+	if (_starpu_mpi_redux_threshold < 0 || (_starpu_mpi_redux_threshold > 0 && data_size < (size_t) _starpu_mpi_redux_threshold))
 	{
 		_starpu_mpi_redux_tree_size = STARPU_MAXNODES;
 	}
-- 
GitLab


From 6b97adfa1337c980ee2b201a64f89c77d4e86d0b Mon Sep 17 00:00:00 2001
From: Antoine JEGO <antoine.jego@etu.enseeiht.fr>
Date: Thu, 1 Dec 2022 23:05:51 +0100
Subject: [PATCH 17/24] exchange before, before free descrs

---
 mpi/src/starpu_mpi.c             |  8 ++------
 mpi/src/starpu_mpi_task_insert.c | 10 ++++++----
 mpi/tests/policy_register.c      |  1 -
 3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/mpi/src/starpu_mpi.c b/mpi/src/starpu_mpi.c
index 300f79a2a3..5bb0e5e060 100644
--- a/mpi/src/starpu_mpi.c
+++ b/mpi/src/starpu_mpi.c
@@ -523,12 +523,8 @@ starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t data)
 
 char* starpu_mpi_data_get_redux_map(starpu_data_handle_t data)
 {
-	if (data == NULL || data->mpi_data == NULL)
-	{
-		return NULL;
-	}
-	struct _starpu_mpi_data *mpi_data = _starpu_mpi_data_get(data);
-	return mpi_data->redux_map;
+	STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
+	return ((struct _starpu_mpi_data *)(data->mpi_data))->redux_map;
 }
 
 int starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
diff --git a/mpi/src/starpu_mpi_task_insert.c b/mpi/src/starpu_mpi_task_insert.c
index 2221659215..12a6b16d52 100644
--- a/mpi/src/starpu_mpi_task_insert.c
+++ b/mpi/src/starpu_mpi_task_insert.c
@@ -624,15 +624,17 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
 	/* Send and receive data as requested */
 	for(i=0 ; i<nb_data ; i++)
 	{
-		char* redux_map = starpu_mpi_data_get_redux_map(descrs[i].handle);
-		if (redux_map != NULL  && descrs[i].mode & STARPU_R && ( descrs[i].mode & ~ STARPU_REDUX || descrs[i].mode & ~ STARPU_MPI_REDUX ))
+                if (descrs[i].handle && descrs[i].handle->mpi_data)
 		{
-			_starpu_mpi_redux_wrapup_data(descrs[i].handle);
+			char* redux_map = starpu_mpi_data_get_redux_map(descrs[i].handle);
+			if (redux_map != NULL && descrs[i].mode & STARPU_R && ( descrs[i].mode & ~ STARPU_REDUX || descrs[i].mode & ~ STARPU_MPI_REDUX ))
+			{
+				_starpu_mpi_redux_wrapup_data(descrs[i].handle);
+			}
 		}
 		_starpu_mpi_exchange_data_before_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm);
 	}
 
-
 	if (xrank_p)
 		*xrank_p = xrank;
 	if (nb_data_p)
diff --git a/mpi/tests/policy_register.c b/mpi/tests/policy_register.c
index 6b7af3c187..6549f74555 100644
--- a/mpi/tests/policy_register.c
+++ b/mpi/tests/policy_register.c
@@ -92,7 +92,6 @@ int main(int argc, char **argv)
 
 	policy = starpu_mpi_node_selection_register_policy(starpu_mpi_select_node_my_policy_1);
 	starpu_mpi_node_selection_set_current_policy(policy);
-
 	task = starpu_mpi_task_build(MPI_COMM_WORLD, &mycodelet,
 				     STARPU_W, handles[0], STARPU_W, handles[1],
 				     0);
-- 
GitLab


From 9f9799968eddb4e37795c650cef61575ac19e348 Mon Sep 17 00:00:00 2001
From: Antoine JEGO <antoine.jego@etu.enseeiht.fr>
Date: Fri, 2 Dec 2022 10:47:47 +0100
Subject: [PATCH 18/24] automatic/user precision

---
 doc/doxygen/chapters/starpu_extensions/mpi_support.doxy | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy b/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy
index d33fe10009..9ee4dd511e 100644
--- a/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy
+++ b/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy
@@ -826,7 +826,9 @@ The setup and use of \c STARPU_MPI_REDUX is similar to \c STARPU_REDUX : the ini
 reduction codelets should be declared through starpu_data_set_reduction_methods() in the
 same fashion as \c STARPU_REDUX. The function starpu_mpi_redux_data() is automatically called 
 either when a task reading the reduced handle is inserted through the MPI layer of StarPU through
-starpu_mpi_insert_task() or when the user waits for all communications and tasks to be executed through starpu_mpi_wait_for_all().
+starpu_mpi_insert_task() or when the user waits for all communications and tasks to be executed 
+through starpu_mpi_wait_for_all(). The function can be called by the user to fine-tune arguments
+such as the priority of the reduction tasks.
 Tasks contributing to the inter-node reduction should be registered as
 accessing the contribution through \c STARPU_RW|STARPU_COMMUTE mode, as for the
 \c STARPU_REDUX mode, as in the following example.
-- 
GitLab


From 84d9efc1d19bcb47b826fcb6fed01683e8493c56 Mon Sep 17 00:00:00 2001
From: Antoine JEGO <antoine.jego@etu.enseeiht.fr>
Date: Mon, 5 Dec 2022 14:06:02 +0100
Subject: [PATCH 19/24] fix link + revert policy_register

---
 doc/doxygen/chapters/starpu_extensions/mpi_support.doxy | 2 +-
 mpi/tests/policy_register.c                             | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy b/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy
index 9ee4dd511e..a2dad06d7e 100644
--- a/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy
+++ b/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy
@@ -822,7 +822,7 @@ core on contributing nodes spawns their own copy to work with. In the case that
 required reductions are too numerous and expensive, the access mode \c STARPU_MPI_REDUX
 tells StarPU to spawn only one contribution per contributing node.
 
-The setup and use of \c STARPU_MPI_REDUX is similar to \c STARPU_REDUX : the initialization and
+The setup and use of :: STARPU_MPI_REDUX is similar to :: STARPU_REDUX : the initialization and
 reduction codelets should be declared through starpu_data_set_reduction_methods() in the
 same fashion as \c STARPU_REDUX. The function starpu_mpi_redux_data() is automatically called 
 either when a task reading the reduced handle is inserted through the MPI layer of StarPU through
diff --git a/mpi/tests/policy_register.c b/mpi/tests/policy_register.c
index 6549f74555..6b7af3c187 100644
--- a/mpi/tests/policy_register.c
+++ b/mpi/tests/policy_register.c
@@ -92,6 +92,7 @@ int main(int argc, char **argv)
 
 	policy = starpu_mpi_node_selection_register_policy(starpu_mpi_select_node_my_policy_1);
 	starpu_mpi_node_selection_set_current_policy(policy);
+
 	task = starpu_mpi_task_build(MPI_COMM_WORLD, &mycodelet,
 				     STARPU_W, handles[0], STARPU_W, handles[1],
 				     0);
-- 
GitLab


From 6e54c9ae736a223d23e1ca0c7fba716df4173c1a Mon Sep 17 00:00:00 2001
From: Antoine JEGO <antoine.jego@etu.enseeiht.fr>
Date: Mon, 5 Dec 2022 16:23:00 +0100
Subject: [PATCH 20/24] ':: '->'::'

---
 doc/doxygen/chapters/starpu_extensions/mpi_support.doxy | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy b/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy
index a2dad06d7e..21805e7ed1 100644
--- a/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy
+++ b/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy
@@ -822,7 +822,7 @@ core on contributing nodes spawns their own copy to work with. In the case that
 required reductions are too numerous and expensive, the access mode \c STARPU_MPI_REDUX
 tells StarPU to spawn only one contribution per contributing node.
 
-The setup and use of :: STARPU_MPI_REDUX is similar to :: STARPU_REDUX : the initialization and
+The setup and use of ::STARPU_MPI_REDUX is similar to ::STARPU_REDUX : the initialization and
 reduction codelets should be declared through starpu_data_set_reduction_methods() in the
 same fashion as \c STARPU_REDUX. The function starpu_mpi_redux_data() is automatically called 
 either when a task reading the reduced handle is inserted through the MPI layer of StarPU through
-- 
GitLab


From b734b0768147ea73f71616ba12f32c80482026ff Mon Sep 17 00:00:00 2001
From: Nathalie Furmento <nathalie.furmento@labri.fr>
Date: Mon, 5 Dec 2022 16:29:59 +0100
Subject: [PATCH 21/24] doc: small typos

---
 .../chapters/starpu_extensions/mpi_support.doxy    | 14 +++++++-------
 .../starpu_installation/environment_variables.doxy |  8 ++++----
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy b/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy
index 21805e7ed1..eb50d92d15 100644
--- a/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy
+++ b/doc/doxygen/chapters/starpu_extensions/mpi_support.doxy
@@ -817,21 +817,21 @@ The data can then be used just like per-node above.
 \section MPIMpiRedux Inter-node reduction
 
 One might want to leverage a reduction pattern across several nodes.
-Using \c STARPU_REDUX (see \ref DataReduction), one can obtain such patterns where each
+Using ::STARPU_REDUX (see \ref DataReduction), one can obtain such patterns where each
 core on contributing nodes spawns their own copy to work with. In the case that the
-required reductions are too numerous and expensive, the access mode \c STARPU_MPI_REDUX
+required reductions are too numerous and expensive, the access mode ::STARPU_MPI_REDUX
 tells StarPU to spawn only one contribution per contributing node.
 
 The setup and use of ::STARPU_MPI_REDUX is similar to ::STARPU_REDUX : the initialization and
 reduction codelets should be declared through starpu_data_set_reduction_methods() in the
-same fashion as \c STARPU_REDUX. The function starpu_mpi_redux_data() is automatically called 
+same fashion as ::STARPU_REDUX. The function starpu_mpi_redux_data() is automatically called
 either when a task reading the reduced handle is inserted through the MPI layer of StarPU through
-starpu_mpi_insert_task() or when the user waits for all communications and tasks to be executed 
+starpu_mpi_insert_task() or when the user waits for all communications and tasks to be executed
 through starpu_mpi_wait_for_all(). The function can be called by the user to fine-tune arguments
 such as the priority of the reduction tasks.
 Tasks contributing to the inter-node reduction should be registered as
-accessing the contribution through \c STARPU_RW|STARPU_COMMUTE mode, as for the
-\c STARPU_REDUX mode, as in the following example.
+accessing the contribution through ::STARPU_RW|::STARPU_COMMUTE mode, as for the
+::STARPU_REDUX mode, as in the following example.
 
 \code{.c}
 static struct starpu_codelet contrib_cl =
@@ -853,7 +853,7 @@ starpu_mpi_task_insert(MPI_COMM_WORLD, &contrib_cl, STARPU_MPI_REDUX, data, STAR
 
 Note that if the specified node is set to \c -1, the option is ignored.
 
-More examples are available at mpi/examples/mpi_redux/mpi_redux.c and mpi/examples/mpi_redux/mpi_redux_tree.c.
+More examples are available at \c mpi/examples/mpi_redux/mpi_redux.c and \c mpi/examples/mpi_redux/mpi_redux_tree.c.
 
 \section MPIPriorities Priorities
 
diff --git a/doc/doxygen/chapters/starpu_installation/environment_variables.doxy b/doc/doxygen/chapters/starpu_installation/environment_variables.doxy
index 1ca8b51bd2..b234925dd7 100644
--- a/doc/doxygen/chapters/starpu_installation/environment_variables.doxy
+++ b/doc/doxygen/chapters/starpu_installation/environment_variables.doxy
@@ -660,14 +660,14 @@ This variable allows to enable (1) MPI GPUDirect support or not (0). The default
 <dd>
 \anchor STARPU_MPI_REDUX_ARITY_THRESHOLD
 \addindex __env__STARPU_MPI_REDUX_ARITY_THRESHOLD
-The arity of the automatically detected reduction trees follows the following
+The arity of the automatically-detected reduction trees obeys the following
 rule: when the data to be reduced is of small size a flat tree is unrolled
 i.e. all the contributing nodes send their contribution to the root of
 the reduction. When the data to be reduced is of big size, a binary tree is used instead.
-The default threshold between flat and binary tree is 1024 bytes. By setting this
-value to be negative, all the automatically detected reduction trees will use flat trees.
+The default threshold between flat and binary tree is 1024 bytes. By setting the environment
+variable to a negative value, all the automatically detected reduction trees will use flat trees.
 If this value is set to 0, then binary trees will always be selected. Otherwise,
-the setup value replaces the default 1024.  
+the setup value replaces the default 1024.
 </dd>
 
 </dl>
-- 
GitLab


From 633ef8e0ab51f28f104308eeebe00692eab87c1c Mon Sep 17 00:00:00 2001
From: Antoine JEGO <antoine.jego@etu.enseeiht.fr>
Date: Mon, 5 Dec 2022 16:46:37 +0100
Subject: [PATCH 22/24] changelog: small change _ automatic wrapup / small
 feature _ env var ARITY THRESHOLD

---
 ChangeLog | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index 68972eaf94..4580c1b26b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -130,6 +130,8 @@ Small features:
   * Add STARPU_SIMGRID_TASK_PUSH_COST environment variable.
   * Add starpu_memory_nodes_get_count_by_kind and
     starpu_memory_node_get_ids_by_type.
+  * Add STARPU_MPI_REDUX_ARITY_THRESHOLD to tune the type of three used in
+    distributed-memory reduction patterns that are automatically detected.
 
 Changes:
   * The redux codelet should expose the STARPU_COMMUTE flag, since StarPU
@@ -161,6 +163,8 @@ Small changes:
     unsigned to int, to explicit that it may be -1.
   * Value 0 for STARPU_MPI_NDETACHED_SEND and STARPU_MPI_NREADY_PROCESS will
     now disable their behaviour.
+  * Distrbuted-memory reduction patterns are automatically wrapped-up if the user
+    do not call starpu_mpi_redux_data
 
 StarPU 1.3.10
 ====================================================================
-- 
GitLab


From 1571eef57554dd0de2edc0a1b086a706269ef4c7 Mon Sep 17 00:00:00 2001
From: Antoine JEGO <antoine.jego@etu.enseeiht.fr>
Date: Mon, 5 Dec 2022 16:50:08 +0100
Subject: [PATCH 23/24] typo

---
 ChangeLog | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 4580c1b26b..7bba22233e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -130,7 +130,7 @@ Small features:
   * Add STARPU_SIMGRID_TASK_PUSH_COST environment variable.
   * Add starpu_memory_nodes_get_count_by_kind and
     starpu_memory_node_get_ids_by_type.
-  * Add STARPU_MPI_REDUX_ARITY_THRESHOLD to tune the type of three used in
+  * Add STARPU_MPI_REDUX_ARITY_THRESHOLD to tune the type of tree used in
     distributed-memory reduction patterns that are automatically detected.
 
 Changes:
@@ -163,8 +163,8 @@ Small changes:
     unsigned to int, to explicit that it may be -1.
   * Value 0 for STARPU_MPI_NDETACHED_SEND and STARPU_MPI_NREADY_PROCESS will
     now disable their behaviour.
-  * Distrbuted-memory reduction patterns are automatically wrapped-up if the user
-    do not call starpu_mpi_redux_data
+  * Distributed-memory reduction patterns are automatically wrapped up if the user
+    does not call starpu_mpi_redux_data()
 
 StarPU 1.3.10
 ====================================================================
-- 
GitLab


From dd047ad65b6935a6e1580d40bfe55a2889761f74 Mon Sep 17 00:00:00 2001
From: Nathalie Furmento <nathalie.furmento@labri.fr>
Date: Wed, 4 Jan 2023 12:47:09 +0100
Subject: [PATCH 24/24] mpi/examples/mpi_redux/: fix printing

---
 mpi/examples/mpi_redux/mpi_redux.c            | 16 ++++++-------
 mpi/examples/mpi_redux/mpi_redux_autowrapup.c | 24 +++++++++----------
 mpi/examples/mpi_redux/mpi_redux_tree.c       | 16 ++++++-------
 3 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/mpi/examples/mpi_redux/mpi_redux.c b/mpi/examples/mpi_redux/mpi_redux.c
index d203931bd3..bb445dff67 100644
--- a/mpi/examples/mpi_redux/mpi_redux.c
+++ b/mpi/examples/mpi_redux/mpi_redux.c
@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2016-2022  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2016-2023  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -40,9 +40,9 @@ static void cl_cpu_work(void *handles[], void*arg)
 	double *a = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
 	double *b = (double *)STARPU_VARIABLE_GET_PTR(handles[1]);
 	starpu_sleep(0.01);
-	printf("work_cl (rank:%d,worker:%d) %f =>",starpu_mpi_world_rank(), starpu_worker_get_id(), *a);
+	FPRINTF(stderr, "work_cl (rank:%d,worker:%d) %f =>",starpu_mpi_world_rank(), starpu_worker_get_id(), *a);
 	*a = 3.0 + *a + *b;
-	printf("%f\n",*a);
+	FPRINTF(stderr, "%f\n",*a);
 }
 
 static struct starpu_codelet work_cl =
@@ -66,7 +66,7 @@ static void cl_cpu_task_init(void *handles[], void*arg)
 	(void) arg;
 	double *a = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
 	starpu_sleep(0.005);
-	printf("init_cl (rank:%d,worker:%d) %d (was %f)\n", starpu_mpi_world_rank(), starpu_worker_get_id(), starpu_mpi_world_rank(),
+	FPRINTF(stderr, "init_cl (rank:%d,worker:%d) %d (was %f)\n", starpu_mpi_world_rank(), starpu_worker_get_id(), starpu_mpi_world_rank(),
 #ifdef STARPU_HAVE_VALGRIND_H
 			RUNNING_ON_VALGRIND ? 0. :
 #endif
@@ -88,7 +88,7 @@ static void cl_cpu_task_red(void *handles[], void*arg)
 	double *ad = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
 	double *as = (double *)STARPU_VARIABLE_GET_PTR(handles[1]);
 	starpu_sleep(0.01);
-	printf("red_cl (rank:%d,worker:%d) %f ; %f --> %f\n", starpu_mpi_world_rank(), starpu_worker_get_id(), *as, *ad, *as+*ad);
+	FPRINTF(stderr, "red_cl (rank:%d,worker:%d) %f ; %f --> %f\n", starpu_mpi_world_rank(), starpu_worker_get_id(), *as, *ad, *as+*ad);
 	*ad = *ad + *as;
 }
 
@@ -140,7 +140,7 @@ int main(int argc, char *argv[])
 		if (comm_rank == 0)
 		{
 			a = 1.0;
-			printf("init a = %f\n", a);
+			FPRINTF(stderr, "init a = %f\n", a);
 			starpu_variable_data_register(&a_h, STARPU_MAIN_RAM, (uintptr_t)&a, sizeof(double));
 			for (j=0;j<comm_size;j++)
 				starpu_variable_data_register(&b_h[j], -1, 0, sizeof(double));
@@ -148,7 +148,7 @@ int main(int argc, char *argv[])
 		else
 		{
 			b[comm_rank] = 1.0 / (comm_rank + 1.0);
-			printf("init b_%d = %f\n", comm_rank, b[comm_rank]);
+			FPRINTF(stderr, "init b_%d = %f\n", comm_rank, b[comm_rank]);
 			starpu_variable_data_register(&a_h, -1, 0, sizeof(double));
 			for (j=0;j<comm_size;j++)
 			{
@@ -197,7 +197,7 @@ int main(int argc, char *argv[])
 				tmp1 += 1.0 / (work_node + 1.0);
 				tmp2 += (nworkers - 1.0)*work_node*i;
 			}
-			printf("computed result ---> %f expected %f\n", a, 
+			FPRINTF(stderr, "computed result ---> %f expected %f\n", a, 
 				1.0 + (comm_size - 1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1)*3.0 + tmp1) + tmp2);
 		}
 		starpu_data_unregister(a_h);
diff --git a/mpi/examples/mpi_redux/mpi_redux_autowrapup.c b/mpi/examples/mpi_redux/mpi_redux_autowrapup.c
index 5dee0344e5..386c892c1f 100644
--- a/mpi/examples/mpi_redux/mpi_redux_autowrapup.c
+++ b/mpi/examples/mpi_redux/mpi_redux_autowrapup.c
@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2016-2022  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2016-2023  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,7 +20,7 @@
  * It iterates over multiple ways to wrap-up reduction patterns : either by
  * - waiting for all mpi + tasks
  * - calling mpi_redux yourself
- * - inserting a reading task on the handle to reduce 
+ * - inserting a reading task on the handle to reduce
  */
 
 #include <stdlib.h>
@@ -51,9 +51,9 @@ static void cl_cpu_work(void *handles[], void*arg)
 	double *a = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
 	double *b = (double *)STARPU_VARIABLE_GET_PTR(handles[1]);
 	starpu_sleep(0.01);
-	printf("work_cl (rank:%d,worker:%d) %f =>",starpu_mpi_world_rank(), starpu_worker_get_id(), *a);
+	FPRINTF(stderr, "work_cl (rank:%d,worker:%d) %f =>",starpu_mpi_world_rank(), starpu_worker_get_id(), *a);
 	*a = 3.0 + *a + *b;
-	printf("%f\n",*a);
+	FPRINTF(stderr, "%f\n",*a);
 }
 
 static struct starpu_codelet work_cl =
@@ -77,7 +77,7 @@ static void cl_cpu_task_init(void *handles[], void*arg)
 	(void) arg;
 	double *a = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
 	starpu_sleep(0.005);
-	printf("init_cl (rank:%d,worker:%d) %d (was %f)\n", starpu_mpi_world_rank(), starpu_worker_get_id(), starpu_mpi_world_rank(),
+	FPRINTF(stderr, "init_cl (rank:%d,worker:%d) %d (was %f)\n", starpu_mpi_world_rank(), starpu_worker_get_id(), starpu_mpi_world_rank(),
 #ifdef STARPU_HAVE_VALGRIND_H
 			RUNNING_ON_VALGRIND ? 0. :
 #endif
@@ -99,7 +99,7 @@ static void cl_cpu_task_red(void *handles[], void*arg)
 	double *ad = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
 	double *as = (double *)STARPU_VARIABLE_GET_PTR(handles[1]);
 	starpu_sleep(0.01);
-	printf("red_cl (rank:%d,worker:%d) %f ; %f --> %f\n", starpu_mpi_world_rank(), starpu_worker_get_id(), *as, *ad, *as+*ad);
+	FPRINTF(stderr, "red_cl (rank:%d,worker:%d) %f ; %f --> %f\n", starpu_mpi_world_rank(), starpu_worker_get_id(), *as, *ad, *as+*ad);
 	*ad = *ad + *as;
 }
 
@@ -126,7 +126,7 @@ int main(int argc, char *argv[])
 		starpu_mpi_shutdown();
 		return STARPU_TEST_SKIPPED;
 	}
-	printf("there are %d workers\n", nworkers);
+	FPRINTF(stderr, "there are %d workers\n", nworkers);
 	starpu_mpi_comm_size(MPI_COMM_WORLD, &comm_size);
 	if (comm_size < 2)
 	{
@@ -154,7 +154,7 @@ int main(int argc, char *argv[])
 			if (comm_rank == 0)
 			{
 				a = 1.0;
-				printf("init a = %f\n", a);
+				FPRINTF(stderr, "init a = %f\n", a);
 				starpu_variable_data_register(&a_h, STARPU_MAIN_RAM, (uintptr_t)&a, sizeof(double));
 				for (j=0;j<comm_size;j++)
 					starpu_variable_data_register(&b_h[j], -1, 0, sizeof(double));
@@ -162,7 +162,7 @@ int main(int argc, char *argv[])
 			else
 			{
 				b[comm_rank] = 1.0 / (comm_rank + 1.0);
-				printf("init b_%d = %f\n", comm_rank, b[comm_rank]);
+				FPRINTF(stderr, "init b_%d = %f\n", comm_rank, b[comm_rank]);
 				starpu_variable_data_register(&a_h, -1, 0, sizeof(double));
 				for (j=0;j<comm_size;j++)
 				{
@@ -198,7 +198,7 @@ int main(int argc, char *argv[])
 								0);
 				}
 			}
-			if (wrapup == 0) 
+			if (wrapup == 0)
 			{
 				ret = starpu_mpi_redux_data(MPI_COMM_WORLD, a_h);
 				STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_redux_data");
@@ -214,12 +214,12 @@ int main(int argc, char *argv[])
 			{
 				double tmp1 = 0.0;
 				double tmp2 = 0.0;
-				for (work_node = 1; work_node < comm_size ; work_node++) 
+				for (work_node = 1; work_node < comm_size ; work_node++)
 				{
 					tmp1 += 1.0 / (work_node + 1.0);
 					tmp2 += (nworkers - 1.0)*work_node*i;
 				}
-				printf("computed result ---> %f expected %f\n", a, 
+				FPRINTF(stderr, "computed result ---> %f expected %f\n", a,
 					1.0 + (comm_size - 1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1)*3.0 + tmp1) + tmp2);
 			}
 			starpu_data_unregister(a_h);
diff --git a/mpi/examples/mpi_redux/mpi_redux_tree.c b/mpi/examples/mpi_redux/mpi_redux_tree.c
index 0dc301f3ae..7588d97a85 100644
--- a/mpi/examples/mpi_redux/mpi_redux_tree.c
+++ b/mpi/examples/mpi_redux/mpi_redux_tree.c
@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2016-2022  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2016-2023  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -40,9 +40,9 @@ static void cl_cpu_work(void *handles[], void*arg)
 	double *a = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
 	double *b = (double *)STARPU_VARIABLE_GET_PTR(handles[1]);
 	starpu_sleep(0.01);
-	printf("work_cl (rank:%d,worker:%d) %f =>",starpu_mpi_world_rank(), starpu_worker_get_id(), *a);
+	FPRINTF(stderr, "work_cl (rank:%d,worker:%d) %f =>",starpu_mpi_world_rank(), starpu_worker_get_id(), *a);
 	*a = 3.0 + *a + *b;
-	printf("%f\n",*a);
+	FPRINTF(stderr, "%f\n",*a);
 }
 
 static struct starpu_codelet work_cl =
@@ -66,7 +66,7 @@ static void cl_cpu_task_init(void *handles[], void*arg)
 	(void) arg;
 	double *a = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
 	starpu_sleep(0.005);
-	printf("init_cl (rank:%d,worker:%d) %d (was %f)\n", starpu_mpi_world_rank(), starpu_worker_get_id(), starpu_mpi_world_rank(),
+	FPRINTF(stderr, "init_cl (rank:%d,worker:%d) %d (was %f)\n", starpu_mpi_world_rank(), starpu_worker_get_id(), starpu_mpi_world_rank(),
 #ifdef STARPU_HAVE_VALGRIND_H
 			RUNNING_ON_VALGRIND ? 0. :
 #endif
@@ -88,7 +88,7 @@ static void cl_cpu_task_red(void *handles[], void*arg)
 	double *ad = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
 	double *as = (double *)STARPU_VARIABLE_GET_PTR(handles[1]);
 	starpu_sleep(0.01);
-	printf("red_cl (rank:%d,worker:%d) %f ; %f --> %f\n", starpu_mpi_world_rank(), starpu_worker_get_id(), *as, *ad, *as+*ad);
+	FPRINTF(stderr, "red_cl (rank:%d,worker:%d) %f ; %f --> %f\n", starpu_mpi_world_rank(), starpu_worker_get_id(), *as, *ad, *as+*ad);
 	*ad = *ad + *as;
 }
 
@@ -131,7 +131,7 @@ int main(int argc, char *argv[])
 		if (comm_rank == 0)
 		{
 			a = 1.0;
-			printf("init a = %f\n", a);
+			FPRINTF(stderr, "init a = %f\n", a);
 			starpu_variable_data_register(&a_h, STARPU_MAIN_RAM, (uintptr_t)&a, sizeof(double));
 			for (j=0;j<comm_size;j++)
 				starpu_variable_data_register(&b_h[j], -1, 0, sizeof(double));
@@ -139,7 +139,7 @@ int main(int argc, char *argv[])
 		else
 		{
 			b[comm_rank] = 1.0 / (comm_rank + 1.0);
-			printf("init b_%d = %f\n", comm_rank, b[comm_rank]);
+			FPRINTF(stderr, "init b_%d = %f\n", comm_rank, b[comm_rank]);
 			starpu_variable_data_register(&a_h, -1, 0, sizeof(double));
 			for (j=0;j<comm_size;j++)
 			{
@@ -176,7 +176,7 @@ int main(int argc, char *argv[])
 			double tmp = 0.0;
 			for (work_node = 1; work_node < comm_size ; work_node++)
 				tmp += 1.0 / (work_node + 1.0);
-			printf("computed result ---> %f expected %f\n", a, 1.0 + (comm_size - 1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1)*3.0 + tmp));
+			FPRINTF(stderr, "computed result ---> %f expected %f\n", a, 1.0 + (comm_size - 1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1)*3.0 + tmp));
 		}
 		starpu_data_unregister(a_h);
 		for (work_node=0; work_node < comm_size;work_node++)
-- 
GitLab