From 16b7c4ef5ba6d4459403d219cf93c6be523b3280 Mon Sep 17 00:00:00 2001
From: Florent Pruvost <florent.pruvost@inria.fr>
Date: Tue, 19 Mar 2024 16:19:20 +0100
Subject: [PATCH] Add an example + utest for using MPI subcommunicators

---
 example/CMakeLists.txt     |  14 ++-
 example/mpi/CMakeLists.txt |  37 +++++++
 example/mpi/comm_split.c   | 205 +++++++++++++++++++++++++++++++++++++
 example/mpi/comm_split.h   | 153 +++++++++++++++++++++++++++
 4 files changed, 404 insertions(+), 5 deletions(-)
 create mode 100644 example/mpi/CMakeLists.txt
 create mode 100644 example/mpi/comm_split.c
 create mode 100644 example/mpi/comm_split.h

diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index 9619f8073..4681c18ae 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -27,12 +27,16 @@ if (CHAMELEON_SIMULATION)
 endif()
 
 if (CHAMELEON_PREC_D)
-    add_subdirectory(lapack_to_chameleon)
+  add_subdirectory(lapack_to_chameleon)
 else()
-    message(WARNING "CHAMELEON_PREC_D is set to OFF so that lapack_to_chameleon "
-    "and out_core tutorials cannot be built (use only double arithmetic "
-    "precision).\n Please set CHAMELEON_PREC_D to ON if you want to build "
-    "executables of this tutorial.")
+  message(WARNING "CHAMELEON_PREC_D is set to OFF so that lapack_to_chameleon "
+  "and out_core tutorials cannot be built (use only double arithmetic "
+  "precision).\n Please set CHAMELEON_PREC_D to ON if you want to build "
+  "executables of this tutorial.")
+endif()
+
+if(CHAMELEON_USE_MPI AND MPI_C_FOUND AND CHAMELEON_SCHED_STARPU)
+  add_subdirectory(mpi)
 endif()
 
 ###
diff --git a/example/mpi/CMakeLists.txt b/example/mpi/CMakeLists.txt
new file mode 100644
index 000000000..c06e975b7
--- /dev/null
+++ b/example/mpi/CMakeLists.txt
@@ -0,0 +1,37 @@
+###
+#
+# @file CMakeLists.txt
+#
+# @copyright 2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+#                 Univ. Bordeaux. All rights reserved.
+#
+###
+#
+#  CHAMELEON example routines
+#  CHAMELEON is a software package provided by Inria Bordeaux - Sud-Ouest, LaBRI,
+#  University of Bordeaux, Bordeaux INP
+#
+# @version 1.3.0
+#  @author Florent Pruvost
+#  @author Mathieu Faverge
+#  @date 2024-03-19
+#
+###
+
+set(MPICMD mpiexec --bind-to none -n 4)
+
+set(SOURCES
+    comm_split.c
+   )
+
+foreach(_src ${SOURCES})
+  get_filename_component(_name_exe ${_src} NAME_WE)
+  add_executable(${_name_exe} ${_src})
+  target_link_libraries(${_name_exe} PRIVATE chameleon coreblas MORSE::LAPACKE)
+  install(TARGETS ${_name_exe} DESTINATION bin/chameleon/mpi)
+  add_test(example_mpi_${_name_exe} ${MPICMD} ./${_name_exe})
+endforeach()
+
+###
+### END CMakeLists.txt
+###
diff --git a/example/mpi/comm_split.c b/example/mpi/comm_split.c
new file mode 100644
index 000000000..834582be6
--- /dev/null
+++ b/example/mpi/comm_split.c
@@ -0,0 +1,205 @@
+/**
+ *
+ * @file comm_split.c
+ *
+ * @copyright 2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                 Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon comm_split example
+ *
+ * @version 1.3.0
+ * @author Florent Pruvost
+ * @author Mathieu Faverge
+ * @date 2024-03-19
+ *
+ */
+#include "comm_split.h"
+
+/*
+ * @brief comm_split introduces how to use CHAMELEON with MPI
+ * subcommunicators.
+ * @details This example shows that Chameleon can be used with custom MPI
+ * communicators (different from MPI_COMM_WORLD). Here two different algorithms
+ * (potrf and getrf_nopiv) are called at the same time on two different
+ * communicators A (0, 2) and B (1, 3). To use this program properly CHAMELEON
+ * must use StarPU Runtime system and MPI option must be activated at
+ * configure. This program is meant to be run with 4 MPI processes and the data
+ * distribution on matrices is 2D block cyclic P=2, Q=1.
+ */
+int main(int argc, char *argv[]) {
+
+  /* Check that MPI has threads support */
+  int thread_support;
+  if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS)
+  {
+    fprintf(stderr,"MPI_Init_thread failed\n");
+    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+  }
+  if (thread_support == MPI_THREAD_FUNNELED)
+  	fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
+  if (thread_support < MPI_THREAD_FUNNELED)
+  	fprintf(stderr,"Warning: MPI does not have thread support!\n");
+
+  /* Check that 4 MPI processes are used */
+  int comm_size;
+  MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
+  if(comm_size != 4)
+  {
+      printf("This application is meant to be run with 4 MPI processes, not %d.\n", comm_size);
+      MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+  }
+
+  /* Get my rank in the global communicator */
+  int my_rank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+
+  /* Determine the colour and key based on whether my rank is even. */
+  char subcommunicator;
+  int colour;
+  int key = my_rank/2;
+  if(my_rank % 2 == 0)
+  {
+      subcommunicator = 'A';
+      colour = 0;
+  }
+  else
+  {
+      subcommunicator = 'B';
+      colour = 1;
+  }
+
+  /* Split the global communicator */
+  MPI_Comm new_comm;
+  MPI_Comm_split(MPI_COMM_WORLD, colour, key, &new_comm);
+
+  /* Get my rank in the new communicator */
+  int my_new_comm_rank;
+  MPI_Comm_rank(new_comm, &my_new_comm_rank);
+
+  /* Print my new rank and new communicator */
+  printf("[MPI process %d] MPI process %d in subcommunicator %c.\n", my_rank, my_new_comm_rank, subcommunicator);
+
+  /* initialize some parameters with default values */
+  int iparam[IPARAM_SIZEOF];
+  memset(iparam, 0, IPARAM_SIZEOF*sizeof(int));
+  init_iparam(iparam);
+
+  /* read arguments */
+  read_args(argc, argv, iparam);
+  int N    = iparam[IPARAM_N]; // matrix order
+  int NB   = iparam[IPARAM_NB]; // number of rows and columns in tiles
+  int NRHS = iparam[IPARAM_NRHS]; // number of RHS vectors
+  int NCPU = iparam[IPARAM_NCPU]; // number of CPU cores to use
+  int P = 2;
+  int Q = 1;
+
+  /* Initialize CHAMELEON with custom communicator */
+  CHAMELEON_InitParComm( NCPU, 0, 1, new_comm );
+
+  CHAMELEON_Set(CHAMELEON_TILE_SIZE, iparam[IPARAM_NB] );
+
+  /* declarations to time the program and evaluate performances */
+  double flops, gflops, cpu_time;
+
+  if(my_rank % 2 == 0)
+  {
+    /* Cholesky on subcommunicator A i.e. 0,2 */
+
+    /* Initialize matrices */
+    CHAM_desc_t *descA, *descAC, *descB, *descX;
+
+    CHAMELEON_Desc_Create( &descA, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble,
+                           NB, NB, NB*NB, N, N, 0, 0, N, N, P, Q );
+    CHAMELEON_Desc_Create( &descAC, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble,
+                           NB, NB, NB*NB, N, N, 0, 0, N, N, P, Q );
+    CHAMELEON_Desc_Create( &descB, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble,
+                           NB, NB, NB*NB, N, NRHS, 0, 0, N, NRHS, P, Q );
+    CHAMELEON_Desc_Create( &descX, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble,
+                           NB, NB, NB*NB, N, NRHS, 0, 0, N, NRHS, P, Q );
+
+    CHAMELEON_dplgsy_Tile( (double)N, ChamUpperLower, descA, 20 );
+    CHAMELEON_dplrnt_Tile( descB, 21 );
+
+    CHAMELEON_dlacpy_Tile( ChamUpperLower, descA, descAC);
+    CHAMELEON_dlacpy_Tile( ChamUpperLower, descB, descX);
+
+    cpu_time = -CHAMELEON_timer();
+
+    /* Solve AX = B by Cholesky factorization */
+    CHAMELEON_dposv_Tile( ChamLower, descA, descX );
+
+    cpu_time += CHAMELEON_timer();
+
+    flops = flops_dpotrf( N ) + flops_dpotrs( N, NRHS );
+    gflops = flops * 1.e-9 / cpu_time;
+    if ( CHAMELEON_Comm_rank() == 0 ) {
+        printf( "\nCholesky performances on subcommunicator %c:\n", subcommunicator);
+        print_header( argv[0], iparam);
+        printf( "%9.3f %9.2f\n", cpu_time, gflops);
+    }
+    fflush( stdout );
+
+    /* compute norms to check the result */
+    check( descAC, descX, descB );
+
+    CHAMELEON_Desc_Destroy( &descX );
+    CHAMELEON_Desc_Destroy( &descAC );
+    CHAMELEON_Desc_Destroy( &descB );
+    CHAMELEON_Desc_Destroy( &descA );
+  }
+  else
+  {
+    /* LU nopiv on subcommunicator B i.e. 1,3 */
+
+    /* Initialize matrices */
+    CHAM_desc_t *descA, *descAC, *descB, *descX;
+
+    CHAMELEON_Desc_Create( &descA, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble,
+                           NB, NB, NB*NB, N, N, 0, 0, N, N, P, Q );
+    CHAMELEON_Desc_Create( &descAC, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble,
+                           NB, NB, NB*NB, N, N, 0, 0, N, N, P, Q );
+    CHAMELEON_Desc_Create( &descB, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble,
+                           NB, NB, NB*NB, N, NRHS, 0, 0, N, NRHS, P, Q );
+    CHAMELEON_Desc_Create( &descX, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble,
+                           NB, NB, NB*NB, N, NRHS, 0, 0, N, NRHS, P, Q );
+
+    CHAMELEON_dplgtr_Tile( 0        , ChamUpper, descA, 10 );
+    CHAMELEON_dplgtr_Tile( (double)N, ChamLower, descA, 11 );
+    CHAMELEON_dplrnt_Tile( descB, 12 );
+
+    CHAMELEON_dlacpy_Tile( ChamUpperLower, descA, descAC);
+    CHAMELEON_dlacpy_Tile( ChamUpperLower, descB, descX);
+
+    cpu_time = -CHAMELEON_timer();
+
+    /* Solve AX = B by LU factorization */
+    CHAMELEON_dgesv_nopiv_Tile( descAC, descX );
+
+    cpu_time += CHAMELEON_timer();
+
+    flops = flops_dgetrf( N, N ) + flops_dgetrs( N, NRHS );
+    gflops = flops * 1.e-9 / cpu_time;
+    if ( CHAMELEON_Comm_rank() == 0 ) {
+        printf( "\nLU nopiv performances on subcommunicator %c:\n", subcommunicator);
+        print_header( argv[0], iparam);
+        printf( "%9.3f %9.2f\n", cpu_time, gflops);
+    }
+    fflush( stdout );
+
+    /* compute norms to check the result */
+    check( descA, descX, descB );
+
+    CHAMELEON_Desc_Destroy( &descX );
+    CHAMELEON_Desc_Destroy( &descAC );
+    CHAMELEON_Desc_Destroy( &descB );
+    CHAMELEON_Desc_Destroy( &descA );
+  }
+
+  CHAMELEON_Finalize();
+
+  MPI_Finalize();
+
+  return EXIT_SUCCESS;
+}
diff --git a/example/mpi/comm_split.h b/example/mpi/comm_split.h
new file mode 100644
index 000000000..ecbd19f5a
--- /dev/null
+++ b/example/mpi/comm_split.h
@@ -0,0 +1,153 @@
+/**
+ *
+ * @file comm_split.h
+ *
+ * @copyright 2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                 Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon comm_split example header
+ *
+ * @version 1.3.0
+ * @author Florent Pruvost
+ * @author Mathieu Faverge
+ * @date 2024-03-19
+ *
+ */
+#ifndef _comm_split_h_
+#define _comm_split_h_
+
+#include <coreblas/lapacke.h>
+#include <chameleon.h>
+#include <chameleon/timer.h>
+#include <chameleon/flops.h>
+#include <mpi.h>
+#include <string.h>
+
+/* Integer parameters for comm_split */
+enum iparam_comm_split {
+    IPARAM_NCPU, /* Number of CPUs                       */
+    IPARAM_N,    /* Number of columns/rows of the matrix */
+    IPARAM_NB,   /* Number of columns/rows in a tile     */
+    IPARAM_NRHS, /* Number of RHS                        */
+    /* End */
+    IPARAM_SIZEOF
+};
+
+/* Specific routines used in comm_split.c main program */
+
+/**
+ * Initialize integer parameters
+ */
+static void init_iparam(int iparam[IPARAM_SIZEOF]){
+    iparam[IPARAM_NCPU ] = -1;
+    iparam[IPARAM_N    ] = 1000;
+    iparam[IPARAM_NB   ] = 500;
+    iparam[IPARAM_NRHS ] = 1;
+ }
+
+/**
+ * Print how to use the program
+ */
+static void show_help(char *prog_name) {
+    printf( "Usage:\n%s [options]\n\n", prog_name );
+    printf( "Options are:\n"
+            "  --help   Show this help\n"
+            "\n"
+            "  --n=X    dimension (N). (default: 1000)\n"
+            "  --nb=X   NB size. (default: 500)\n"
+            "  --nrhs=X number of RHS. (default: 1)\n"
+            "\n"
+            "  --cpus=X Number of CPU workers (default: _SC_NPROCESSORS_ONLN)\n"
+            "\n");
+}
+
+static int startswith(const char *s, const char *prefix) {
+    size_t n = strlen( prefix );
+    if (strncmp( s, prefix, n ))
+        return 0;
+    return 1;
+}
+
+/**
+ * Read arguments following comm_split program call
+ */
+static void read_args(int argc, char *argv[], int *iparam){
+    int i;
+    for (i = 1; i < argc && argv[i]; ++i) {
+        if ( startswith( argv[i], "--help") || startswith( argv[i], "-help") ||
+             startswith( argv[i], "--h") || startswith( argv[i], "-h") ) {
+            show_help( argv[0] );
+            exit(0);
+        } else if (startswith( argv[i], "--n=" )) {
+            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[IPARAM_N]) );
+        } else if (startswith( argv[i], "--nb=" )) {
+            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[IPARAM_NB]) );
+        } else if (startswith( argv[i], "--nrhs=" )) {
+            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[IPARAM_NRHS]) );
+        } else if (startswith( argv[i], "--cpus=" )) {
+            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[IPARAM_NCPU]) );
+        } else {
+            fprintf( stderr, "Unknown option: %s\n", argv[i] );
+        }
+    }
+}
+
+/**
+ * Print a header message to summarize main parameters
+ */
+static void print_header(char *prog_name, int * iparam) {
+    double    eps = LAPACKE_dlamch_work( 'e' );
+    printf( "#\n"
+            "# CHAMELEON %d.%d.%d, %s\n"
+            "# Nb cpu: %d\n"
+            "# N:      %d\n"
+            "# NB:     %d\n"
+            "# eps:    %e\n"
+            "#\n",
+            CHAMELEON_VERSION_MAJOR,
+            CHAMELEON_VERSION_MINOR,
+            CHAMELEON_VERSION_MICRO,
+            prog_name,
+            iparam[IPARAM_NCPU],
+            iparam[IPARAM_N],
+            iparam[IPARAM_NB],
+            eps );
+
+    printf( "#      M       N  K/NRHS   seconds   Gflop/s\n");
+    printf( "#%7d %7d %7d ", iparam[IPARAM_N], iparam[IPARAM_N], iparam[IPARAM_NRHS]);
+    fflush( stdout );
+    return;
+}
+
+/**
+ * Check that AX=B is correct
+ */
+static void check(CHAM_desc_t *A, CHAM_desc_t *X, CHAM_desc_t *B) {
+  double anorm = CHAMELEON_dlange_Tile( ChamInfNorm, A);
+  double xnorm = CHAMELEON_dlange_Tile( ChamInfNorm, X);
+  double bnorm = CHAMELEON_dlange_Tile( ChamInfNorm, B);
+  CHAMELEON_dgemm_Tile( ChamNoTrans, ChamNoTrans,
+                        1.0, A, X, -1.0, B );
+  double res = CHAMELEON_dlange_Tile( ChamInfNorm, B );
+  double eps = LAPACKE_dlamch_work( 'e' );
+  int N = X->lm;
+  int hres = ( res / N / eps / (anorm * xnorm + bnorm ) > 100.0 );
+  if ( CHAMELEON_Comm_rank() == 0 ){
+    printf( "   ||Ax-b||       ||A||       ||x||       ||b|| ||Ax-b||/N/eps/(||A||||x||+||b||)  RETURN\n");
+    if (hres) {
+      printf( "%8.5e %8.5e %8.5e %8.5e                       %8.5e FAILURE \n",
+              res, anorm, xnorm, bnorm,
+              res / N / eps / (anorm * xnorm + bnorm ));
+    }
+    else {
+      printf( "%8.5e %8.5e %8.5e %8.5e                       %8.5e SUCCESS \n",
+              res, anorm, xnorm, bnorm,
+              res / N / eps / (anorm * xnorm + bnorm ));
+    }
+  }
+  return;
+}
+
+#endif /* _comm_split_h_ */
-- 
GitLab