From 16b7c4ef5ba6d4459403d219cf93c6be523b3280 Mon Sep 17 00:00:00 2001 From: Florent Pruvost <florent.pruvost@inria.fr> Date: Tue, 19 Mar 2024 16:19:20 +0100 Subject: [PATCH] Add an example + utest for using MPI subcommunicators --- example/CMakeLists.txt | 14 ++- example/mpi/CMakeLists.txt | 37 +++++++ example/mpi/comm_split.c | 205 +++++++++++++++++++++++++++++++++++++ example/mpi/comm_split.h | 153 +++++++++++++++++++++++++++ 4 files changed, 404 insertions(+), 5 deletions(-) create mode 100644 example/mpi/CMakeLists.txt create mode 100644 example/mpi/comm_split.c create mode 100644 example/mpi/comm_split.h diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 9619f8073..4681c18ae 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -27,12 +27,16 @@ if (CHAMELEON_SIMULATION) endif() if (CHAMELEON_PREC_D) - add_subdirectory(lapack_to_chameleon) + add_subdirectory(lapack_to_chameleon) else() - message(WARNING "CHAMELEON_PREC_D is set to OFF so that lapack_to_chameleon " - "and out_core tutorials cannot be built (use only double arithmetic " - "precision).\n Please set CHAMELEON_PREC_D to ON if you want to build " - "executables of this tutorial.") + message(WARNING "CHAMELEON_PREC_D is set to OFF so that lapack_to_chameleon " + "and out_core tutorials cannot be built (use only double arithmetic " + "precision).\n Please set CHAMELEON_PREC_D to ON if you want to build " + "executables of this tutorial.") +endif() + +if(CHAMELEON_USE_MPI AND MPI_C_FOUND AND CHAMELEON_SCHED_STARPU) + add_subdirectory(mpi) endif() ### diff --git a/example/mpi/CMakeLists.txt b/example/mpi/CMakeLists.txt new file mode 100644 index 000000000..c06e975b7 --- /dev/null +++ b/example/mpi/CMakeLists.txt @@ -0,0 +1,37 @@ +### +# +# @file CMakeLists.txt +# +# @copyright 2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, +# Univ. Bordeaux. All rights reserved. 
#
###
#
# CHAMELEON example routines
# CHAMELEON is a software package provided by Inria Bordeaux - Sud-Ouest, LaBRI,
# University of Bordeaux, Bordeaux INP
#
# @version 1.3.0
# @author Florent Pruvost
# @author Mathieu Faverge
# @date 2024-03-19
#
###

# Launcher used to run the example as a test.  comm_split.c requires exactly
# 4 MPI processes (two subcommunicators of 2 processes each).
# Prefer the launcher detected by FindMPI when available; fall back to a
# plain `mpiexec` otherwise.
if(MPIEXEC_EXECUTABLE)
  set(MPICMD ${MPIEXEC_EXECUTABLE} --bind-to none ${MPIEXEC_NUMPROC_FLAG} 4)
else()
  set(MPICMD mpiexec --bind-to none -n 4)
endif()

set(SOURCES
  comm_split.c
  )

foreach(_src ${SOURCES})
  get_filename_component(_name_exe ${_src} NAME_WE)
  add_executable(${_name_exe} ${_src})
  target_link_libraries(${_name_exe} PRIVATE chameleon coreblas MORSE::LAPACKE)
  install(TARGETS ${_name_exe} DESTINATION bin/chameleon/mpi)
  # Use the TARGET_FILE generator expression so the test locates the binary
  # regardless of the test working directory or multi-config build layouts
  # (the previous `./${_name_exe}` form only worked when the test ran from
  # the build directory containing the executable).
  add_test(example_mpi_${_name_exe} ${MPICMD} $<TARGET_FILE:${_name_exe}>)
endforeach()

###
### END CMakeLists.txt
###
diff --git a/example/mpi/comm_split.c b/example/mpi/comm_split.c
new file mode 100644
index 000000000..834582be6
--- /dev/null
+++ b/example/mpi/comm_split.c
@@ -0,0 +1,205 @@
/**
 *
 * @file comm_split.c
 *
 * @copyright 2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
 *                 Univ. Bordeaux. All rights reserved.
 *
 ***
 *
 * @brief Chameleon comm_split example
 *
 * @version 1.3.0
 * @author Florent Pruvost
 * @author Mathieu Faverge
 * @date 2024-03-19
 *
 */
#include "comm_split.h"

/*
 * @brief comm_split introduces how to use CHAMELEON with MPI
 * subcommunicators.
 * @details This example shows that Chameleon can be used with custom MPI
 * communicators (different from MPI_COMM_WORLD). Here two different algorithms
 * (potrf and getrf_nopiv) are called at the same time on two different
 * communicators A (0, 2) and B (1, 3). To use this program properly CHAMELEON
 * must use StarPU Runtime system and MPI option must be activated at
 * configure. This program is meant to be run with 4 MPI processes and the data
 * distribution on matrices is 2D block cyclic P=2, Q=1.
+ */ +int main(int argc, char *argv[]) { + + /* Check that MPI has threads support */ + int thread_support; + if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS) + { + fprintf(stderr,"MPI_Init_thread failed\n"); + MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); + } + if (thread_support == MPI_THREAD_FUNNELED) + fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n"); + if (thread_support < MPI_THREAD_FUNNELED) + fprintf(stderr,"Warning: MPI does not have thread support!\n"); + + /* Check that 4 MPI processes are used */ + int comm_size; + MPI_Comm_size(MPI_COMM_WORLD, &comm_size); + if(comm_size != 4) + { + printf("This application is meant to be run with 4 MPI processes, not %d.\n", comm_size); + MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); + } + + /* Get my rank in the global communicator */ + int my_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + + /* Determine the colour and key based on whether my rank is even. 
*/ + char subcommunicator; + int colour; + int key = my_rank/2; + if(my_rank % 2 == 0) + { + subcommunicator = 'A'; + colour = 0; + } + else + { + subcommunicator = 'B'; + colour = 1; + } + + /* Split the global communicator */ + MPI_Comm new_comm; + MPI_Comm_split(MPI_COMM_WORLD, colour, key, &new_comm); + + /* Get my rank in the new communicator */ + int my_new_comm_rank; + MPI_Comm_rank(new_comm, &my_new_comm_rank); + + /* Print my new rank and new communicator */ + printf("[MPI process %d] MPI process %d in subcommunicator %c.\n", my_rank, my_new_comm_rank, subcommunicator); + + /* initialize some parameters with default values */ + int iparam[IPARAM_SIZEOF]; + memset(iparam, 0, IPARAM_SIZEOF*sizeof(int)); + init_iparam(iparam); + + /* read arguments */ + read_args(argc, argv, iparam); + int N = iparam[IPARAM_N]; // matrix order + int NB = iparam[IPARAM_NB]; // number of rows and columns in tiles + int NRHS = iparam[IPARAM_NRHS]; // number of RHS vectors + int NCPU = iparam[IPARAM_NCPU]; // number of CPU cores to use + int P = 2; + int Q = 1; + + /* Initialize CHAMELEON with custom communicator */ + CHAMELEON_InitParComm( NCPU, 0, 1, new_comm ); + + CHAMELEON_Set(CHAMELEON_TILE_SIZE, iparam[IPARAM_NB] ); + + /* declarations to time the program and evaluate performances */ + double flops, gflops, cpu_time; + + if(my_rank % 2 == 0) + { + /* Cholesky on subcommunicator A i.e. 
0,2 */ + + /* Initialize matrices */ + CHAM_desc_t *descA, *descAC, *descB, *descX; + + CHAMELEON_Desc_Create( &descA, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, + NB, NB, NB*NB, N, N, 0, 0, N, N, P, Q ); + CHAMELEON_Desc_Create( &descAC, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, + NB, NB, NB*NB, N, N, 0, 0, N, N, P, Q ); + CHAMELEON_Desc_Create( &descB, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, + NB, NB, NB*NB, N, NRHS, 0, 0, N, NRHS, P, Q ); + CHAMELEON_Desc_Create( &descX, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, + NB, NB, NB*NB, N, NRHS, 0, 0, N, NRHS, P, Q ); + + CHAMELEON_dplgsy_Tile( (double)N, ChamUpperLower, descA, 20 ); + CHAMELEON_dplrnt_Tile( descB, 21 ); + + CHAMELEON_dlacpy_Tile( ChamUpperLower, descA, descAC); + CHAMELEON_dlacpy_Tile( ChamUpperLower, descB, descX); + + cpu_time = -CHAMELEON_timer(); + + /* Solve AX = B by Cholesky factorization */ + CHAMELEON_dposv_Tile( ChamLower, descA, descX ); + + cpu_time += CHAMELEON_timer(); + + flops = flops_dpotrf( N ) + flops_dpotrs( N, NRHS ); + gflops = flops * 1.e-9 / cpu_time; + if ( CHAMELEON_Comm_rank() == 0 ) { + printf( "\nCholesky performances on subcommunicator %c:\n", subcommunicator); + print_header( argv[0], iparam); + printf( "%9.3f %9.2f\n", cpu_time, gflops); + } + fflush( stdout ); + + /* compute norms to check the result */ + check( descAC, descX, descB ); + + CHAMELEON_Desc_Destroy( &descX ); + CHAMELEON_Desc_Destroy( &descAC ); + CHAMELEON_Desc_Destroy( &descB ); + CHAMELEON_Desc_Destroy( &descA ); + } + else + { + /* LU nopiv on subcommunicator B i.e. 
1,3 */ + + /* Initialize matrices */ + CHAM_desc_t *descA, *descAC, *descB, *descX; + + CHAMELEON_Desc_Create( &descA, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, + NB, NB, NB*NB, N, N, 0, 0, N, N, P, Q ); + CHAMELEON_Desc_Create( &descAC, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, + NB, NB, NB*NB, N, N, 0, 0, N, N, P, Q ); + CHAMELEON_Desc_Create( &descB, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, + NB, NB, NB*NB, N, NRHS, 0, 0, N, NRHS, P, Q ); + CHAMELEON_Desc_Create( &descX, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, + NB, NB, NB*NB, N, NRHS, 0, 0, N, NRHS, P, Q ); + + CHAMELEON_dplgtr_Tile( 0 , ChamUpper, descA, 10 ); + CHAMELEON_dplgtr_Tile( (double)N, ChamLower, descA, 11 ); + CHAMELEON_dplrnt_Tile( descB, 12 ); + + CHAMELEON_dlacpy_Tile( ChamUpperLower, descA, descAC); + CHAMELEON_dlacpy_Tile( ChamUpperLower, descB, descX); + + cpu_time = -CHAMELEON_timer(); + + /* Solve AX = B by LU factorization */ + CHAMELEON_dgesv_nopiv_Tile( descAC, descX ); + + cpu_time += CHAMELEON_timer(); + + flops = flops_dgetrf( N, N ) + flops_dgetrs( N, NRHS ); + gflops = flops * 1.e-9 / cpu_time; + if ( CHAMELEON_Comm_rank() == 0 ) { + printf( "\nLU nopiv performances on subcommunicator %c:\n", subcommunicator); + print_header( argv[0], iparam); + printf( "%9.3f %9.2f\n", cpu_time, gflops); + } + fflush( stdout ); + + /* compute norms to check the result */ + check( descA, descX, descB ); + + CHAMELEON_Desc_Destroy( &descX ); + CHAMELEON_Desc_Destroy( &descAC ); + CHAMELEON_Desc_Destroy( &descB ); + CHAMELEON_Desc_Destroy( &descA ); + } + + CHAMELEON_Finalize(); + + MPI_Finalize(); + + return EXIT_SUCCESS; +} diff --git a/example/mpi/comm_split.h b/example/mpi/comm_split.h new file mode 100644 index 000000000..ecbd19f5a --- /dev/null +++ b/example/mpi/comm_split.h @@ -0,0 +1,153 @@ +/** + * + * @file comm_split.h + * + * @copyright 2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. 
+ * + *** + * + * @brief Chameleon comm_split example header + * + * @version 1.3.0 + * @author Florent Pruvost + * @author Mathieu Faverge + * @date 2024-03-19 + * + */ +#ifndef _comm_split_h_ +#define _comm_split_h_ + +#include <coreblas/lapacke.h> +#include <chameleon.h> +#include <chameleon/timer.h> +#include <chameleon/flops.h> +#include <mpi.h> +#include <string.h> + +/* Integer parameters for comm_split */ +enum iparam_comm_split { + IPARAM_NCPU, /* Number of CPUs */ + IPARAM_N, /* Number of columns/rows of the matrix */ + IPARAM_NB, /* Number of columns/rows in a tile */ + IPARAM_NRHS, /* Number of RHS */ + /* End */ + IPARAM_SIZEOF +}; + +/* Specific routines used in comm_split.c main program */ + +/** + * Initialize integer parameters + */ +static void init_iparam(int iparam[IPARAM_SIZEOF]){ + iparam[IPARAM_NCPU ] = -1; + iparam[IPARAM_N ] = 1000; + iparam[IPARAM_NB ] = 500; + iparam[IPARAM_NRHS ] = 1; + } + +/** + * Print how to use the program + */ +static void show_help(char *prog_name) { + printf( "Usage:\n%s [options]\n\n", prog_name ); + printf( "Options are:\n" + " --help Show this help\n" + "\n" + " --n=X dimension (N). (default: 1000)\n" + " --nb=X NB size. (default: 500)\n" + " --nrhs=X number of RHS. 
(default: 1)\n" + "\n" + " --cpus=X Number of CPU workers (default: _SC_NPROCESSORS_ONLN)\n" + "\n"); +} + +static int startswith(const char *s, const char *prefix) { + size_t n = strlen( prefix ); + if (strncmp( s, prefix, n )) + return 0; + return 1; +} + +/** + * Read arguments following comm_split program call + */ +static void read_args(int argc, char *argv[], int *iparam){ + int i; + for (i = 1; i < argc && argv[i]; ++i) { + if ( startswith( argv[i], "--help") || startswith( argv[i], "-help") || + startswith( argv[i], "--h") || startswith( argv[i], "-h") ) { + show_help( argv[0] ); + exit(0); + } else if (startswith( argv[i], "--n=" )) { + sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[IPARAM_N]) ); + } else if (startswith( argv[i], "--nb=" )) { + sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[IPARAM_NB]) ); + } else if (startswith( argv[i], "--nrhs=" )) { + sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[IPARAM_NRHS]) ); + } else if (startswith( argv[i], "--cpus=" )) { + sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[IPARAM_NCPU]) ); + } else { + fprintf( stderr, "Unknown option: %s\n", argv[i] ); + } + } +} + +/** + * Print a header message to summarize main parameters + */ +static void print_header(char *prog_name, int * iparam) { + double eps = LAPACKE_dlamch_work( 'e' ); + printf( "#\n" + "# CHAMELEON %d.%d.%d, %s\n" + "# Nb cpu: %d\n" + "# N: %d\n" + "# NB: %d\n" + "# eps: %e\n" + "#\n", + CHAMELEON_VERSION_MAJOR, + CHAMELEON_VERSION_MINOR, + CHAMELEON_VERSION_MICRO, + prog_name, + iparam[IPARAM_NCPU], + iparam[IPARAM_N], + iparam[IPARAM_NB], + eps ); + + printf( "# M N K/NRHS seconds Gflop/s\n"); + printf( "#%7d %7d %7d ", iparam[IPARAM_N], iparam[IPARAM_N], iparam[IPARAM_NRHS]); + fflush( stdout ); + return; +} + +/** + * Check that AX=B is correct + */ +static void check(CHAM_desc_t *A, CHAM_desc_t *X, CHAM_desc_t *B) { + double anorm = CHAMELEON_dlange_Tile( ChamInfNorm, A); + double xnorm = CHAMELEON_dlange_Tile( ChamInfNorm, X); + 
double bnorm = CHAMELEON_dlange_Tile( ChamInfNorm, B); + CHAMELEON_dgemm_Tile( ChamNoTrans, ChamNoTrans, + 1.0, A, X, -1.0, B ); + double res = CHAMELEON_dlange_Tile( ChamInfNorm, B ); + double eps = LAPACKE_dlamch_work( 'e' ); + int N = X->lm; + int hres = ( res / N / eps / (anorm * xnorm + bnorm ) > 100.0 ); + if ( CHAMELEON_Comm_rank() == 0 ){ + printf( " ||Ax-b|| ||A|| ||x|| ||b|| ||Ax-b||/N/eps/(||A||||x||+||b||) RETURN\n"); + if (hres) { + printf( "%8.5e %8.5e %8.5e %8.5e %8.5e FAILURE \n", + res, anorm, xnorm, bnorm, + res / N / eps / (anorm * xnorm + bnorm )); + } + else { + printf( "%8.5e %8.5e %8.5e %8.5e %8.5e SUCCESS \n", + res, anorm, xnorm, bnorm, + res / N / eps / (anorm * xnorm + bnorm )); + } + } + return; +} + +#endif /* _comm_split_h_ */ -- GitLab