From 84ba24fbf6d2f7d372fc26e10138644afa5802a0 Mon Sep 17 00:00:00 2001 From: Ana Hourcau <ana.hourcau@inria.fr> Date: Mon, 3 Jun 2024 16:51:45 +0200 Subject: [PATCH] starpu/codelet: Improve reduction at a runtime level to avoid mixed-up tags problem --- compute/pzgered.c | 33 +++--- include/chameleon/chameleon_z.h | 7 +- include/chameleon/tasks_z.h | 8 +- runtime/openmp/codelets/codelet_zgered.c | 12 +- runtime/parsec/codelets/codelet_zgered.c | 12 +- runtime/quark/codelets/codelet_zgered.c | 12 +- runtime/starpu/codelets/codelet_zgered.c | 111 ++++++++++++------ runtime/starpu/codelets/codelet_zgerst.c | 48 +++++--- runtime/starpu/include/cham_tile_interface.h | 19 ++- .../starpu/interface/cham_tile_interface.c | 18 +-- 10 files changed, 174 insertions(+), 106 deletions(-) diff --git a/compute/pzgered.c b/compute/pzgered.c index c1624db1b..1051ee91f 100644 --- a/compute/pzgered.c +++ b/compute/pzgered.c @@ -13,7 +13,8 @@ * * @version 1.3.0 * @author Mathieu Faverge - * @date 2023-07-06 + * @author Ana Hourcau + * @date 2024-07-17 * @precisions normal z -> z d * */ @@ -28,8 +29,8 @@ static inline void chameleon_pzgered_frb( cham_uplo_t uplo, - CHAM_desc_t *A, CHAM_desc_t *Wnorm, CHAM_desc_t *Welt, - RUNTIME_option_t *options ) + CHAM_desc_t *A, CHAM_desc_t *Wnorm, CHAM_desc_t *Welt, + RUNTIME_option_t *options ) { double alpha = 1.0; double beta = 0.0; @@ -233,21 +234,17 @@ void chameleon_pzgered( cham_uplo_t uplo, double prec, CHAM_desc_t *A, for(n = nmin; n < nmax; n++) { CHAM_tile_t *tile = A->get_blktile( A, m, n ); - if ( tile->rank == A->myrank ) { - int tempnn = ( n == (A->nt-1) ) ? 
A->n - n * A->nb : A->nb; - - /* Get the frobenius norm of the tile A( m, n ) */ - lnorm = ((double*)((Wcol.get_blktile( &Wcol, m, n ))->mat))[0]; - - /* - * u_{high} = 1e-16 (later should be application accuraccy) - * u_{low} = 1e-8 - * ||A_{i,j}||_F < u_{high} * || A ||_F / (nt * u_{low}) - * ||A_{i,j}||_F < threshold / u_{low} - */ - INSERT_TASK_zgered( &options, threshold, lnorm, - tempmm, tempnn, A( m, n ) ); - } + + int tempnn = ( n == (A->nt-1) ) ? A->n - n * A->nb : A->nb; + + /* + * u_{high} = 1e-16 (later should be application accuracy) + * u_{low} = 1e-8 + * ||A_{i,j}||_F < u_{high} * || A ||_F / (nt * u_{low}) + * ||A_{i,j}||_F < threshold / u_{low} + */ + INSERT_TASK_zgered( &options, threshold, + tempmm, tempnn, A( m, n ), W( &Wcol, m, n ) ); } } diff --git a/include/chameleon/chameleon_z.h b/include/chameleon/chameleon_z.h index 5d667cca3..75e214843 100644 --- a/include/chameleon/chameleon_z.h +++ b/include/chameleon/chameleon_z.h @@ -23,7 +23,8 @@ * @author Florent Pruvost * @author Alycia Lisito * @author Matthieu Kuhn - * @date 2024-04-03 + * @author Ana Hourcau + * @date 2024-07-17 * @precisions normal z -> c d s * */ @@ -168,10 +169,8 @@ int CHAMELEON_zplrnk_Tile(int K, CHAM_desc_t *C, unsigned long long int seedA, u int CHAMELEON_zpoinv_Tile(cham_uplo_t uplo, CHAM_desc_t *A); int CHAMELEON_zposv_Tile(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B); int CHAMELEON_zpotrf_Tile(cham_uplo_t uplo, CHAM_desc_t *A); -#if defined(PRECISION_z) || defined(PRECISION_d) int CHAMELEON_zgered_Tile( cham_uplo_t uplo, double prec, CHAM_desc_t *A ); int CHAMELEON_zgerst_Tile( cham_uplo_t uplo, CHAM_desc_t *A ); -#endif int CHAMELEON_zsytrf_Tile(cham_uplo_t uplo, CHAM_desc_t *A); int CHAMELEON_zpotri_Tile(cham_uplo_t uplo, CHAM_desc_t *A); int CHAMELEON_zpotrimm_Tile(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *C); @@ -249,10 +248,8 @@ int CHAMELEON_zplrnk_Tile_Async(int K, CHAM_desc_t *C, unsigned long long int se int 
CHAMELEON_zpoinv_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zposv_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zpotrf_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); -#if defined(PRECISION_z) || defined(PRECISION_d) int CHAMELEON_zgered_Tile_Async(cham_uplo_t uplo, double prec, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgerst_Tile_Async( cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); -#endif int CHAMELEON_zsytrf_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zpotri_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zpotrimm_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index b330ec7d8..795ebd2d1 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -24,7 +24,8 @@ * @author Alycia Lisito * @author Romain Peressoni * @author Matthieu Kuhn - * @date 2023-09-11 + * @author Ana Hourcau + * @date 2024-07-17 * @precisions normal z -> c d s * */ @@ -79,8 +80,9 @@ void INSERT_TASK_zgeqrt( const RUNTIME_option_t *options, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *T, int Tm, int Tn ); void INSERT_TASK_zgered( const RUNTIME_option_t *options, - double threshold, double Anorm, int m, int n, - const CHAM_desc_t *A, int Am, int An ); + double threshold, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *Wnorm, int Wnm, int Wnn ); void INSERT_TASK_zgerst( const RUNTIME_option_t *options, int m, int n, const 
CHAM_desc_t *A, int Am, int An ); diff --git a/runtime/openmp/codelets/codelet_zgered.c b/runtime/openmp/codelets/codelet_zgered.c index 19e6f9118..20b0c1912 100644 --- a/runtime/openmp/codelets/codelet_zgered.c +++ b/runtime/openmp/codelets/codelet_zgered.c @@ -11,24 +11,28 @@ * * @version 1.3.0 * @author Mathieu Faverge - * @date 2023-07-06 + * @author Ana Hourcau + * @date 2024-07-17 * @precisions normal z -> d * */ #include "chameleon_openmp.h" void INSERT_TASK_zgered( const RUNTIME_option_t *options, - double threshold, double Anorm, int m, int n, - const CHAM_desc_t *A, int Am, int An ) + double threshold, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *Wnorm, int Wnm, int Wnn ) { fprintf( stderr, "WARNING: gered kernel is not available with OpenMP\n" ); (void)options; (void)threshold; - (void)Anorm; (void)m; (void)n; (void)A; (void)Am; (void)An; + (void)Wnorm; + (void)Wnm; + (void)Wnn; } diff --git a/runtime/parsec/codelets/codelet_zgered.c b/runtime/parsec/codelets/codelet_zgered.c index dcc20888b..338a7b5ff 100644 --- a/runtime/parsec/codelets/codelet_zgered.c +++ b/runtime/parsec/codelets/codelet_zgered.c @@ -11,24 +11,28 @@ * * @version 1.3.0 * @author Mathieu Faverge - * @date 2023-07-06 + * @author Ana Hourcau + * @date 2024-07-17 * @precisions normal z -> d * */ #include "chameleon_parsec.h" void INSERT_TASK_zgered( const RUNTIME_option_t *options, - double threshold, double Anorm, int m, int n, - const CHAM_desc_t *A, int Am, int An ) + double threshold, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *Wnorm, int Wnm, int Wnn ) { fprintf( stderr, "WARNING: gered kernel is not available with PaRSEC\n" ); (void)options; (void)threshold; - (void)Anorm; (void)m; (void)n; (void)A; (void)Am; (void)An; + (void)Wnorm; + (void)Wnm; + (void)Wnn; } diff --git a/runtime/quark/codelets/codelet_zgered.c b/runtime/quark/codelets/codelet_zgered.c index 773bd7cd9..b07695f70 100644 --- 
a/runtime/quark/codelets/codelet_zgered.c +++ b/runtime/quark/codelets/codelet_zgered.c @@ -11,24 +11,28 @@ * * @version 1.3.0 * @author Mathieu Faverge - * @date 2023-07-06 + * @author Ana Hourcau + * @date 2024-07-17 * @precisions normal z -> d * */ #include "chameleon_quark.h" void INSERT_TASK_zgered( const RUNTIME_option_t *options, - double threshold, double Anorm, int m, int n, - const CHAM_desc_t *A, int Am, int An ) + double threshold, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *Wnorm, int Wnm, int Wnn ) { fprintf( stderr, "WARNING: gered kernel is not available with Quark\n" ); (void)options; (void)threshold; - (void)Anorm; (void)m; (void)n; (void)A; (void)Am; (void)An; + (void)Wnorm; + (void)Wnm; + (void)Wnn; } diff --git a/runtime/starpu/codelets/codelet_zgered.c b/runtime/starpu/codelets/codelet_zgered.c index 11b002cea..fe1c4927e 100644 --- a/runtime/starpu/codelets/codelet_zgered.c +++ b/runtime/starpu/codelets/codelet_zgered.c @@ -13,6 +13,7 @@ * * @version 1.3.0 * @author Mathieu Faverge + * @author Ana Hourcau * @date 2024-07-17 * @precisions normal z -> d * @@ -23,21 +24,35 @@ #include "runtime_codelet_z.h" void INSERT_TASK_zgered( const RUNTIME_option_t *options, - double threshold, double Anorm, int m, int n, - const CHAM_desc_t *A, int Am, int An ) + double threshold, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *Wnorm, int Wnm, int Wnn ) { CHAM_tile_t *tileA; - double u_low; + double u_low, lnorm; int64_t mm, nn; -#if defined(CHAMELEON_USE_MPI) - int tag; -#endif + int tag = -1; starpu_data_handle_t *handleAin; starpu_data_handle_t handleAout; - CHAMELEON_BEGIN_ACCESS_DECLARATION; - CHAMELEON_ACCESS_RW(A, Am, An); - CHAMELEON_END_ACCESS_DECLARATION; + /* + * Collect the norm of the tile on all nodes to do the data conversion + * if owned, and only the new data registration if not owned + */ + { + starpu_data_handle_t handleNorm = RTBLKADDR( Wnorm, ChamDouble, Wnm, Wnn ); + 
CHAM_tile_t *tileNorm; + +#if defined(CHAMELEON_USE_MPI) + starpu_mpi_get_data_on_all_nodes_detached( options->sequence->comm, handleNorm ); +#endif + starpu_data_acquire( handleNorm, STARPU_R ); + + tileNorm = cti_handle_get( handleNorm ); + lnorm = ((double *)(tileNorm->mat))[0]; + + starpu_data_release( handleNorm ); + } /* Get the Input handle */ mm = Am + (A->i / A->mb); @@ -45,8 +60,6 @@ void INSERT_TASK_zgered( const RUNTIME_option_t *options, handleAin = A->schedopt; handleAin += ((int64_t)A->lmt) * nn + mm; - assert( *handleAin != NULL ); - /* * Lets convert the tile precision based on the following criteria: * @@ -54,10 +67,14 @@ void INSERT_TASK_zgered( const RUNTIME_option_t *options, * ||A_{i,j}||_F < u_{high} * || A ||_F / nt * 1/ u_{low} * ||A_{i,j}||_F < threshold / u_{low} */ - tileA = A->get_blktile( A, Am, An ); + #if defined(CHAMELEON_USE_MPI) - tag = starpu_mpi_data_get_tag( *handleAin ); + /* Backup the MPI tag */ + if (A->myrank == tileA->rank) + { + tag = starpu_mpi_data_get_tag( *handleAin ); + } #endif /* defined(CHAMELEON_USE_MPI) */ #if defined(CHAMELEON_USE_CUDA) && (CUDA_VERSION >= 7500) @@ -67,11 +84,12 @@ void INSERT_TASK_zgered( const RUNTIME_option_t *options, * Check for half precision */ u_low = 1.e-4; - if ( Anorm < (threshold / u_low) ) { + if ( lnorm < (threshold / u_low) ) + { #if defined(CHAMELEON_DEBUG_GERED) fprintf( stderr, "[%2d] Convert the tile ( %d, %d ) to half precision\n", - A->myrank, Am, An ); + A->myrank, Am, An); #endif starpu_cham_tile_register( &handleAout, -1, tileA, ChamComplexHalf ); @@ -88,14 +106,22 @@ void INSERT_TASK_zgered( const RUNTIME_option_t *options, #endif 0); - starpu_data_unregister_submit( *handleAin ); + starpu_data_unregister_no_coherency( *handleAin ); *handleAin = handleAout; tileA->flttype = ChamComplexHalf; -#if defined(CHAMELEON_USE_MPI) starpu_mpi_data_register( handleAout, tag, tileA->rank ); -#endif - return; } + else + { + tileA->flttype = ChamComplexHalf; + if (*handleAin != 
NULL) + { + starpu_data_unregister_no_coherency(*handleAin); + *handleAin = NULL; + } + } + return; + } #endif #endif @@ -108,33 +134,44 @@ void INSERT_TASK_zgered( const RUNTIME_option_t *options, #else u_low = 1e-8; #endif - if ( Anorm < (threshold / u_low) ) { + if ( lnorm < (threshold / u_low) ) + { #if defined(CHAMELEON_DEBUG_GERED) fprintf( stderr, "[%2d] Convert the tile ( %d, %d ) to single precision\n", A->myrank, Am, An ); #endif - starpu_cham_tile_register( &handleAout, -1, tileA, ChamComplexFloat ); + if (A->myrank == tileA->rank) + { + starpu_cham_tile_register( &handleAout, -1, tileA, ChamComplexFloat ); - rt_shm_starpu_insert_task( - &cl_zlag2c, - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_R, *handleAin, - STARPU_W, handleAout, - STARPU_PRIORITY, options->priority, - STARPU_EXECUTE_ON_WORKER, options->workerid, + rt_shm_starpu_insert_task( + &cl_zlag2c, + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_R, *handleAin, + STARPU_W, handleAout, + STARPU_PRIORITY, options->priority, + STARPU_EXECUTE_ON_WORKER, options->workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zlag2c", + STARPU_NAME, "zlag2c", #endif - 0); + 0); - starpu_data_unregister_submit( *handleAin ); - *handleAin = handleAout; - tileA->flttype = ChamComplexFloat; -#if defined(CHAMELEON_USE_MPI) - starpu_mpi_data_register( *handleAin, tag, tileA->rank ); -#endif + starpu_data_unregister_no_coherency( *handleAin ); + *handleAin = handleAout; + tileA->flttype = ChamComplexFloat; + starpu_mpi_data_register( *handleAin, tag, tileA->rank ); + } + else + { + tileA->flttype = ChamComplexFloat; + if (*handleAin != NULL) + { + starpu_data_unregister_no_coherency(*handleAin); + *handleAin = NULL; + } + } return; } } diff --git a/runtime/starpu/codelets/codelet_zgerst.c b/runtime/starpu/codelets/codelet_zgerst.c index ba6b2bb83..9a5c825f1 100644 --- a/runtime/starpu/codelets/codelet_zgerst.c +++ 
b/runtime/starpu/codelets/codelet_zgerst.c @@ -11,6 +11,7 @@ * * @version 1.3.0 * @author Mathieu Faverge + * @author Ana Hourcau * @date 2024-07-17 * @precisions normal z -> d * @@ -26,20 +27,11 @@ void INSERT_TASK_zgerst( const RUNTIME_option_t *options, { CHAM_tile_t *tileA; int64_t mm, nn; -#if defined(CHAMELEON_USE_MPI) - int tag; -#endif + int tag = -1; starpu_data_handle_t *handleAin; starpu_data_handle_t handleAout; - CHAMELEON_BEGIN_ACCESS_DECLARATION; - CHAMELEON_ACCESS_RW(A, Am, An); - CHAMELEON_END_ACCESS_DECLARATION; - tileA = A->get_blktile( A, Am, An ); - if ( tileA->flttype == ChamComplexDouble ) { - return; - } /* Get the Input handle */ mm = Am + (A->i / A->mb); @@ -47,7 +39,36 @@ void INSERT_TASK_zgerst( const RUNTIME_option_t *options, handleAin = A->schedopt; handleAin += ((int64_t)A->lmt) * nn + mm; - assert( *handleAin != NULL ); + if ( tileA->flttype == ChamComplexDouble ) { + starpu_data_handle_t *copy = handleAin; + + /* Remove first copy */ + copy += ((int64_t)A->lmt * (int64_t)A->lnt); + if ( *copy ) { + starpu_data_unregister_no_coherency( *copy ); + *copy = NULL; + } + + /* Remove second copy */ + copy += ((int64_t)A->lmt * (int64_t)A->lnt); + if ( *copy ) { + starpu_data_unregister_no_coherency( *copy ); + *copy = NULL; + } + + return; + } + + if (A->myrank != tileA->rank) + { + tileA->flttype = ChamComplexDouble; + if (*handleAin != NULL) + { + starpu_data_unregister_no_coherency(*handleAin); + *handleAin = NULL; + } + return; + } #if defined(CHAMELEON_USE_MPI) tag = starpu_mpi_data_get_tag( *handleAin ); @@ -62,6 +83,7 @@ void INSERT_TASK_zgerst( const RUNTIME_option_t *options, * Restore from half precision */ case ChamComplexHalf: + assert( options->withcuda ); #if defined(CHAMELEON_DEBUG_GERED) fprintf( stderr, "[%2d] Convert back the tile ( %d, %d ) from half precision\n", @@ -107,10 +129,8 @@ void INSERT_TASK_zgerst( const RUNTIME_option_t *options, fprintf( stderr, "ERROR: Unknonw input datatype" ); } - 
starpu_data_unregister_submit( *handleAin ); + starpu_data_unregister_no_coherency( *handleAin ); *handleAin = handleAout; tileA->flttype = ChamComplexDouble; -#if defined(CHAMELEON_USE_MPI) starpu_mpi_data_register( handleAout, tag, tileA->rank ); -#endif } diff --git a/runtime/starpu/include/cham_tile_interface.h b/runtime/starpu/include/cham_tile_interface.h index 8abc48abc..5dc7672d8 100644 --- a/runtime/starpu/include/cham_tile_interface.h +++ b/runtime/starpu/include/cham_tile_interface.h @@ -9,10 +9,11 @@ * * @brief Header to describe the Chameleon tile interface in StarPU * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge * @author Gwenole Lucas - * @date 2022-02-22 + * @author Ana Hourcau + * @date 2024-07-17 * */ #ifndef _cham_tile_interface_h_ @@ -53,6 +54,20 @@ cti_interface_get( starpu_cham_tile_interface_t *interface ) return &(interface->tile); } +static inline CHAM_tile_t * +cti_handle_get( starpu_data_handle_t handle ) +{ + starpu_cham_tile_interface_t *cham_tile_interface = (starpu_cham_tile_interface_t *) + starpu_data_get_interface_on_node( handle, STARPU_MAIN_RAM ); + +#ifdef STARPU_DEBUG + STARPU_ASSERT_MSG( cham_tile_interface->id == STARPU_CHAM_TILE_INTERFACE_ID, + "Error. The given data is not a cham_tile." 
); +#endif + + return &(cham_tile_interface->tile); +} + void starpu_cham_tile_interface_init(); void starpu_cham_tile_interface_fini(); diff --git a/runtime/starpu/interface/cham_tile_interface.c b/runtime/starpu/interface/cham_tile_interface.c index 352d7bd28..89904548b 100644 --- a/runtime/starpu/interface/cham_tile_interface.c +++ b/runtime/starpu/interface/cham_tile_interface.c @@ -13,7 +13,9 @@ * @author Mathieu Faverge * @author Gwenole Lucas * @author Samuel Thibault - * @date 2023-08-22 + * @author Abel Calluaud + * @author Ana Hourcau + * @date 2024-07-17 * */ #include "chameleon_starpu.h" @@ -77,20 +79,6 @@ cti_get_hmat_required_size( starpu_cham_tile_interface_t *cham_tile_interface _ } #endif -static inline CHAM_tile_t * -cti_handle_get( starpu_data_handle_t handle ) -{ - starpu_cham_tile_interface_t *cham_tile_interface = (starpu_cham_tile_interface_t *) - starpu_data_get_interface_on_node( handle, STARPU_MAIN_RAM ); - -#ifdef STARPU_DEBUG - STARPU_ASSERT_MSG( cham_tile_interface->id == STARPU_CHAM_TILE_INTERFACE_ID, - "Error. The given data is not a cham_tile." ); -#endif - - return &(cham_tile_interface->tile); -} - int cti_handle_get_m( starpu_data_handle_t handle ) { -- GitLab