diff --git a/compute/pzgered.c b/compute/pzgered.c index c1624db1ba06c0f43a7d84ea485a089d33965ade..1051ee91f45370bba6cefbdb21d3b17a6862c99f 100644 --- a/compute/pzgered.c +++ b/compute/pzgered.c @@ -13,7 +13,8 @@ * * @version 1.3.0 * @author Mathieu Faverge - * @date 2023-07-06 + * @author Ana Hourcau + * @date 2024-07-17 * @precisions normal z -> z d * */ @@ -28,8 +29,8 @@ static inline void chameleon_pzgered_frb( cham_uplo_t uplo, - CHAM_desc_t *A, CHAM_desc_t *Wnorm, CHAM_desc_t *Welt, - RUNTIME_option_t *options ) + CHAM_desc_t *A, CHAM_desc_t *Wnorm, CHAM_desc_t *Welt, + RUNTIME_option_t *options ) { double alpha = 1.0; double beta = 0.0; @@ -233,21 +234,17 @@ void chameleon_pzgered( cham_uplo_t uplo, double prec, CHAM_desc_t *A, for(n = nmin; n < nmax; n++) { CHAM_tile_t *tile = A->get_blktile( A, m, n ); - if ( tile->rank == A->myrank ) { - int tempnn = ( n == (A->nt-1) ) ? A->n - n * A->nb : A->nb; - - /* Get the frobenius norm of the tile A( m, n ) */ - lnorm = ((double*)((Wcol.get_blktile( &Wcol, m, n ))->mat))[0]; - - /* - * u_{high} = 1e-16 (later should be application accuraccy) - * u_{low} = 1e-8 - * ||A_{i,j}||_F < u_{high} * || A ||_F / (nt * u_{low}) - * ||A_{i,j}||_F < threshold / u_{low} - */ - INSERT_TASK_zgered( &options, threshold, lnorm, - tempmm, tempnn, A( m, n ) ); - } + + int tempnn = ( n == (A->nt-1) ) ? 
A->n - n * A->nb : A->nb; + + /* + * u_{high} = 1e-16 (later should be application accuracy) + * u_{low} = 1e-8 + * ||A_{i,j}||_F < u_{high} * || A ||_F / (nt * u_{low}) + * ||A_{i,j}||_F < threshold / u_{low} + */ + INSERT_TASK_zgered( &options, threshold, + tempmm, tempnn, A( m, n ), W( &Wcol, m, n ) ); } } diff --git a/include/chameleon/chameleon_z.h b/include/chameleon/chameleon_z.h index 5d667cca39e1fe42eb61d29257ac45e38e2f3075..75e2148435d8fb5e934b437620cf7f1701a5fcf6 100644 --- a/include/chameleon/chameleon_z.h +++ b/include/chameleon/chameleon_z.h @@ -23,7 +23,8 @@ * @author Florent Pruvost * @author Alycia Lisito * @author Matthieu Kuhn - * @date 2024-04-03 + * @author Ana Hourcau + * @date 2024-07-17 * @precisions normal z -> c d s * */ @@ -168,10 +169,8 @@ int CHAMELEON_zplrnk_Tile(int K, CHAM_desc_t *C, unsigned long long int seedA, u int CHAMELEON_zpoinv_Tile(cham_uplo_t uplo, CHAM_desc_t *A); int CHAMELEON_zposv_Tile(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B); int CHAMELEON_zpotrf_Tile(cham_uplo_t uplo, CHAM_desc_t *A); -#if defined(PRECISION_z) || defined(PRECISION_d) int CHAMELEON_zgered_Tile( cham_uplo_t uplo, double prec, CHAM_desc_t *A ); int CHAMELEON_zgerst_Tile( cham_uplo_t uplo, CHAM_desc_t *A ); -#endif int CHAMELEON_zsytrf_Tile(cham_uplo_t uplo, CHAM_desc_t *A); int CHAMELEON_zpotri_Tile(cham_uplo_t uplo, CHAM_desc_t *A); int CHAMELEON_zpotrimm_Tile(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *C); @@ -249,10 +248,8 @@ int CHAMELEON_zplrnk_Tile_Async(int K, CHAM_desc_t *C, unsigned long long int se int CHAMELEON_zpoinv_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zposv_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zpotrf_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); -#if defined(PRECISION_z) || 
defined(PRECISION_d) int CHAMELEON_zgered_Tile_Async(cham_uplo_t uplo, double prec, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgerst_Tile_Async( cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); -#endif int CHAMELEON_zsytrf_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zpotri_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zpotrimm_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index b330ec7d840bb3136f8575e240bed5b8a9bc5847..795ebd2d186f9c1e88a44ab6312d40583b1a4d5d 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -24,7 +24,8 @@ * @author Alycia Lisito * @author Romain Peressoni * @author Matthieu Kuhn - * @date 2023-09-11 + * @author Ana Hourcau + * @date 2024-07-17 * @precisions normal z -> c d s * */ @@ -79,8 +80,9 @@ void INSERT_TASK_zgeqrt( const RUNTIME_option_t *options, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *T, int Tm, int Tn ); void INSERT_TASK_zgered( const RUNTIME_option_t *options, - double threshold, double Anorm, int m, int n, - const CHAM_desc_t *A, int Am, int An ); + double threshold, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *Wnorm, int Wnm, int Wnn ); void INSERT_TASK_zgerst( const RUNTIME_option_t *options, int m, int n, const CHAM_desc_t *A, int Am, int An ); diff --git a/runtime/openmp/codelets/codelet_zgered.c b/runtime/openmp/codelets/codelet_zgered.c index 19e6f9118969c74540a6a729af02f56f47ea47c6..20b0c191205ec8fe333c556943bac9d520ddc5f1 100644 --- a/runtime/openmp/codelets/codelet_zgered.c +++ b/runtime/openmp/codelets/codelet_zgered.c @@ -11,24 +11,28 @@ * * @version 1.3.0 * 
@author Mathieu Faverge - * @date 2023-07-06 + * @author Ana Hourcau + * @date 2024-07-17 * @precisions normal z -> d * */ #include "chameleon_openmp.h" void INSERT_TASK_zgered( const RUNTIME_option_t *options, - double threshold, double Anorm, int m, int n, - const CHAM_desc_t *A, int Am, int An ) + double threshold, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *Wnorm, int Wnm, int Wnn ) { fprintf( stderr, "WARNING: gered kernel is not available with OpenMP\n" ); (void)options; (void)threshold; - (void)Anorm; (void)m; (void)n; (void)A; (void)Am; (void)An; + (void)Wnorm; + (void)Wnm; + (void)Wnn; } diff --git a/runtime/parsec/codelets/codelet_zgered.c b/runtime/parsec/codelets/codelet_zgered.c index dcc20888b04936244f2e6ddade9ad3932a3b8413..338a7b5ff34c0f2a3f7b7bab193bd1aa4c049bd5 100644 --- a/runtime/parsec/codelets/codelet_zgered.c +++ b/runtime/parsec/codelets/codelet_zgered.c @@ -11,24 +11,28 @@ * * @version 1.3.0 * @author Mathieu Faverge - * @date 2023-07-06 + * @author Ana Hourcau + * @date 2024-07-17 * @precisions normal z -> d * */ #include "chameleon_parsec.h" void INSERT_TASK_zgered( const RUNTIME_option_t *options, - double threshold, double Anorm, int m, int n, - const CHAM_desc_t *A, int Am, int An ) + double threshold, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *Wnorm, int Wnm, int Wnn ) { fprintf( stderr, "WARNING: gered kernel is not available with PaRSEC\n" ); (void)options; (void)threshold; - (void)Anorm; (void)m; (void)n; (void)A; (void)Am; (void)An; + (void)Wnorm; + (void)Wnm; + (void)Wnn; } diff --git a/runtime/quark/codelets/codelet_zgered.c b/runtime/quark/codelets/codelet_zgered.c index 773bd7cd94dd1e20f57ef0c0f577a5bc98d68d33..b07695f70e0ebb9a10669965efa2917383f693d4 100644 --- a/runtime/quark/codelets/codelet_zgered.c +++ b/runtime/quark/codelets/codelet_zgered.c @@ -11,24 +11,28 @@ * * @version 1.3.0 * @author Mathieu Faverge - * @date 2023-07-06 + * @author Ana Hourcau + * 
@date 2024-07-17 * @precisions normal z -> d * */ #include "chameleon_quark.h" void INSERT_TASK_zgered( const RUNTIME_option_t *options, - double threshold, double Anorm, int m, int n, - const CHAM_desc_t *A, int Am, int An ) + double threshold, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *Wnorm, int Wnm, int Wnn ) { fprintf( stderr, "WARNING: gered kernel is not available with Quark\n" ); (void)options; (void)threshold; - (void)Anorm; (void)m; (void)n; (void)A; (void)Am; (void)An; + (void)Wnorm; + (void)Wnm; + (void)Wnn; } diff --git a/runtime/starpu/codelets/codelet_zgered.c b/runtime/starpu/codelets/codelet_zgered.c index 11b002cea3900c7e6c4ac792dd830e5a809b5be3..fe1c4927ef525aa24dd53a6b83f22d3c5e9959f4 100644 --- a/runtime/starpu/codelets/codelet_zgered.c +++ b/runtime/starpu/codelets/codelet_zgered.c @@ -13,6 +13,7 @@ * * @version 1.3.0 * @author Mathieu Faverge + * @author Ana Hourcau * @date 2024-07-17 * @precisions normal z -> d * @@ -23,21 +24,35 @@ #include "runtime_codelet_z.h" void INSERT_TASK_zgered( const RUNTIME_option_t *options, - double threshold, double Anorm, int m, int n, - const CHAM_desc_t *A, int Am, int An ) + double threshold, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *Wnorm, int Wnm, int Wnn ) { CHAM_tile_t *tileA; - double u_low; + double u_low, lnorm; int64_t mm, nn; -#if defined(CHAMELEON_USE_MPI) - int tag; -#endif + int tag = -1; starpu_data_handle_t *handleAin; starpu_data_handle_t handleAout; - CHAMELEON_BEGIN_ACCESS_DECLARATION; - CHAMELEON_ACCESS_RW(A, Am, An); - CHAMELEON_END_ACCESS_DECLARATION; + /* + * Collect the norm of the tile on all nodes to do the data conversion + * if owned, and only the new data registration if not owned + */ + { + starpu_data_handle_t handleNorm = RTBLKADDR( Wnorm, ChamDouble, Wnm, Wnn ); + CHAM_tile_t *tileNorm; + +#if defined(CHAMELEON_USE_MPI) + starpu_mpi_get_data_on_all_nodes_detached( options->sequence->comm, handleNorm ); 
+#endif + starpu_data_acquire( handleNorm, STARPU_R ); + + tileNorm = cti_handle_get( handleNorm ); + lnorm = ((double *)(tileNorm->mat))[0]; + + starpu_data_release( handleNorm ); + } /* Get the Input handle */ mm = Am + (A->i / A->mb); @@ -45,8 +60,6 @@ void INSERT_TASK_zgered( const RUNTIME_option_t *options, handleAin = A->schedopt; handleAin += ((int64_t)A->lmt) * nn + mm; - assert( *handleAin != NULL ); - /* * Lets convert the tile precision based on the following criteria: * @@ -54,10 +67,14 @@ void INSERT_TASK_zgered( const RUNTIME_option_t *options, * ||A_{i,j}||_F < u_{high} * || A ||_F / nt * 1/ u_{low} * ||A_{i,j}||_F < threshold / u_{low} */ - tileA = A->get_blktile( A, Am, An ); + #if defined(CHAMELEON_USE_MPI) - tag = starpu_mpi_data_get_tag( *handleAin ); + /* Backup the MPI tag */ + if (A->myrank == tileA->rank) + { + tag = starpu_mpi_data_get_tag( *handleAin ); + } #endif /* defined(CHAMELEON_USE_MPI) */ #if defined(CHAMELEON_USE_CUDA) && (CUDA_VERSION >= 7500) @@ -67,11 +84,12 @@ void INSERT_TASK_zgered( const RUNTIME_option_t *options, * Check for half precision */ u_low = 1.e-4; - if ( Anorm < (threshold / u_low) ) { + if ( lnorm < (threshold / u_low) ) + { #if defined(CHAMELEON_DEBUG_GERED) fprintf( stderr, "[%2d] Convert the tile ( %d, %d ) to half precision\n", - A->myrank, Am, An ); + A->myrank, Am, An); #endif starpu_cham_tile_register( &handleAout, -1, tileA, ChamComplexHalf ); @@ -88,14 +106,22 @@ void INSERT_TASK_zgered( const RUNTIME_option_t *options, #endif 0); - starpu_data_unregister_submit( *handleAin ); + starpu_data_unregister_no_coherency( *handleAin ); *handleAin = handleAout; tileA->flttype = ChamComplexHalf; -#if defined(CHAMELEON_USE_MPI) starpu_mpi_data_register( handleAout, tag, tileA->rank ); -#endif - return; } + else + { + tileA->flttype = ChamComplexHalf; + if (*handleAin != NULL) + { + starpu_data_unregister_no_coherency(*handleAin); + *handleAin = NULL; + } + } + return; + } #endif #endif @@ -108,33 +134,44 @@ void 
INSERT_TASK_zgered( const RUNTIME_option_t *options, #else u_low = 1e-8; #endif - if ( Anorm < (threshold / u_low) ) { + if ( lnorm < (threshold / u_low) ) + { #if defined(CHAMELEON_DEBUG_GERED) fprintf( stderr, "[%2d] Convert the tile ( %d, %d ) to single precision\n", A->myrank, Am, An ); #endif - starpu_cham_tile_register( &handleAout, -1, tileA, ChamComplexFloat ); + if (A->myrank == tileA->rank) + { + starpu_cham_tile_register( &handleAout, -1, tileA, ChamComplexFloat ); - rt_shm_starpu_insert_task( - &cl_zlag2c, - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_R, *handleAin, - STARPU_W, handleAout, - STARPU_PRIORITY, options->priority, - STARPU_EXECUTE_ON_WORKER, options->workerid, + rt_shm_starpu_insert_task( + &cl_zlag2c, + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_R, *handleAin, + STARPU_W, handleAout, + STARPU_PRIORITY, options->priority, + STARPU_EXECUTE_ON_WORKER, options->workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zlag2c", + STARPU_NAME, "zlag2c", #endif - 0); + 0); - starpu_data_unregister_submit( *handleAin ); - *handleAin = handleAout; - tileA->flttype = ChamComplexFloat; -#if defined(CHAMELEON_USE_MPI) - starpu_mpi_data_register( *handleAin, tag, tileA->rank ); -#endif + starpu_data_unregister_no_coherency( *handleAin ); + *handleAin = handleAout; + tileA->flttype = ChamComplexFloat; + starpu_mpi_data_register( *handleAin, tag, tileA->rank ); + } + else + { + tileA->flttype = ChamComplexFloat; + if (*handleAin != NULL) + { + starpu_data_unregister_no_coherency(*handleAin); + *handleAin = NULL; + } + } return; } } diff --git a/runtime/starpu/codelets/codelet_zgerst.c b/runtime/starpu/codelets/codelet_zgerst.c index ba6b2bb8332c367c5191b8cf052a67844cfdc695..9a5c825f149c171dd2ad14f812d6bab7ed926546 100644 --- a/runtime/starpu/codelets/codelet_zgerst.c +++ b/runtime/starpu/codelets/codelet_zgerst.c @@ -11,6 +11,7 @@ * * @version 1.3.0 * @author Mathieu Faverge + * @author 
Ana Hourcau * @date 2024-07-17 * @precisions normal z -> d * @@ -26,20 +27,11 @@ void INSERT_TASK_zgerst( const RUNTIME_option_t *options, { CHAM_tile_t *tileA; int64_t mm, nn; -#if defined(CHAMELEON_USE_MPI) - int tag; -#endif + int tag = -1; starpu_data_handle_t *handleAin; starpu_data_handle_t handleAout; - CHAMELEON_BEGIN_ACCESS_DECLARATION; - CHAMELEON_ACCESS_RW(A, Am, An); - CHAMELEON_END_ACCESS_DECLARATION; - tileA = A->get_blktile( A, Am, An ); - if ( tileA->flttype == ChamComplexDouble ) { - return; - } /* Get the Input handle */ mm = Am + (A->i / A->mb); @@ -47,7 +39,36 @@ void INSERT_TASK_zgerst( const RUNTIME_option_t *options, handleAin = A->schedopt; handleAin += ((int64_t)A->lmt) * nn + mm; - assert( *handleAin != NULL ); + if ( tileA->flttype == ChamComplexDouble ) { + starpu_data_handle_t *copy = handleAin; + + /* Remove first copy */ + copy += ((int64_t)A->lmt * (int64_t)A->lnt); + if ( *copy ) { + starpu_data_unregister_no_coherency( *copy ); + *copy = NULL; + } + + /* Remove second copy */ + copy += ((int64_t)A->lmt * (int64_t)A->lnt); + if ( *copy ) { + starpu_data_unregister_no_coherency( *copy ); + *copy = NULL; + } + + return; + } + + if (A->myrank != tileA->rank) + { + tileA->flttype = ChamComplexDouble; + if (*handleAin != NULL) + { + starpu_data_unregister_no_coherency(*handleAin); + *handleAin = NULL; + } + return; + } #if defined(CHAMELEON_USE_MPI) tag = starpu_mpi_data_get_tag( *handleAin ); @@ -62,6 +83,7 @@ void INSERT_TASK_zgerst( const RUNTIME_option_t *options, * Restore from half precision */ case ChamComplexHalf: + assert( options->withcuda ); #if defined(CHAMELEON_DEBUG_GERED) fprintf( stderr, "[%2d] Convert back the tile ( %d, %d ) from half precision\n", @@ -107,10 +129,8 @@ void INSERT_TASK_zgerst( const RUNTIME_option_t *options, fprintf( stderr, "ERROR: Unknonw input datatype" ); } - starpu_data_unregister_submit( *handleAin ); + starpu_data_unregister_no_coherency( *handleAin ); *handleAin = handleAout; tileA->flttype = 
ChamComplexDouble; -#if defined(CHAMELEON_USE_MPI) starpu_mpi_data_register( handleAout, tag, tileA->rank ); -#endif } diff --git a/runtime/starpu/include/cham_tile_interface.h b/runtime/starpu/include/cham_tile_interface.h index 8abc48abcabd665bec47975bef768bc21850d8b4..5dc7672d8c90c4127cdf956f0e1bb8d4e718634a 100644 --- a/runtime/starpu/include/cham_tile_interface.h +++ b/runtime/starpu/include/cham_tile_interface.h @@ -9,10 +9,11 @@ * * @brief Header to describe the Chameleon tile interface in StarPU * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge * @author Gwenole Lucas - * @date 2022-02-22 + * @author Ana Hourcau + * @date 2024-07-17 * */ #ifndef _cham_tile_interface_h_ @@ -53,6 +54,20 @@ cti_interface_get( starpu_cham_tile_interface_t *interface ) return &(interface->tile); } +static inline CHAM_tile_t * +cti_handle_get( starpu_data_handle_t handle ) +{ + starpu_cham_tile_interface_t *cham_tile_interface = (starpu_cham_tile_interface_t *) + starpu_data_get_interface_on_node( handle, STARPU_MAIN_RAM ); + +#ifdef STARPU_DEBUG + STARPU_ASSERT_MSG( cham_tile_interface->id == STARPU_CHAM_TILE_INTERFACE_ID, + "Error. The given data is not a cham_tile." 
); +#endif + + return &(cham_tile_interface->tile); +} + void starpu_cham_tile_interface_init(); void starpu_cham_tile_interface_fini(); diff --git a/runtime/starpu/interface/cham_tile_interface.c b/runtime/starpu/interface/cham_tile_interface.c index 352d7bd288833a3ffcb0e0d04f7cf06a6f96fe2f..89904548b70ed8ecced9b07fe76c9d0c541e66fb 100644 --- a/runtime/starpu/interface/cham_tile_interface.c +++ b/runtime/starpu/interface/cham_tile_interface.c @@ -13,7 +13,9 @@ * @author Mathieu Faverge * @author Gwenole Lucas * @author Samuel Thibault - * @date 2023-08-22 + * @author Abel Calluaud + * @author Ana Hourcau + * @date 2024-07-17 * */ #include "chameleon_starpu.h" @@ -77,20 +79,6 @@ cti_get_hmat_required_size( starpu_cham_tile_interface_t *cham_tile_interface _ } #endif -static inline CHAM_tile_t * -cti_handle_get( starpu_data_handle_t handle ) -{ - starpu_cham_tile_interface_t *cham_tile_interface = (starpu_cham_tile_interface_t *) - starpu_data_get_interface_on_node( handle, STARPU_MAIN_RAM ); - -#ifdef STARPU_DEBUG - STARPU_ASSERT_MSG( cham_tile_interface->id == STARPU_CHAM_TILE_INTERFACE_ID, - "Error. The given data is not a cham_tile." ); -#endif - - return &(cham_tile_interface->tile); -} - int cti_handle_get_m( starpu_data_handle_t handle ) {