diff --git a/cmake_modules/local_subs.py b/cmake_modules/local_subs.py index b90df480a365381097b434ad2d09ad414131b9af..afd17c16f2a60d1b5cb35616151072872fdb3de2 100644 --- a/cmake_modules/local_subs.py +++ b/cmake_modules/local_subs.py @@ -51,6 +51,7 @@ _extra_blas = [ ('', 'sprint', 'dprint', 'cprint', 'zprint' ), ('', 'sgered', 'dgered', 'cgered', 'zgered' ), ('', 'sgerst', 'dgerst', 'cgerst', 'zgerst' ), + ('', 'sipiv_allreduce', 'dipiv_allreduce', 'cipiv_allreduce', 'zipiv_allreduce' ), ] _extra_BLAS = [ [ x.upper() for x in row ] for row in _extra_blas ] diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c index e6f3d107a911b0a29ed40fffe204f7f84ad5d259..a56e1c9a1c5220acf8ae09db0715d4d331e99ffe 100644 --- a/compute/pzgetrf.c +++ b/compute/pzgetrf.c @@ -16,6 +16,7 @@ * @author Mathieu Faverge * @author Emmanuel Agullo * @author Matthieu Kuhn + * @author Alycia Lisito * @date 2024-03-16 * @precisions normal z -> s d c * @@ -146,15 +147,13 @@ chameleon_pzgetrf_panel_facto_percol( struct chameleon_pzgetrf_s *ws, ipiv ); } - if ( h < minmn ) { - /* Reduce globally (between MPI processes) */ - INSERT_TASK_ipiv_reducek( options, ipiv, k, h ); - } + /* Reduce globally (between MPI processes) */ + INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, h, tempkn ); } /* Flush temporary data used for the pivoting */ INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k ); - RUNTIME_ipiv_flushk( options->sequence, ipiv, k ); + RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank ); } /* @@ -195,17 +194,14 @@ chameleon_pzgetrf_panel_facto_percol_batched( struct chameleon_pzgetrf_s *ws, } INSERT_TASK_zgetrf_panel_offdiag_batched_flush( options, A, k, clargs, ipiv ); - if ( h < minmn ) { - /* Reduce globally (between MPI processes) */ - INSERT_TASK_ipiv_reducek( options, ipiv, k, h ); - } + INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, h, tempkn ); } free( clargs ); /* Flush temporary data used for the pivoting */ INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k ); - RUNTIME_ipiv_flushk( options->sequence, ipiv, k ); + RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank ); } static inline void @@ -218,6 +214,10 @@ chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws, int m, h, b, nbblock; int tempkm, tempkn, tempmm, minmn; + if ( ! ws->involved ) { + return; + } + tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; minmn = chameleon_min( tempkm, tempkn ); @@ -233,7 +233,7 @@ chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws, int hmax = b == nbblock-1 ? minmn + 1 - b * ws->ib : ws->ib; for (h=0; h<hmax; h++){ - int j = h + b * ws->ib; + int j = h + b * ws->ib; INSERT_TASK_zgetrf_blocked_diag( options, @@ -250,26 +250,24 @@ chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws, ipiv ); } - if ( (b < (nbblock-1)) && (h == hmax-1) ) { + assert( j <= minmn ); + /* Reduce globally (between MPI processes) */ + INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, j, tempkn ); + + if ( ( b < (nbblock-1) ) && ( h == hmax-1 ) ) { INSERT_TASK_zgetrf_blocked_trsm( options, - ws->ib, tempkn, b * ws->ib + hmax, ws->ib, + ws->ib, tempkn, j+1, ws->ib, Up(k, k), ipiv ); } - - assert( j<= minmn ); - if ( j < minmn ) { - /* Reduce globally (between MPI processes) */ - INSERT_TASK_ipiv_reducek( options, ipiv, k, j ); - } } } RUNTIME_data_flush( options->sequence, Up(k, k) ); /* Flush temporary data used for the pivoting */ INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k ); - RUNTIME_ipiv_flushk( options->sequence, ipiv, k ); + RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank ); } /* @@ -284,8 +282,8 @@ chameleon_pzgetrf_panel_facto_blocked_batched( struct chameleon_pzgetrf_s *ws, { int m, h, b, nbblock, hmax, j; int tempkm, tempkn, tempmm, minmn; - void **clargs = malloc( sizeof(char *) * A->p ); - memset( clargs, 0, sizeof(char *) * A->p ); + void **clargs = malloc( sizeof(char *) ); + memset( clargs, 0, sizeof(char *) ); tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; @@ -306,10 +304,7 @@ chameleon_pzgetrf_panel_facto_blocked_batched( struct chameleon_pzgetrf_s *ws, for ( h = 0; h < hmax; h++ ) { j = h + b * ws->ib; - INSERT_TASK_zgetrf_panel_blocked_batched( options, tempkm, tempkn, j, k * A->mb, (void *)ws, - A(k, k), Up(k, k), clargs, ipiv ); - - for ( m = k + 1; m < A->mt; m++ ) { + for ( m = k; m < A->mt; m++ ) { tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb; INSERT_TASK_zgetrf_panel_blocked_batched( options, tempmm, tempkn, j, m * A->mb, (void *)ws, A(m, k), Up(k, k), clargs, ipiv ); @@ -317,6 +312,10 @@ chameleon_pzgetrf_panel_facto_blocked_batched( struct chameleon_pzgetrf_s *ws, INSERT_TASK_zgetrf_panel_blocked_batched_flush( options, A, k, Up(k, k), clargs, ipiv ); + assert( j <= minmn ); + /* Reduce globally (between MPI processes) */ + INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, j, tempkn ); + if ( (b < (nbblock-1)) && (h == hmax-1) ) { INSERT_TASK_zgetrf_blocked_trsm( options, @@ -324,12 +323,6 @@ chameleon_pzgetrf_panel_facto_blocked_batched( struct chameleon_pzgetrf_s *ws, Up(k, k), ipiv ); } - - assert( j <= minmn ); - if ( j < minmn ) { - /* Reduce globally (between MPI processes) */ - INSERT_TASK_ipiv_reducek( options, ipiv, k, j ); - } } } @@ -337,7 +330,7 @@ chameleon_pzgetrf_panel_facto_blocked_batched( struct chameleon_pzgetrf_s *ws, /* Flush temporary data used for the pivoting */ INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k ); - RUNTIME_ipiv_flushk( options->sequence, ipiv, k ); + RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank ); } static inline void @@ -347,6 +340,26 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws, int k, RUNTIME_option_t *options ) { +#if defined ( CHAMELEON_USE_MPI ) + int *proc_involved = malloc( sizeof( int ) * chameleon_min( A->p, A->mt - k) ); + int b; + + /* 2DBC only */ + ws->involved = 0; + for ( b = k; (b < A->mt) && ((b-k) < A->p); b ++ ) { + int rank = chameleon_getrankof_2d( A, b, k ); + proc_involved[ b-k ] = rank; + if ( rank == A->myrank ) { + ws->involved = 1; + } + } + ws->proc_involved = proc_involved; + if ( ws->involved == 0 ) { + free( proc_involved ); + return; + } +#endif + /* TODO: Should be replaced by a function pointer */ switch( ws->alg ) { case ChamGetrfNoPivPerColumn: @@ -354,7 +367,7 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws, break; case ChamGetrfPPivPerColumn: - if ( ws->batch_size > 1 ) { + if ( ws->batch_size > 0 ) { chameleon_pzgetrf_panel_facto_percol_batched( ws, A, ipiv, k, options ); } else { @@ -363,7 +376,7 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws, break; case ChamGetrfPPiv: - if ( ws->batch_size > 1 ) { + if ( ws->batch_size > 0 ) { chameleon_pzgetrf_panel_facto_blocked_batched( ws, A, ipiv, k, options ); } else { @@ -376,6 +389,9 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws, default: chameleon_pzgetrf_panel_facto_nopiv( ws, A, ipiv, k, options ); } +#if defined ( CHAMELEON_USE_MPI ) + free( proc_involved ); +#endif } /** @@ -503,7 +519,9 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, * block column k. */ options.forcesub = chameleon_involved_in_panelk_2dbc( A, k ); - chameleon_pzgetrf_panel_facto( ws, A, IPIV, k, &options ); + if ( chameleon_involved_in_panelk_2dbc( A, k ) ) { + chameleon_pzgetrf_panel_facto( ws, A, IPIV, k, &options ); + } options.forcesub = 0; for (n = k+1; n < A->nt; n++) { diff --git a/compute/zgetrf.c b/compute/zgetrf.c index a94a05551e554645e23311a20a0f1d2ef7794dd5..8fb6734d3e15fe2cc25fb9c1664db8bc9a0f6987 100644 --- a/compute/zgetrf.c +++ b/compute/zgetrf.c @@ -19,6 +19,8 @@ * @author Florent Pruvost * @author Matthieu Kuhn * @author Lionel Eyraud-Dubois + * @author Alycia Lisito + * @author Xavier Lacoste * @date 2024-03-16 * * @precisions normal z -> s d c @@ -88,15 +90,11 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A ) chameleon_cleanenv( algostr ); } - ws->batch_size = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE", 1 ); + ws->batch_size = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE", 0 ); if ( ws->batch_size > CHAMELEON_BATCH_SIZE ) { chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE value\n" ); ws->batch_size = CHAMELEON_BATCH_SIZE; } - if ( (ws->batch_size > 1) && (CHAMELEON_Comm_rank() > 1) ) { - chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE is unavailable in distributed, value forced to 1\n" ); - ws->batch_size = 1; - } /* Allocation of U for permutation of the panels */ if ( ws->alg == ChamGetrfNoPivPerColumn ) { @@ -300,7 +298,7 @@ CHAMELEON_zgetrf( int M, int N, CHAMELEON_Complex64_t *A, int LDA, int *IPIV ) if ( ( ws->alg == ChamGetrfPPivPerColumn ) || ( ws->alg == ChamGetrfPPiv ) ) { - chameleon_ipiv_destroy( &descIPIV ); + chameleon_ipiv_destroy( &descIPIV, &descAt ); } CHAMELEON_zgetrf_WS_Free( ws ); chameleon_ztile2lap_cleanup( chamctxt, &descAl, &descAt ); diff --git a/control/compute_z.h b/control/compute_z.h index 088e03140baff5b167727931a6fb9e6b7a1641f0..06c8854c1b05a7c1bffbca0fa8e615218363672d 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -48,6 +48,8 @@ struct chameleon_pzgetrf_s { int batch_size; /**< Batch size for the panel */ CHAM_desc_t U; CHAM_desc_t Up; + int *proc_involved; + int involved:1; }; /** diff --git a/control/descriptor.h b/control/descriptor.h index 306abe6c5d320076eac9ed7c06aa82d10926aa46..1e0315fae2c70cdec40052e49a58b47c32a46ec9 100644 --- a/control/descriptor.h +++ b/control/descriptor.h @@ -20,6 +20,7 @@ * @author Raphael Boucherie * @author Samuel Thibault * @author Lionel Eyraud-Dubois + * @author Alycia Lisito * @date 2023-08-22 * */ @@ -77,7 +78,7 @@ void chameleon_desc_destroy ( CHAM_desc_t *desc ); int chameleon_desc_check ( const CHAM_desc_t *desc ); int chameleon_ipiv_init( CHAM_ipiv_t *ipiv, const CHAM_desc_t *desc, void *data ); -void chameleon_ipiv_destroy( CHAM_ipiv_t *ipiv ); +void chameleon_ipiv_destroy( CHAM_ipiv_t *ipiv, const CHAM_desc_t *desc ); /** * Internal function to return address of block (m,n) with m,n = block indices diff --git a/control/descriptor_ipiv.c b/control/descriptor_ipiv.c index e9631909b89689df5498c29da368298d8753bc40..c3369b7a4126ea0b245eb73ed2d3b547f7f11523 100644 --- a/control/descriptor_ipiv.c +++ b/control/descriptor_ipiv.c @@ -12,6 +12,8 @@ * @version 1.3.0 * @author Mathieu Faverge * @author Matthieu Kuhn + * @author Alycia Lisito + * @author Florent Pruvost * @date 2024-03-16 * *** @@ -73,7 +75,7 @@ int chameleon_ipiv_init( CHAM_ipiv_t *ipiv, const CHAM_desc_t *desc, void *data ipiv->mt = chameleon_ceil( ipiv->m, ipiv->mb ); /* Create runtime specific structure like registering data */ - RUNTIME_ipiv_create( ipiv ); + RUNTIME_ipiv_create( ipiv, desc ); return rc; } @@ -91,9 +93,10 @@ int chameleon_ipiv_init( CHAM_ipiv_t *ipiv, const CHAM_desc_t *desc, void *data * The pointer to the ipiv descriptor to destroy. * */ -void chameleon_ipiv_destroy( CHAM_ipiv_t *ipiv ) +void chameleon_ipiv_destroy( CHAM_ipiv_t *ipiv, + const CHAM_desc_t *desc ) { - RUNTIME_ipiv_destroy( ipiv ); + RUNTIME_ipiv_destroy( ipiv, desc ); } /** @@ -162,7 +165,8 @@ int CHAMELEON_Ipiv_Create( CHAM_ipiv_t **ipivptr, const CHAM_desc_t *desc, void * @retval CHAMELEON_SUCCESS successful exit * */ -int CHAMELEON_Ipiv_Destroy(CHAM_ipiv_t **ipivptr) +int CHAMELEON_Ipiv_Destroy( CHAM_ipiv_t **ipivptr, + const CHAM_desc_t *desc ) { CHAM_context_t *chamctxt; CHAM_ipiv_t *ipiv; @@ -179,7 +183,7 @@ int CHAMELEON_Ipiv_Destroy(CHAM_ipiv_t **ipivptr) } ipiv = *ipivptr; - chameleon_ipiv_destroy( ipiv ); + chameleon_ipiv_destroy( ipiv, desc ); free(ipiv); *ipivptr = NULL; return CHAMELEON_SUCCESS; diff --git a/include/chameleon.h b/include/chameleon.h index f1d33549595e475ee4bf60514bd08689d5416b40..12c295a7732ef73f1a1fac421bb38be6f0cdd9ea 100644 --- a/include/chameleon.h +++ b/include/chameleon.h @@ -18,6 +18,8 @@ * @author Florent Pruvost * @author Philippe Virouleau * @author Lionel Eyraud-Dubois + * @author Alycia Lisito + * @author Loris Lucido * @date 2024-03-16 * */ @@ -214,11 +216,16 @@ int CHAMELEON_Recursive_Desc_Create( CHAM_desc_t **descptr, void *mat, cham_flt blkaddr_fct_t get_blkaddr, blkldd_fct_t get_blkldd, blkrankof_fct_t get_rankof, void* get_rankof_arg ); -int CHAMELEON_Ipiv_Create ( CHAM_ipiv_t **ipivptr, const CHAM_desc_t *desc, void *data ); -int CHAMELEON_Ipiv_Destroy( CHAM_ipiv_t **ipivptr ); +int CHAMELEON_Ipiv_Create ( CHAM_ipiv_t **ipivptr, + const CHAM_desc_t *desc, + void *data ); +int CHAMELEON_Ipiv_Destroy( CHAM_ipiv_t **ipivptr, + const CHAM_desc_t *desc ); int CHAMELEON_Ipiv_Flush ( const CHAM_ipiv_t *ipiv, const RUNTIME_sequence_t *sequence ); -int CHAMELEON_Ipiv_Gather( CHAM_ipiv_t *ipivdesc, int *ipiv, int root ); +int CHAMELEON_Ipiv_Gather( CHAM_ipiv_t *ipivdesc, + int *ipiv, + int root ); void CHAMELEON_Ipiv_Print ( const CHAM_ipiv_t *ipiv ); /** diff --git a/include/chameleon/runtime.h b/include/chameleon/runtime.h index e64390f6c2c16d3c6c730748be710075e6e70f21..52993c9a6a8130bc1727a74777511bd03a3f48f3 100644 --- a/include/chameleon/runtime.h +++ b/include/chameleon/runtime.h @@ -18,6 +18,7 @@ * @author Samuel Thibault * @author Philippe Swartvagher * @author Matthieu Kuhn + * @author Alycia Lisito * @date 2024-03-16 * */ @@ -717,8 +718,10 @@ void RUNTIME_ddisplay_oneprofile (cham_tasktype_t task); void RUNTIME_sdisplay_allprofile (); void RUNTIME_sdisplay_oneprofile (cham_tasktype_t task); -void RUNTIME_ipiv_create ( CHAM_ipiv_t *ipiv ); -void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv ); +void RUNTIME_ipiv_create ( CHAM_ipiv_t *ipiv, + const CHAM_desc_t *desc ); +void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv, + const CHAM_desc_t *desc ); void RUNTIME_ipiv_gather ( const RUNTIME_sequence_t *sequence, CHAM_ipiv_t *desc, int *ipiv, int node ); @@ -730,18 +733,18 @@ void RUNTIME_perm_flushk( const RUNTIME_sequence_t *sequence, const CHAM_ipiv_t *ipiv, int m ); void *RUNTIME_ipiv_getaddr ( const CHAM_ipiv_t *ipiv, int m ); -void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h ); -void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h ); +void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int k, int h ); +void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int k, int h ); void *RUNTIME_perm_getaddr ( const CHAM_ipiv_t *ipiv, int m ); void *RUNTIME_invp_getaddr ( const CHAM_ipiv_t *ipiv, int m ); static inline void * -RUNTIME_pivot_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) { +RUNTIME_pivot_getaddr( CHAM_ipiv_t *ipiv, int rank, int k, int h ) { if ( h%2 == 0 ) { - return RUNTIME_nextpiv_getaddr( ipiv, m, -1 ); + return RUNTIME_nextpiv_getaddr( ipiv, rank, k, h ); } else { - return RUNTIME_prevpiv_getaddr( ipiv, m, -1 ); + return RUNTIME_prevpiv_getaddr( ipiv, rank, k, h ); } } diff --git a/include/chameleon/tasks.h b/include/chameleon/tasks.h index aa21e99d8f85c82b9484da1aa7d599d995b66bb4..99d70dbade30332f9af8ce5397636f8023a10e24 100644 --- a/include/chameleon/tasks.h +++ b/include/chameleon/tasks.h @@ -16,6 +16,7 @@ * @author Cedric Augonnet * @author Florent Pruvost * @author Matthieu Kuhn + * @author Alycia Lisito * @date 2024-03-16 * */ @@ -165,7 +166,7 @@ void INSERT_TASK_hgemm( const RUNTIME_option_t *options, void INSERT_TASK_ipiv_init ( const RUNTIME_option_t *options, CHAM_ipiv_t *ipiv ); void INSERT_TASK_ipiv_reducek( const RUNTIME_option_t *options, - CHAM_ipiv_t *ws, int k, int h ); + CHAM_ipiv_t *ws, int k, int h, int rank ); void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options, int m0, int m, int k, const CHAM_ipiv_t *ipivdesc, int ipivk ); diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index 795ebd2d186f9c1e88a44ab6312d40583b1a4d5d..93a5f6e303c8b00076e78fbe7faf58fc59dfe4f7 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -562,4 +562,12 @@ void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options, CHAM_desc_t *U, int Um, int Un, CHAM_ipiv_t *ws ); +void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, + const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int *proc_involved, + int k, + int h, + int n ); + #endif /* _chameleon_tasks_z_h_ */ diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index f7203fbe500d517ea64251ea198600944ce9291c..08279345b7f5d95ba633f3143f7c1b39fe2d6352 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -24,6 +24,7 @@ # @author Florent Pruvost # @author Philippe Virouleau # @author Matthieu Kuhn +# @author Alycia Lisito # @date 2024-03-16 # ### @@ -73,6 +74,7 @@ set(CODELETS_ZSRC codelets/codelet_zhe2ge.c codelets/codelet_zherfb.c codelets/codelet_zhessq.c + codelets/codelet_zipiv_allreduce.c codelets/codelet_zlacpy.c codelets/codelet_zlange.c codelets/codelet_zlanhe.c diff --git a/runtime/openmp/codelets/codelet_ipiv.c b/runtime/openmp/codelets/codelet_ipiv.c index d6386bb58d09d584cc066371cb18bd6be3fad3b7..548d688fb10388791d18df0dbd495ee9e969a01a 100644 --- a/runtime/openmp/codelets/codelet_ipiv.c +++ b/runtime/openmp/codelets/codelet_ipiv.c @@ -28,13 +28,14 @@ void INSERT_TASK_ipiv_init( const RUNTIME_option_t *options, } void INSERT_TASK_ipiv_reducek( const RUNTIME_option_t *options, - CHAM_ipiv_t *ipiv, int k, int h ) + CHAM_ipiv_t *ipiv, int k, int h, int rank ) { assert( 0 ); (void)options; (void)ipiv; (void)k; (void)h; + (void)rank; } void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options, diff --git a/runtime/openmp/codelets/codelet_zipiv_allreduce.c b/runtime/openmp/codelets/codelet_zipiv_allreduce.c new file mode 100644 index 0000000000000000000000000000000000000000..b088283254cd64e1bada1628939436327b8a2789 --- /dev/null +++ b/runtime/openmp/codelets/codelet_zipiv_allreduce.c @@ -0,0 +1,35 @@ +/** + * + * @file openmp/codelet_zipiv_allreduce.c + * + * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon openmp codelets to do the reduction + * + * @version 1.3.0 + * @author Alycia Lisito + * @date 2024-06-11 + * @precisions normal z -> c d s + * + */ +#include "chameleon_openmp.h" + +void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, + const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int *proc_involved, + int k, + int h, + int n ) +{ + (void)A; + (void)options; + (void)ipiv; + (void)proc_involved; + (void)k; + (void)h; + (void)n; +} diff --git a/runtime/openmp/control/runtime_descriptor_ipiv.c b/runtime/openmp/control/runtime_descriptor_ipiv.c index 9514b6fd067af22b09d4c224e01eb0c87f4e4de1..3a727f01c3ae30eef51ee4273096b704d9e8e777 100644 --- a/runtime/openmp/control/runtime_descriptor_ipiv.c +++ b/runtime/openmp/control/runtime_descriptor_ipiv.c @@ -12,21 +12,27 @@ * @version 1.3.0 * @author Mathieu Faverge * @author Matthieu Kuhn + * @author Alycia Lisito + * @author Florent Pruvost * @date 2024-03-16 * */ #include "chameleon_openmp.h" -void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv ) +void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv, + const CHAM_desc_t *desc ) { assert( 0 ); (void)ipiv; + (void)desc; } -void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv ) +void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv, + const CHAM_desc_t *desc ) { assert( 0 ); (void)ipiv; + (void)desc; } void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m ) @@ -37,19 +43,21 @@ void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m ) return NULL; } -void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h ) +void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int m, int h ) { assert( 0 ); (void)ipiv; + (void)rank; (void)m; (void)h; return NULL; } -void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h ) +void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int m, int h ) { assert( 0 ); (void)ipiv; + (void)rank; (void)m; (void)h; return NULL; diff --git a/runtime/parsec/codelets/codelet_ipiv.c b/runtime/parsec/codelets/codelet_ipiv.c index b9ac7e05468ba805c97dac09f75ef1c37c63f928..46fee3ee85ac11a6a6cac20febfdd2f6ddde9712 100644 --- a/runtime/parsec/codelets/codelet_ipiv.c +++ b/runtime/parsec/codelets/codelet_ipiv.c @@ -28,13 +28,14 @@ void INSERT_TASK_ipiv_init( const RUNTIME_option_t *options, } void INSERT_TASK_ipiv_reducek( const RUNTIME_option_t *options, - CHAM_ipiv_t *ipiv, int k, int h ) + CHAM_ipiv_t *ipiv, int k, int h, int rank ) { assert( 0 ); (void)options; (void)ipiv; (void)k; (void)h; + (void)rank; } static inline int diff --git a/runtime/parsec/codelets/codelet_zipiv_allreduce.c b/runtime/parsec/codelets/codelet_zipiv_allreduce.c new file mode 100644 index 0000000000000000000000000000000000000000..75e0611647a464cad9c37e59a5619ebefaae19ed --- /dev/null +++ b/runtime/parsec/codelets/codelet_zipiv_allreduce.c @@ -0,0 +1,35 @@ +/** + * + * @file parsec/codelet_zipiv_allreduce.c + * + * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon parsec codelets to do the reduction + * + * @version 1.3.0 + * @author Alycia Lisito + * @date 2024-06-11 + * @precisions normal z -> c d s + * + */ +#include "chameleon_parsec.h" + +void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, + const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int *proc_involved, + int k, + int h, + int n ) +{ + (void)A; + (void)options; + (void)ipiv; + (void)proc_involved; + (void)k; + (void)h; + (void)n; +} diff --git a/runtime/parsec/control/runtime_descriptor_ipiv.c b/runtime/parsec/control/runtime_descriptor_ipiv.c index 53621950fff15975835e686fabc48c6fd3e7d9e4..6108199eb0fb89ffdb359afa48b725e79de86fdf 100644 --- a/runtime/parsec/control/runtime_descriptor_ipiv.c +++ b/runtime/parsec/control/runtime_descriptor_ipiv.c @@ -12,21 +12,27 @@ * @version 1.3.0 * @author Mathieu Faverge * @author Matthieu Kuhn + * @author Alycia Lisito + * @author Florent Pruvost * @date 2024-03-16 * */ #include "chameleon_parsec.h" -void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv ) +void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv, + const CHAM_desc_t *desc ) { assert( 0 ); (void)ipiv; + (void)desc; } -void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv ) +void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv, + const CHAM_desc_t *desc ) { assert( 0 ); (void)ipiv; + (void)desc; } void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m ) @@ -37,19 +43,21 @@ void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m ) return NULL; } -void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h ) +void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int m, int h ) { assert( 0 ); (void)ipiv; + (void)rank; (void)m; (void)h; return NULL; } -void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h ) +void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int m, int h ) { assert( 0 ); (void)ipiv; + (void)rank; (void)m; (void)h; return NULL; diff --git a/runtime/quark/codelets/codelet_ipiv.c b/runtime/quark/codelets/codelet_ipiv.c index ab982faf04523af148f8e8444a6469681a4a0ea6..5fc849b890a75f3447c425f1a458d11fd0c3df1c 100644 --- a/runtime/quark/codelets/codelet_ipiv.c +++ b/runtime/quark/codelets/codelet_ipiv.c @@ -28,13 +28,14 @@ void INSERT_TASK_ipiv_init( const RUNTIME_option_t *options, } void INSERT_TASK_ipiv_reducek( const RUNTIME_option_t *options, - CHAM_ipiv_t *ipiv, int k, int h ) + CHAM_ipiv_t *ipiv, int k, int h, int rank ) { assert( 0 ); (void)options; (void)ipiv; (void)k; (void)h; + (void)rank; } static inline void diff --git a/runtime/quark/codelets/codelet_zipiv_allreduce.c b/runtime/quark/codelets/codelet_zipiv_allreduce.c new file mode 100644 index 0000000000000000000000000000000000000000..e88269e931f3f210282a1382d44a6ff9516c7453 --- /dev/null +++ b/runtime/quark/codelets/codelet_zipiv_allreduce.c @@ -0,0 +1,35 @@ +/** + * + * @file quark/codelet_zipiv_allreduce.c + * + * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon quark codelets to do the reduction + * + * @version 1.3.0 + * @author Alycia Lisito + * @date 2024-06-11 + * @precisions normal z -> c d s + * + */ +#include "chameleon_quark.h" + +void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, + const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int *proc_involved, + int k, + int h, + int n ) +{ + (void)A; + (void)options; + (void)ipiv; + (void)proc_involved; + (void)k; + (void)h; + (void)n; +} diff --git a/runtime/quark/control/runtime_descriptor_ipiv.c b/runtime/quark/control/runtime_descriptor_ipiv.c index 9edd6f041c266cbd6c51cea48c359d283f3f619a..f5fa28fe5a6246a3e91476d574f9e771d162d40f 100644 --- a/runtime/quark/control/runtime_descriptor_ipiv.c +++ b/runtime/quark/control/runtime_descriptor_ipiv.c @@ -12,21 +12,27 @@ * @version 1.3.0 * @author Mathieu Faverge * @author Matthieu Kuhn + * @author Alycia Lisito + * @author Florent Pruvost * @date 2024-03-16 * */ #include "chameleon_quark.h" -void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv ) +void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv, + const CHAM_desc_t *desc ) { assert( 0 ); (void)ipiv; + (void)desc; } -void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv ) +void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv, + const CHAM_desc_t *desc ) { assert( 0 ); (void)ipiv; + (void)desc; } void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m ) @@ -37,19 +43,21 @@ void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m ) return NULL; } -void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h ) +void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int m, int h ) { assert( 0 ); (void)ipiv; + (void)rank; (void)m; (void)h; return NULL; } -void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h ) +void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int m, int h ) { assert( 0 ); (void)ipiv; + (void)rank; (void)m; (void)h; return NULL; diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt index 7f2eab94cd34acadf580ed42f5c858562d927ae4..f01a36b14433bdac07cd795fe24cb865ee0cd228 100644 --- a/runtime/starpu/CMakeLists.txt +++ b/runtime/starpu/CMakeLists.txt @@ -33,6 +33,7 @@ cmake_minimum_required(VERSION 3.5) include(CheckSymbolExists) include(CheckStructHasMember) +include(CheckCSourceRuns) set(CHAMELEON_STARPU_VERSION "1.3" CACHE STRING "necessary STARPU API version") @@ -90,6 +91,25 @@ if ( STARPU_FOUND ) message("-- ${Blue}Add definition HAVE_STARPU_REUSE_DATA_ON_NODE${ColourReset}") endif() + # Check if STARPU_NONE equals 0 or not + set(C_STARPU_NONE_NONZERO " +#include <stdio.h> +#include <stdint.h> +#include <starpu.h> +int main() { + if (STARPU_NONE == 0) + return 1; + else + return 0; +} +") + + unset(HAVE_STARPU_NONE_NONZERO CACHE) + check_c_source_runs("${C_STARPU_NONE_NONZERO}" HAVE_STARPU_NONE_NONZERO) + if ( HAVE_STARPU_NONE_NONZERO ) + message("-- ${Blue}Add definition HAVE_STARPU_NONE_NONZERO${ColourReset}") + endif() + if (CHAMELEON_USE_MPI) # Add MPI in case StarPU don't have a public dependency on it check_function_exists(starpu_mpi_init_conf HAVE_STARPU_MPI_INIT_CONF) diff --git a/runtime/starpu/codelets/codelet_ipiv.c b/runtime/starpu/codelets/codelet_ipiv.c index 64e6031391793de8dc829e2ac47eddabfdba7be5..e5dba252a6312d625a825485cc84d0657973f435 100644 --- a/runtime/starpu/codelets/codelet_ipiv.c +++ b/runtime/starpu/codelets/codelet_ipiv.c @@ -12,6 +12,7 @@ * @version 1.3.0 * @author Mathieu Faverge * @author Matthieu Kuhn + * @author Alycia Lisito * @date 2024-03-16 * */ @@ -62,13 +63,13 @@ void INSERT_TASK_ipiv_init( const RUNTIME_option_t *options, } void INSERT_TASK_ipiv_reducek( const RUNTIME_option_t *options, - CHAM_ipiv_t *ipiv, int k, int h ) + CHAM_ipiv_t *ipiv, int k, int h, int rank ) { - starpu_data_handle_t prevpiv = RUNTIME_pivot_getaddr( ipiv, k, h-1 ); + starpu_data_handle_t prevpiv = RUNTIME_pivot_getaddr( ipiv, rank, k, h-1 ); #if defined(HAVE_STARPU_MPI_REDUX) && defined(CHAMELEON_USE_MPI) #if !defined(HAVE_STARPU_MPI_REDUX_WRAPUP) - starpu_data_handle_t nextpiv = RUNTIME_pivot_getaddr( ipiv, k, h ); + starpu_data_handle_t nextpiv = RUNTIME_pivot_getaddr( ipiv, rank, k, h ); if ( h < ipiv->n ) { starpu_mpi_redux_data_prio_tree( options->sequence->comm, nextpiv, options->priority, 2 /* Binary tree */ ); diff --git a/runtime/starpu/codelets/codelet_zgetrf_batched.c b/runtime/starpu/codelets/codelet_zgetrf_batched.c index 1d4cb37da9bc6099305ddcf9eb4516fb17feaf52..d9c55d76cd3fa290ab004ebc854e3d5f4638cf93 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_batched.c +++ b/runtime/starpu/codelets/codelet_zgetrf_batched.c @@ -43,15 +43,16 @@ cl_zgetrf_panel_offdiag_batched_cpu_func( void *descr[], void *cl_arg ) { struct cl_getrf_batched_args_t *clargs = (struct cl_getrf_batched_args_t *) cl_arg; - cppi_interface_t *nextpiv = (cppi_interface_t*) descr[0]; - cppi_interface_t *prevpiv = (cppi_interface_t*) descr[1]; + cppi_interface_t *nextpiv = (cppi_interface_t*) descr[ clargs->tasks_nbr ]; + cppi_interface_t *prevpiv = (cppi_interface_t*) descr[ clargs->tasks_nbr + 1 ]; int i, m, n, h, m0, lda; CHAM_tile_t *tileA; nextpiv->h = clargs->h; + nextpiv->has_diag = chameleon_max( -1, nextpiv->has_diag ); for ( i = 0; i < clargs->tasks_nbr; i++ ) { - tileA = cti_interface_get( descr[ i + 2 ] ); + tileA = cti_interface_get( descr[ i ] ); lda = tileA->ld; m = clargs->m[ i ]; n = clargs->n[ i ]; @@ -77,6 +78,15 @@ INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options, int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size; void (*callback)(void*) = NULL; struct cl_getrf_batched_args_t *clargs = *clargs_ptr; + int rankA = A->get_rankof( A, Am, An ); + if ( rankA != A->myrank ) { + return; + } +#if !defined(HAVE_STARPU_NONE_NONZERO) + /* STARPU_NONE can't be equal to 0 */ + fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" ); + assert( 0 ); +#endif /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; @@ -85,6 +95,7 @@ INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options, if ( clargs == NULL ) { clargs = malloc( sizeof( struct cl_getrf_batched_args_t ) ) ; + memset( clargs, 0, sizeof( struct cl_getrf_batched_args_t ) ); clargs->tasks_nbr = 0; clargs->h = h; clargs->cl_name = "zgetrf_panel_offdiag_batched"; @@ -104,13 +115,15 @@ INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options, A->get_blktile( A, Am, An ) ); if ( clargs->tasks_nbr == batch_size ) { + int access_npiv = ( h == ipiv->n ) ? STARPU_R : STARPU_REDUX; + int access_ppiv = ( h == 0 ) ? STARPU_NONE : STARPU_R; rt_starpu_insert_task( &cl_zgetrf_panel_offdiag_batched, /* Task codelet arguments */ STARPU_CL_ARGS, clargs, sizeof(struct cl_getrf_batched_args_t), - STARPU_REDUX, RUNTIME_pivot_getaddr( ipiv, An, h ), - STARPU_R, RUNTIME_pivot_getaddr( ipiv, An, h-1 ), STARPU_DATA_MODE_ARRAY, clargs->handle_mode, clargs->tasks_nbr, + access_npiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ), + access_ppiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, @@ -132,18 +145,26 @@ INSERT_TASK_zgetrf_panel_offdiag_batched_flush( const RUNTIME_option_t *options, { void (*callback)(void*) = NULL; struct cl_getrf_batched_args_t *clargs = *clargs_ptr; + int rankA = A->myrank; +#if !defined(HAVE_STARPU_NONE_NONZERO) + /* STARPU_NONE can't be equal to 0 */ + fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" ); + assert( 0 ); +#endif if ( clargs == NULL ) { return; } + int access_npiv = ( clargs->h == ipiv->n ) ? STARPU_R : STARPU_REDUX; + int access_ppiv = ( clargs->h == 0 ) ? STARPU_NONE : STARPU_R; rt_starpu_insert_task( &cl_zgetrf_panel_offdiag_batched, /* Task codelet arguments */ STARPU_CL_ARGS, clargs, sizeof(struct cl_getrf_batched_args_t), - STARPU_REDUX, RUNTIME_pivot_getaddr( ipiv, An, clargs->h ), - STARPU_R, RUNTIME_pivot_getaddr( ipiv, An, clargs->h-1 ), STARPU_DATA_MODE_ARRAY, clargs->handle_mode, clargs->tasks_nbr, + access_npiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h ), + access_ppiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h-1 ), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, @@ -162,20 +183,27 @@ cl_zgetrf_panel_blocked_batched_cpu_func( void *descr[], void *cl_arg ) { struct cl_getrf_batched_args_t *clargs = ( struct cl_getrf_batched_args_t * ) cl_arg; - int *ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[clargs->tasks_nbr]); - cppi_interface_t *nextpiv = (cppi_interface_t*) descr[clargs->tasks_nbr + 1]; - cppi_interface_t *prevpiv = (cppi_interface_t*) descr[clargs->tasks_nbr + 2]; + int *ipiv; + cppi_interface_t *nextpiv = (cppi_interface_t*) descr[clargs->tasks_nbr ]; + cppi_interface_t *prevpiv = (cppi_interface_t*) descr[clargs->tasks_nbr + 1]; int i, h, ib; CHAM_tile_t *tileA, *tileU; CHAMELEON_Complex64_t *U = NULL; int ldu = -1; nextpiv->h = clargs->h; + nextpiv->has_diag = chameleon_max( -1, nextpiv->has_diag); h = clargs->h; ib = clargs->ib; i = 0; if ( clargs->diag ) { + if ( h == 0 ) { + ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[clargs->tasks_nbr + 1]); + } + else { + ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[clargs->tasks_nbr + 2]); + } if ( h != 0 ) { tileU = cti_interface_get( descr[ clargs->tasks_nbr + 3 ] ); U = CHAM_tile_get_ptr( tileU ); @@ -190,7 +218,7 @@ cl_zgetrf_panel_blocked_batched_cpu_func( void *descr[], i++; } if ( ( h%ib == 0 ) && ( h > 0 ) ) { - tileU = cti_interface_get( descr[ clargs->tasks_nbr + 3 ] ); + tileU = cti_interface_get( descr[ clargs->tasks_nbr + 2 + clargs->diag ] ); U = CHAM_tile_get_ptr( tileU ); ldu = tileU->ld; } @@ -225,6 +253,28 @@ INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options, void (*callback)(void*) = NULL; int accessU, access_npiv, access_ipiv, access_ppiv; struct cl_getrf_batched_args_t *clargs = *clargs_ptr; + int rankA = A->get_rankof(A, Am, An); +#if !defined(HAVE_STARPU_NONE_NONZERO) + /* STARPU_NONE can't be equal to 0 */ + fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" ); + assert( 0 ); +#endif + +#if defined ( CHAMELEON_USE_MPI ) + if ( ( Am == An ) && ( h % ib == 0 ) && ( h > 0 ) ) { + starpu_mpi_cache_flush( options->sequence->comm, + RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un) ); + } + + if ( rankA != A->myrank ) { + if ( ( h % ib == 0 ) && ( h > 0 ) && ( A->myrank == A->get_rankof( A, An, An ) ) ) { + starpu_mpi_get_data_on_node_detached( options->sequence->comm, + RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un), + rankA, NULL, NULL ); + } + return; + } +#endif /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; @@ -232,7 +282,8 @@ INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options, CHAMELEON_END_ACCESS_DECLARATION; if ( clargs == NULL ) { - clargs = malloc( sizeof( struct cl_getrf_batched_args_t ) ) ; + clargs = malloc( sizeof( struct cl_getrf_batched_args_t ) ); + memset( clargs, 0, sizeof( struct cl_getrf_batched_args_t ) ); clargs->tasks_nbr = 0; clargs->diag = ( Am == An ); clargs->ib = ib; @@ -271,24 +322,25 @@ INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options, } /* If there isn't a diag task then use offdiag access */ if ( clargs->diag == 0 ) { - accessU = ((h%ib == 0) && (h > 0)) ? STARPU_R : STARPU_NONE; + accessU = ((h%ib == 0) && (h > 0)) ? STARPU_R : STARPU_NONE; + access_ipiv = STARPU_NONE; } rt_starpu_insert_task( &cl_zgetrf_panel_blocked_batched, /* Task codelet arguments */ STARPU_CL_ARGS, clargs, sizeof(struct cl_getrf_batched_args_t), + STARPU_DATA_MODE_ARRAY, clargs->handle_mode, clargs->tasks_nbr, + access_npiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ), + access_ppiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), + access_ipiv, RUNTIME_ipiv_getaddr( ipiv, An ), + accessU, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un ), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) STARPU_NAME, clargs->cl_name, #endif - STARPU_DATA_MODE_ARRAY, clargs->handle_mode, clargs->tasks_nbr, - access_ipiv, RUNTIME_ipiv_getaddr( ipiv, An ), - access_npiv, RUNTIME_pivot_getaddr( ipiv, An, h ), - access_ppiv, RUNTIME_pivot_getaddr( ipiv, An, h-1 ), - accessU, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un ), 0); /* clargs is freed by starpu. */ @@ -306,6 +358,12 @@ INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options, int accessU, access_npiv, access_ipiv, access_ppiv; void (*callback)(void*) = NULL; struct cl_getrf_batched_args_t *clargs = *clargs_ptr; + int rankA = A->myrank; +#if !defined(HAVE_STARPU_NONE_NONZERO) + /* STARPU_NONE can't be equal to 0 */ + fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" ); + assert( 0 ); +#endif if ( clargs == NULL ) { return; @@ -328,24 +386,25 @@ INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options, } /* If there isn't a diag task then use offdiag access */ if ( clargs->diag == 0 ) { - accessU = ((clargs->h%clargs->ib == 0) && (clargs->h > 0)) ? STARPU_R : STARPU_NONE; + accessU = ((clargs->h%clargs->ib == 0) && (clargs->h > 0)) ? STARPU_R : STARPU_NONE; + access_ipiv = STARPU_NONE; } rt_starpu_insert_task( &cl_zgetrf_panel_blocked_batched, /* Task codelet arguments */ STARPU_CL_ARGS, clargs, sizeof(struct cl_getrf_batched_args_t), + STARPU_DATA_MODE_ARRAY, clargs->handle_mode, clargs->tasks_nbr, + access_npiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h ), + access_ppiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h - 1 ), + access_ipiv, RUNTIME_ipiv_getaddr( ipiv, An ), + accessU, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un ), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) STARPU_NAME, clargs->cl_name, #endif - STARPU_DATA_MODE_ARRAY, clargs->handle_mode, clargs->tasks_nbr, - access_ipiv, RUNTIME_ipiv_getaddr( ipiv, An ), - access_npiv, RUNTIME_pivot_getaddr( ipiv, An, clargs->h ), - access_ppiv, RUNTIME_pivot_getaddr( ipiv, An, clargs->h - 1 ), - accessU, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un ), 0); /* clargs is freed by starpu. */ diff --git a/runtime/starpu/codelets/codelet_zgetrf_blocked.c b/runtime/starpu/codelets/codelet_zgetrf_blocked.c index 2c6daa18d9bda1f7ff433305aa98ad77f648b4b5..8739f27deb22f8ba019fa85338c4fdcbc0a0d789 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_blocked.c +++ b/runtime/starpu/codelets/codelet_zgetrf_blocked.c @@ -14,6 +14,7 @@ * * @author Mathieu Faverge * @author Matthieu Kuhn + * @author Alycia Lisito * @date 2024-03-11 * @precisions normal z -> c d s * @@ -67,6 +68,7 @@ static void cl_zgetrf_blocked_diag_cpu_func(void *descr[], void *cl_arg) nextpiv->h = h; nextpiv->has_diag = 1; + coreblas_kernel_trace( tileA ); CORE_zgetrf_panel_diag( m, n, h, m0, ib, CHAM_tile_get_ptr( tileA ), tileA->ld, U, ldu, @@ -95,6 +97,22 @@ void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options, struct starpu_codelet *codelet = &cl_zgetrf_blocked_diag; void (*callback)(void*) = options->profiling ? cl_zgetrf_blocked_diag_callback : NULL; const char *cl_name = "zgetrf_blocked_diag"; + int rankA = A->get_rankof(A, Am, An); +#if !defined(HAVE_STARPU_NONE_NONZERO) + /* STARPU_NONE can't be equal to 0 */ + fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" ); + assert( 0 ); +#endif + +#if defined ( CHAMELEON_USE_MPI ) + if ( ( h % ib == 0 ) && ( h > 0 ) ) { + starpu_mpi_cache_flush( options->sequence->comm, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un) ); + } + + if ( rankA != A->myrank ) { + return; + } +#endif int access_ipiv = ( h == 0 ) ? STARPU_W : STARPU_RW; int access_npiv = ( h == ipiv->n ) ? STARPU_R : STARPU_REDUX; @@ -108,7 +126,7 @@ void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options, else if ( h%ib == 0 ) { accessU = STARPU_R; } - else if ( h%ib == 1 ) { + else if ( ( h%ib == 1 ) || ( ib == 1 ) ) { accessU = STARPU_W; } @@ -130,25 +148,24 @@ void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options, STARPU_VALUE, &ib, sizeof(int), STARPU_VALUE, &(options->sequence), sizeof(RUNTIME_sequence_t*), STARPU_VALUE, &(options->request), sizeof(RUNTIME_request_t*), + STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + access_ipiv, RUNTIME_ipiv_getaddr( ipiv, An ), + access_npiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ), + access_ppiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), + accessU, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) STARPU_NAME, cl_name, #endif - /* STARPU_NONE must be the last argument for older version of StarPU where STARPU_NONE = 0 */ - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - access_ipiv, RUNTIME_ipiv_getaddr( ipiv, An ), - access_npiv, RUNTIME_pivot_getaddr( ipiv, An, h ), - access_ppiv, RUNTIME_pivot_getaddr( ipiv, An, h-1 ), - accessU, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un), 0); } #if !defined(CHAMELEON_SIMULATION) static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg) { - int m, n, h, m0, ib; + int m, n, h, k, m0, ib; RUNTIME_sequence_t *sequence; RUNTIME_request_t *request; CHAM_tile_t *tileA; @@ -156,9 +173,9 @@ static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg) cppi_interface_t *nextpiv; cppi_interface_t *prevpiv; CHAMELEON_Complex64_t *U = NULL; - int ldu = -1;; + int ldu = -1; - starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &m0, &ib, &sequence, &request ); + starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &k, &m0, &ib, &sequence, &request ); tileA = cti_interface_get(descr[0]); nextpiv = (cppi_interface_t*) descr[1]; @@ -169,12 +186,28 @@ static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg) ldu = tileU->ld; } + if ( h > 0 ) { + cppi_display_dbg( prevpiv, stderr, "Prevpiv offdiag before call: " ); + } + if ( h < tileA->n ) { + cppi_display_dbg( nextpiv, stderr, "Nextpiv offdiag before call: " ); + } + nextpiv->h = h; /* Initialize in case it uses a copy */ + nextpiv->has_diag = chameleon_max( -1, nextpiv->has_diag); + coreblas_kernel_trace( tileA ); CORE_zgetrf_panel_offdiag( m, n, h, m0, ib, CHAM_tile_get_ptr(tileA), tileA->ld, U, ldu, &(nextpiv->pivot), &(prevpiv->pivot) ); + + if ( h > 0 ) { + cppi_display_dbg( prevpiv, stderr, "Prevpiv offdiag after call: " ); + } + if ( h < tileA->n ) { + cppi_display_dbg( nextpiv, stderr, "Nextpiv offdiag after call: " ); + } } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -190,9 +223,29 @@ void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options, CHAM_ipiv_t *ipiv ) { struct starpu_codelet *codelet = &cl_zgetrf_blocked_offdiag; + int access_npiv = ( h == ipiv->n ) ? STARPU_R : STARPU_REDUX; int access_ppiv = ( h == 0 ) ? STARPU_NONE : STARPU_R; int accessU = ((h%ib == 0) && (h > 0)) ? STARPU_R : STARPU_NONE; + int rankA = A->get_rankof(A, Am, An); +#if !defined(HAVE_STARPU_NONE_NONZERO) + /* STARPU_NONE can't be equal to 0 */ + fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" ); + assert( 0 ); +#endif + +#if defined ( CHAMELEON_USE_MPI ) + if ( rankA != A->myrank ) { + if ( ( accessU != STARPU_NONE ) && + ( A->myrank == A->get_rankof( A, An, An ) ) ) + { + starpu_mpi_get_data_on_node_detached( options->sequence->comm, + RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un), + rankA, NULL, NULL ); + } + return; + } +#endif void (*callback)(void*) = options->profiling ? cl_zgetrf_blocked_offdiag_callback : NULL; const char *cl_name = "zgetrf_blocked_offdiag"; @@ -200,6 +253,9 @@ void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options, /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_RW( A, Am, An ); + if ((h%ib == 0) && (h > 0)) { + CHAMELEON_ACCESS_R( U, Um, Un ); + } CHAMELEON_END_ACCESS_DECLARATION; /* Refine name */ @@ -211,21 +267,21 @@ void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options, STARPU_VALUE, &m, sizeof(int), STARPU_VALUE, &n, sizeof(int), STARPU_VALUE, &h, sizeof(int), + STARPU_VALUE, &An, sizeof(int), STARPU_VALUE, &m0, sizeof(int), STARPU_VALUE, &ib, sizeof(int), STARPU_VALUE, &(options->sequence), sizeof(RUNTIME_sequence_t *), STARPU_VALUE, &(options->request), sizeof(RUNTIME_request_t *), + STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + access_npiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ), + access_ppiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), + accessU, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) STARPU_NAME, cl_name, #endif - /* STARPU_NONE must be the last argument for older version of StarPU where STARPU_NONE = 0 */ - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - access_npiv, RUNTIME_pivot_getaddr( ipiv, An, h ), - access_ppiv, RUNTIME_pivot_getaddr( ipiv, An, h-1 ), - accessU, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un), 0); } @@ -247,6 +303,8 @@ static void cl_zgetrf_blocked_trsm_cpu_func(void *descr[], void *cl_arg) U = CHAM_tile_get_ptr( tileU ); ldu = tileU->ld; + coreblas_kernel_trace( tileU ); + /* Copy the final max line of the block and solve */ cblas_zcopy( n, prevpiv->pivot.pivrow, 1, U + m - 1, ldu ); @@ -276,6 +334,7 @@ void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options, void (*callback)(void*) = options->profiling ? cl_zgetrf_blocked_trsm_callback : NULL; const char *cl_name = "zgetrf_blocked_trsm"; + int rankU = U->get_rankof(U, Um, Un); /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; @@ -286,6 +345,10 @@ void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options, cl_name = chameleon_codelet_name( cl_name, 1, U->get_blktile( U, Um, Un ) ); + if ( U->myrank != U->get_rankof(U, Um, Un) ) { + return; + } + rt_starpu_insert_task( codelet, STARPU_VALUE, &m, sizeof(int), @@ -293,7 +356,7 @@ void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options, STARPU_VALUE, &h, sizeof(int), STARPU_VALUE, &ib, sizeof(int), STARPU_RW, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un), - STARPU_R, RUNTIME_pivot_getaddr( ipiv, Un, h-1 ), + STARPU_R, RUNTIME_pivot_getaddr( ipiv, rankU, Un, h-1 ), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_zgetrf_percol.c b/runtime/starpu/codelets/codelet_zgetrf_percol.c index 5d3f83b6ce046a72135c8f513c8cc23822159595..0b556f81605a9cc78faea6fa6e312ffc0e643631 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_percol.c +++ b/runtime/starpu/codelets/codelet_zgetrf_percol.c @@ -14,6 +14,7 @@ * * @author Mathieu Faverge * @author Matthieu Kuhn + * @author Alycia Lisito * @date 2024-03-11 * @precisions normal z -> c d s * @@ -84,6 +85,17 @@ void INSERT_TASK_zgetrf_percol_diag( const RUNTIME_option_t *options, struct starpu_codelet *codelet = &cl_zgetrf_percol_diag; void (*callback)(void*) = options->profiling ? cl_zgetrf_percol_diag_callback : NULL; const char *cl_name = "zgetrf_percol_diag"; + int rankA = A->get_rankof(A, Am, An); + +#if !defined(HAVE_STARPU_NONE_NONZERO) + /* STARPU_NONE can't be equal to 0 */ + fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" ); + assert( 0 ); +#endif + + if ( rankA != A->myrank ) { + return; + } int access_ipiv = ( h == 0 ) ? STARPU_W : STARPU_RW; int access_npiv = ( h == ipiv->n ) ? STARPU_R : STARPU_REDUX; @@ -95,8 +107,7 @@ void INSERT_TASK_zgetrf_percol_diag( const RUNTIME_option_t *options, CHAMELEON_END_ACCESS_DECLARATION; /* Refine name */ - cl_name = chameleon_codelet_name( cl_name, 1, - A->get_blktile( A, Am, An ) ); + cl_name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) ); rt_starpu_insert_task( codelet, @@ -106,17 +117,16 @@ void INSERT_TASK_zgetrf_percol_diag( const RUNTIME_option_t *options, STARPU_VALUE, &m0, sizeof(int), STARPU_VALUE, &(options->sequence), sizeof(RUNTIME_sequence_t*), STARPU_VALUE, &(options->request), sizeof(RUNTIME_request_t*), + STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + access_ipiv, RUNTIME_ipiv_getaddr( ipiv, An ), + access_npiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ), + access_ppiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) STARPU_NAME, cl_name, #endif - /* STARPU_NONE must be the last argument for older version of StarPU where STARPU_NONE = 0 */ - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - access_ipiv, RUNTIME_ipiv_getaddr( ipiv, An ), - access_npiv, RUNTIME_pivot_getaddr( ipiv, An, h ), - access_ppiv, RUNTIME_pivot_getaddr( ipiv, An, h-1 ), 0); } @@ -137,6 +147,7 @@ static void cl_zgetrf_percol_offdiag_cpu_func(void *descr[], void *cl_arg) prevpiv = (cppi_interface_t*) descr[2]; nextpiv->h = h; /* Initialize in case it uses a copy */ + nextpiv->has_diag = chameleon_max( -1, nextpiv->has_diag); CORE_zgetrf_panel_offdiag( m, n, h, m0, tileA->n, CHAM_tile_get_ptr(tileA), tileA->ld, @@ -159,6 +170,18 @@ void INSERT_TASK_zgetrf_percol_offdiag( const RUNTIME_option_t *options, void (*callback)(void*) = options->profiling ? cl_zgetrf_percol_offdiag_callback : NULL; const char *cl_name = "zgetrf_percol_offdiag"; + int access_npiv = ( h == ipiv->n ) ? STARPU_R : STARPU_REDUX; + int access_ppiv = ( h == 0 ) ? STARPU_NONE : STARPU_R; + int rankA = A->get_rankof(A, Am, An); +#if !defined(HAVE_STARPU_NONE_NONZERO) + /* STARPU_NONE can't be equal to 0 */ + fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" ); + assert( 0 ); +#endif + + if ( rankA != A->myrank ) { + return; + } /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; @@ -166,8 +189,7 @@ void INSERT_TASK_zgetrf_percol_offdiag( const RUNTIME_option_t *options, CHAMELEON_END_ACCESS_DECLARATION; /* Refine name */ - cl_name = chameleon_codelet_name( cl_name, 1, - A->get_blktile( A, Am, An ) ); + cl_name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) ); rt_starpu_insert_task( codelet, @@ -178,8 +200,8 @@ void INSERT_TASK_zgetrf_percol_offdiag( const RUNTIME_option_t *options, STARPU_VALUE, &(options->sequence), sizeof(RUNTIME_sequence_t *), STARPU_VALUE, &(options->request), sizeof(RUNTIME_request_t *), STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_REDUX, RUNTIME_pivot_getaddr( ipiv, An, h ), - STARPU_R, RUNTIME_pivot_getaddr( ipiv, An, h-1 ), + access_npiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ), + access_ppiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_zipiv_allreduce.c b/runtime/starpu/codelets/codelet_zipiv_allreduce.c new file mode 100644 index 0000000000000000000000000000000000000000..13a41ceb04be76b2f89419a20bd6209d3aebd6e3 --- /dev/null +++ b/runtime/starpu/codelets/codelet_zipiv_allreduce.c @@ -0,0 +1,169 @@ +/** + * + * @file starpu/codelet_zipiv_allreduce.c + * + * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon StarPU codelets to do the reduction + * + * @version 1.3.0 + * @author Alycia Lisito + * @date 2024-06-11 + * @precisions normal z -> c d s + * + */ +#include "chameleon_starpu.h" +#include "runtime_codelet_z.h" +#include <coreblas/cblas_wrapper.h> + +#if defined(CHAMELEON_USE_MPI) +struct cl_redux_args_t { + int h; + int n; + int k; +}; + +static void cl_zipiv_allreduce_cpu_func( void *descr[], void *cl_arg ) +{ + struct cl_redux_args_t *clargs = (struct cl_redux_args_t *) cl_arg; + cppi_interface_t *cppi_me = ((cppi_interface_t *) descr[0]); + cppi_interface_t *cppi_src = ((cppi_interface_t *) descr[1]); + CHAM_pivot_t *nextpiv_me = &(cppi_me->pivot); + CHAM_pivot_t *nextpiv_src = &(cppi_src->pivot); + CHAMELEON_Complex64_t *pivrow_me = (CHAMELEON_Complex64_t *)(nextpiv_me->pivrow); + CHAMELEON_Complex64_t *pivrow_src = (CHAMELEON_Complex64_t *)(nextpiv_src->pivrow); + + cppi_display_dbg( cppi_me, stderr, "Global redux Inout: "); + cppi_display_dbg( cppi_src, stderr, "Global redux Input: "); + + assert( cppi_me->n == cppi_src->n ); + assert( cppi_me->h == cppi_src->h ); + assert( cppi_me->flttype == cppi_src->flttype ); + assert( cppi_me->arraysize == cppi_src->arraysize ); + + if ( cabs( pivrow_src[ clargs->h ] ) > cabs( pivrow_me[ clargs->h ] ) ) { + nextpiv_me->blkm0 = nextpiv_src->blkm0; + nextpiv_me->blkidx = nextpiv_src->blkidx; + cblas_zcopy( clargs->n, pivrow_src, 1, pivrow_me, 1 ); + } + + /* Let's copy the diagonal row if needed */ + if ( ( cppi_src->has_diag == 1 ) && + ( cppi_me->has_diag == -1 ) ) + { + cblas_zcopy( clargs->n, nextpiv_src->diagrow, 1, nextpiv_me->diagrow, 1 ); + assert( cppi_src->arraysize == clargs->n * sizeof(CHAMELEON_Complex64_t) ); + cppi_me->has_diag = 1; + } + + cppi_display_dbg( cppi_me, stderr, "Global redux Inout(After): "); +} + +CODELETS_CPU( zipiv_allreduce, cl_zipiv_allreduce_cpu_func ) + +void +INSERT_TASK_zipiv_allreduce_send( CHAM_ipiv_t *ipiv, + int me, + int dst, + int k, + int h, + const RUNTIME_option_t *options ) +{ + rt_starpu_insert_task( + NULL, + STARPU_EXECUTE_ON_NODE, dst, + STARPU_R, RUNTIME_pivot_getaddr( ipiv, me, k, h ), + STARPU_PRIORITY, options->priority, + 0 ); +} + +void +INSERT_TASK_zipiv_allreduce_recv( CHAM_ipiv_t *ipiv, + int me, + int src, + int k, + int h, + int n, + const RUNTIME_option_t *options ) +{ + struct cl_redux_args_t *clargs; + clargs = malloc( sizeof( struct cl_redux_args_t ) ); + clargs->h = h; + clargs->n = n; + clargs->k = k; + + rt_starpu_insert_task( + &cl_zipiv_allreduce, + STARPU_CL_ARGS, clargs, sizeof(struct cl_redux_args_t), + STARPU_RW, RUNTIME_pivot_getaddr( ipiv, me, k, h ), + STARPU_R, RUNTIME_pivot_getaddr( ipiv, src, k, h ), + STARPU_EXECUTE_ON_NODE, me, + STARPU_EXECUTE_ON_WORKER, options->workerid, + STARPU_PRIORITY, options->priority, + 0 ); + starpu_mpi_cache_flush( options->sequence->comm, RUNTIME_pivot_getaddr( ipiv, src, k, h ) ); +} + +void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, + const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int *proc_involved, + int k, + int h, + int n ) +{ + int np_involved = chameleon_min( A->p, A->mt - k); + int np_iter = np_involved; + int p_recv, p_send, me; + int shift = 1; + + if ( h > 0 ) { + starpu_data_invalidate_submit( RUNTIME_pivot_getaddr( ipiv, A->myrank, k, h-1 ) ); + } + if ( h >= ipiv->n ) { + return; + } + + if ( np_involved == 1 ) { + assert( proc_involved[0] == A->myrank ); + } + else { + for( me = 0; me < np_involved; me++ ) { + if ( proc_involved[me] == A->myrank ) { + break; + } + } + assert( me < np_involved ); + while ( np_iter > 1 ) { + p_send = proc_involved[ ( me + shift ) % np_involved ]; + p_recv = proc_involved[ ( me - shift + np_involved ) % np_involved ]; + + INSERT_TASK_zipiv_allreduce_send( ipiv, A->myrank, p_send, k, h, options ); + INSERT_TASK_zipiv_allreduce_recv( ipiv, A->myrank, p_recv, k, h, n, options ); + + shift = shift << 1; + np_iter = chameleon_ceil( np_iter, 2 ); + } + } +} +#else +void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, + const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int *proc_involved, + int k, + int h, + int n ) +{ + if ( h > 0 ) { + starpu_data_invalidate_submit( RUNTIME_pivot_getaddr( ipiv, A->myrank, k, h-1 ) ); + } + + (void)options; + (void)proc_involved; + (void)n; +} +#endif diff --git a/runtime/starpu/control/runtime_descriptor_ipiv.c b/runtime/starpu/control/runtime_descriptor_ipiv.c index 48be66e17652b487c246c2eec0dd8211d7890b36..1ad0f7a142fd9272a3ffb445dd797db774959d60 100644 --- a/runtime/starpu/control/runtime_descriptor_ipiv.c +++ b/runtime/starpu/control/runtime_descriptor_ipiv.c @@ -12,6 +12,8 @@ * @version 1.3.0 * @author Mathieu Faverge * @author Matthieu Kuhn + * @author Alycia Lisito + * @author Florent Pruvost * @date 2024-03-16 * */ @@ -20,16 +22,18 @@ /** * Create ws_pivot runtime structures */ -void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv ) +void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv, + const CHAM_desc_t *desc ) { assert( ipiv ); - starpu_data_handle_t *handles = calloc( 5 * ipiv->mt, sizeof(starpu_data_handle_t) ); + size_t nbhandles = 3 * ipiv->mt + 2 * desc->p; + starpu_data_handle_t *handles = calloc( nbhandles, sizeof(starpu_data_handle_t) ); ipiv->ipiv = handles; handles += ipiv->mt; ipiv->nextpiv = handles; - handles += ipiv->mt; + handles += desc->p; ipiv->prevpiv = handles; - handles += ipiv->mt; + handles += desc->p; ipiv->perm = handles; handles += ipiv->mt; ipiv->invp = handles; @@ -40,14 +44,14 @@ void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv ) */ { chameleon_starpu_tag_init(); - ipiv->mpitag_ipiv = chameleon_starpu_tag_book( (int64_t)(ipiv->mt) * 5 ); + ipiv->mpitag_ipiv = chameleon_starpu_tag_book( nbhandles ); if ( ipiv->mpitag_ipiv == -1 ) { chameleon_fatal_error("RUNTIME_ipiv_create", "Can't pursue computation since no more tags are available for ipiv structure"); return; } ipiv->mpitag_nextpiv = ipiv->mpitag_ipiv + ipiv->mt; - ipiv->mpitag_prevpiv = ipiv->mpitag_nextpiv + ipiv->mt; - ipiv->mpitag_perm = ipiv->mpitag_prevpiv + ipiv->mt; + ipiv->mpitag_prevpiv = ipiv->mpitag_nextpiv + desc->p; + ipiv->mpitag_perm = ipiv->mpitag_prevpiv + desc->p; ipiv->mpitag_invp = ipiv->mpitag_perm + ipiv->mt; } #endif @@ -56,12 +60,14 @@ void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv ) /** * Destroy ws_pivot runtime structures */ -void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv ) +void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv, + const CHAM_desc_t *desc ) { int i; starpu_data_handle_t *handle = (starpu_data_handle_t*)(ipiv->ipiv); + size_t nbhandles = 3 * ipiv->mt + 2 * desc->p; - for(i=0; i<(5 * ipiv->mt); i++) { + for(i=0; i<nbhandles; i++) { if ( *handle != NULL ) { starpu_data_unregister( *handle ); *handle = NULL; @@ -107,49 +113,51 @@ void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m ) return (void*)(*handle); } -void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h ) +void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int k, int h ) { starpu_data_handle_t *nextpiv = (starpu_data_handle_t*)(ipiv->nextpiv); - int64_t mm = m + (ipiv->i / ipiv->mb); + const CHAM_desc_t *A = ipiv->desc; - nextpiv += mm; + nextpiv += rank/A->q; assert( nextpiv ); if ( *nextpiv != NULL ) { return (void*)(*nextpiv); } - const CHAM_desc_t *A = ipiv->desc; - int owner = A->get_rankof( A, m, m ); - int ncols = (mm == (A->nt-1)) ? A->n - mm * A->nb : A->nb; - int64_t tag = ipiv->mpitag_nextpiv + mm; + int64_t kk = k + (ipiv->i / ipiv->mb); + int owner = rank; + int ncols = (kk == (A->nt-1)) ? A->n - kk * A->nb : A->nb; + int64_t tag = ipiv->mpitag_nextpiv + owner/A->q; cppi_register( nextpiv, A->dtyp, ncols, tag, owner ); assert( *nextpiv ); + (void)h; return (void*)(*nextpiv); } -void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h ) +void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int k, int h ) { starpu_data_handle_t *prevpiv = (starpu_data_handle_t*)(ipiv->prevpiv); - int64_t mm = m + (ipiv->i / ipiv->mb); + const CHAM_desc_t *A = ipiv->desc; - prevpiv += mm; + prevpiv += rank/A->q; assert( prevpiv ); if ( *prevpiv != NULL ) { return (void*)(*prevpiv); } - const CHAM_desc_t *A = ipiv->desc; - int owner = A->get_rankof( A, m, m ); - int ncols = (mm == (A->nt-1)) ? A->n - mm * A->nb : A->nb; - int64_t tag = ipiv->mpitag_prevpiv + mm; + int64_t kk = k + (ipiv->i / ipiv->mb); + int owner = rank; + int ncols = (kk == (A->nt-1)) ? A->n - kk * A->nb : A->nb; + int64_t tag = ipiv->mpitag_prevpiv + owner/A->q; cppi_register( prevpiv, A->dtyp, ncols, tag, owner ); assert( *prevpiv ); + (void)h; return (void*)(*prevpiv); } @@ -212,19 +220,18 @@ void *RUNTIME_invp_getaddr( const CHAM_ipiv_t *ipiv, int m ) } void RUNTIME_ipiv_flushk( const RUNTIME_sequence_t *sequence, - const CHAM_ipiv_t *ipiv, int m ) + const CHAM_ipiv_t *ipiv, int rank ) { starpu_data_handle_t *handle; const CHAM_desc_t *A = ipiv->desc; - int64_t mm = m + ( ipiv->i / ipiv->mb ); handle = (starpu_data_handle_t*)(ipiv->nextpiv); - handle += mm; + handle += rank/A->q; if ( *handle != NULL ) { #if defined(CHAMELEON_USE_MPI) starpu_mpi_cache_flush( sequence->comm, *handle ); - if ( starpu_mpi_data_get_rank( *handle ) == A->myrank ) + if ( starpu_mpi_data_get_rank( *handle ) == rank ) #endif { chameleon_starpu_data_wont_use( *handle ); @@ -232,12 +239,12 @@ void RUNTIME_ipiv_flushk( const RUNTIME_sequence_t *sequence, } handle = (starpu_data_handle_t*)(ipiv->prevpiv); - handle += mm; + handle += rank/A->q; if ( *handle != NULL ) { #if defined(CHAMELEON_USE_MPI) starpu_mpi_cache_flush( sequence->comm, *handle ); - if ( starpu_mpi_data_get_rank( *handle ) == A->myrank ) + if ( starpu_mpi_data_get_rank( *handle ) == rank ) #endif { chameleon_starpu_data_wont_use( *handle ); @@ -246,7 +253,7 @@ void RUNTIME_ipiv_flushk( const RUNTIME_sequence_t *sequence, (void)sequence; (void)ipiv; - (void)m; + (void)rank; } void RUNTIME_ipiv_flush( const RUNTIME_sequence_t *sequence, diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in index 41949dfbb7c345050a5260b47646276c7af57002..6c4632da84520449a2e2c9f96fedef2209d196e9 100644 --- a/runtime/starpu/include/chameleon_starpu.h.in +++ b/runtime/starpu/include/chameleon_starpu.h.in @@ -40,6 +40,7 @@ #cmakedefine HAVE_STARPU_SET_LIMIT_SUBMITTED_TASKS #cmakedefine HAVE_STARPU_REUSE_DATA_ON_NODE #cmakedefine HAVE_STARPU_PARALLEL_WORKER +#cmakedefine HAVE_STARPU_NONE_NONZERO #cmakedefine HAVE_STARPU_MPI_DATA_MIGRATE #cmakedefine HAVE_STARPU_MPI_DATA_REGISTER diff --git a/runtime/starpu/include/cppi_interface.h b/runtime/starpu/include/cppi_interface.h index 7dbd10118c6bee637c9c49b6bc5bdf9d3fc008e2..8113c453fb2d344bda5972430e9e8337a3b0f880 100644 --- a/runtime/starpu/include/cppi_interface.h +++ b/runtime/starpu/include/cppi_interface.h @@ -12,6 +12,7 @@ * @version 1.3.0 * @author Mathieu Faverge * @author Matthieu Kuhn + * @author Alycia Lisito * @date 2023-08-22 * */ @@ -44,6 +45,7 @@ struct cppi_interface_s void cppi_interface_init(); void cppi_interface_fini(); +CHAM_pivot_t *cppi_handle_get( starpu_data_handle_t handle ); void cppi_register( starpu_data_handle_t *handleptr, cham_flttype_t flttype, int n, @@ -61,13 +63,14 @@ cppi_display_dbg( cppi_interface_t *cppi_interface, FILE *f, const char *title ) diagrow = cppi_interface->pivot.diagrow; pivrow = cppi_interface->pivot.pivrow; - fprintf( f, "%sn=%2d, h=%2d, has_diag=%2d, m0=%2d, idx=%2d\n", + fprintf( f, "%sn=%2d, h=%2d, has_diag=%2d, m0=%2d, idx=%2d, interf = %p\n", title, cppi_interface->n, cppi_interface->h, cppi_interface->has_diag, cppi_interface->pivot.blkm0, - cppi_interface->pivot.blkidx ); + cppi_interface->pivot.blkidx, + cppi_interface ); fprintf(stderr, "Diagonal row: " ); for( i=0; i<cppi_interface->n; i++) { diff --git a/runtime/starpu/interface/cppi_interface.c b/runtime/starpu/interface/cppi_interface.c index 2d1754ec1cee030c040ee1c941cf283a6f58b284..6b1f8063180e78dbebf1ea443ee28f98920a7723 100644 --- a/runtime/starpu/interface/cppi_interface.c +++ b/runtime/starpu/interface/cppi_interface.c @@ -12,13 +12,14 @@ * @version 1.3.0 * @author Mathieu Faverge * @author Matthieu Kuhn + * @author Alycia Lisito * @date 2023-08-22 * */ #include "chameleon_starpu.h" #undef HAVE_STARPU_REUSE_DATA_ON_NODE -static inline CHAM_pivot_t * +CHAM_pivot_t * cppi_handle_get( starpu_data_handle_t handle ) { cppi_interface_t *cppi_interface = (cppi_interface_t *) @@ -38,7 +39,7 @@ cppi_init( void *data_interface ) cppi_interface_t *cppi_interface = (cppi_interface_t *)data_interface; cppi_interface->id = CPPI_INTERFACE_ID; cppi_interface->h = -1; - cppi_interface->has_diag = 0; + cppi_interface->has_diag = -1; } static void @@ -83,7 +84,7 @@ cppi_allocate_data_on_node( void *data_interface, unsigned node ) /* update the data properly in consequence */ cppi_interface->h = -1; - cppi_interface->has_diag = 0; + cppi_interface->has_diag = -1; cppi_interface->pivot.pivrow = dataptr; cppi_interface->pivot.diagrow = ((char*)dataptr) + cppi_interface->arraysize; @@ -279,8 +280,10 @@ cppi_describe( void *data_interface, char *buf, size_t size ) { cppi_interface_t *cppi_interface = (cppi_interface_t *) data_interface; - return snprintf( buf, size, "Pivot structure, n %d, blkm0 %d, blkidx %d", + return snprintf( buf, size, "Pivot structure, n %d, h %d, has_diag = %d, blkm0 %d, blkidx %d", cppi_interface->n, + cppi_interface->h, + cppi_interface->has_diag, cppi_interface->pivot.blkm0, cppi_interface->pivot.blkidx ); } @@ -298,6 +301,7 @@ cppi_copy_any_to_any( void *src_interface, unsigned src_node, STARPU_ASSERT( cppi_interface_src->flttype == cppi_interface_dst->flttype ); cppi_interface_dst->h = cppi_interface_src->h; + cppi_interface_dst->has_diag = cppi_interface_src->has_diag; cppi_interface_dst->pivot.blkm0 = cppi_interface_src->pivot.blkm0; cppi_interface_dst->pivot.blkidx = cppi_interface_src->pivot.blkidx; @@ -402,8 +406,8 @@ cl_cppi_redux_cpu_func(void *descr[], void *cl_arg) assert( cppi_redux->h == cppi_input->h ); /* Let's copy the diagonal row if needed */ - if ( cppi_input->has_diag ) { - assert( cppi_redux->has_diag == 0 ); + if ( cppi_input->has_diag == 1 ) { + assert( cppi_redux->has_diag == -1 ); memcpy( cppi_redux->pivot.diagrow, cppi_input->pivot.diagrow, @@ -449,7 +453,7 @@ cl_cppi_init_redux_cpu_func( void *descr[], void *cl_arg ) cppi_interface_t *cppi_redux = ((cppi_interface_t *) descr[0]); /* Redux pivot never has diagonal at initialization */ - cppi_redux->has_diag = 0; + cppi_redux->has_diag = -1; cppi_redux->h = -1; size_t size = cppi_redux->arraysize; @@ -497,7 +501,7 @@ cppi_register( starpu_data_handle_t *handleptr, .id = CPPI_INTERFACE_ID, .arraysize = n * CHAMELEON_Element_Size( flttype ), .flttype = flttype, - .has_diag = 0, + .has_diag = -1, .h = -1, .n = n, }; diff --git a/testing/CTestLists.cmake b/testing/CTestLists.cmake index a1b637f681ed0bb82a981e65cd26310b03b514b7..c185e50b525c719a7b62422f89d5d5b9a259c435 100644 --- a/testing/CTestLists.cmake +++ b/testing/CTestLists.cmake @@ -88,28 +88,25 @@ if (NOT CHAMELEON_SIMULATION) if ( CHAMELEON_SCHED_STARPU ) add_test( test_${cat}_${prec}getrf_nopivpercol ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 --diag=ChamUnit -f input/getrf_nopiv.in ) set_tests_properties( test_${cat}_${prec}getrf_nopivpercol - PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=nopivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=1" ) + PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=nopivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=0" ) - add_test( test_${cat}_${prec}getrf_ppivpercol ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf_nopiv.in ) - set_tests_properties( test_${cat}_${prec}getrf_ppivpercol - PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=1" ) + if ( HAVE_STARPU_NONE_NONZERO ) + add_test( test_${cat}_${prec}getrf_ppivpercol ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf_nopiv.in ) + set_tests_properties( test_${cat}_${prec}getrf_ppivpercol + PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=0" ) - if ( ${cat} STREQUAL "shm" ) add_test( test_${cat}_${prec}getrf_ppivpercol_batch ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf_nopiv.in ) set_tests_properties( test_${cat}_${prec}getrf_ppivpercol_batch PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=6" ) - endif() - add_test( test_${cat}_${prec}getrf_ppiv ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf.in ) - set_tests_properties( test_${cat}_${prec}getrf_ppiv - PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=1" ) + add_test( test_${cat}_${prec}getrf_ppiv ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf.in ) + set_tests_properties( test_${cat}_${prec}getrf_ppiv + PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=0" ) - if ( ${cat} STREQUAL "shm" ) add_test( test_${cat}_${prec}getrf_ppiv_batch ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf.in ) set_tests_properties( test_${cat}_${prec}getrf_ppiv_batch - PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=6" ) + PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=6" ) endif() - endif() list( REMOVE_ITEM TESTSTMP print gepdf_qr ) diff --git a/testing/testing_zgetrf.c b/testing/testing_zgetrf.c index dc978bc6f13b224ebe19fdaf3a653ad4e09cd56f..4645631a7a86a72e5c8fe2fa5f8b40e61991e66c 100644 --- a/testing/testing_zgetrf.c +++ b/testing/testing_zgetrf.c @@ -151,8 +151,8 @@ testing_zgetrf_desc( run_arg_list_t *args, int check ) CHAMELEON_zgetrf_WS_Free( ws ); } + CHAMELEON_Ipiv_Destroy( &descIPIV, descA ); parameters_desc_destroy( &descA ); - CHAMELEON_Ipiv_Destroy( &descIPIV ); return hres; }