diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c index ee148020a6833c8ad44a19f4e9f51c708d4af1d2..5d3f25e30fa1bc324eb1c4e796aeabb6857fe931 100644 --- a/compute/pzgetrf.c +++ b/compute/pzgetrf.c @@ -476,10 +476,10 @@ chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws, ipiv, k, A(k, n), Wu(A->myrank, n) ); for(m=k+1; m<A->mt; m++){ - INSERT_TASK_zlaswp_batched( options, m*A->mb, minmn, (void *)ws, ipiv, k, + INSERT_TASK_zlaswp_batched( options, ChamDirForward, m*A->mb, minmn, (void *)ws->laswp, ipiv, k, A(m, n), A(k, n), Wu(A->myrank, n), clargs ); } - INSERT_TASK_zlaswp_batched_flush( options, ipiv, k, A(k, n), Wu(A->myrank, n), clargs ); + INSERT_TASK_zlaswp_batched_flush( options, ChamDirForward, ipiv, k, A(k, n), Wu(A->myrank, n), clargs ); INSERT_TASK_zperm_allreduce_row( options, ChamDirForward, A, Wu(A->myrank, n), ipiv, k, k, n, ws->laswp ); @@ -515,7 +515,7 @@ chameleon_pzgetrf_panel_permute_forward( struct chameleon_pzgetrf_s *ws, } #endif - if ( ws->batch_size_swap > 0 ) { + if ( ws->laswp->batch_size_swap > 0 ) { chameleon_pzgetrf_panel_permute_batched( ws, A, ipiv, k, n, options ); } else { @@ -550,7 +550,7 @@ chameleon_pzgetrf_panel_permute_backward( struct chameleon_pzgetrf_s *ws, } #endif - if ( ws->batch_size_swap > 0 ) { + if ( ws->laswp->batch_size_swap > 0 ) { chameleon_pzgetrf_panel_permute_batched( ws, A, ipiv, k, n, options ); } else { diff --git a/compute/pzlaswp.c b/compute/pzlaswp.c index 2afedfd5a0bf6f0bf1fb38567f4043fcfdc7b5ea..ed672ae397855138aa88e9c69497003238994920 100644 --- a/compute/pzlaswp.c +++ b/compute/pzlaswp.c @@ -7,11 +7,9 @@ * *** * - * @brief Chameleon zlaswp parallel algorithm + * @brief Chameleon zlaswp parallel algorithm for row permutation. 
* * @version 1.3.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Alycia Lisito * @author Matteo Marcos * @date 2025-03-24 @@ -20,8 +18,8 @@ */ #include "control/common.h" -#define A(m,n) A, m, n -#define Wu(m,n) &(ws->W), m, n +#define A(m,n) A, m, n +#define W(m,n) &(ws->W), m, n /** * Permutation of the panel n at step k @@ -46,22 +44,65 @@ chameleon_pzlaswp_panel_permute( struct chameleon_pzlaswp_s *ws, withlacpy = options->withlacpy; options->withlacpy = 1; INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn, - A(k, n), Wu(A->myrank, n) ); + A(k, n), W(A->myrank, n) ); options->withlacpy = withlacpy; INSERT_TASK_zlaswp_get( options, dir, k*A->mb, tempkm, - ipiv, k, A(k, n), Wu(A->myrank, n) ); + ipiv, k, A(k, n), W(A->myrank, n) ); for ( m = k + 1; m < A->mt; m++ ) { /* Extract selected rows into A(k, n) */ INSERT_TASK_zlaswp_get( options, dir, m*A->mb, tempkm, - ipiv, k, A(m, n), Wu(A->myrank, n) ); + ipiv, k, A(m, n), W(A->myrank, n) ); /* Copy rows from A(k,n) into their final position */ INSERT_TASK_zlaswp_set( options, dir, m*A->mb, tempkm, - ipiv, k, A(k, n), A(m, n) ); + ipiv, k, A(k, n), A(m, n) ); } - INSERT_TASK_zperm_allreduce_row( options, dir, A, Wu(A->myrank, n), ipiv, k, k, n, ws ); + INSERT_TASK_zperm_allreduce_row( options, dir, A, W(A->myrank, n), ipiv, k, k, n, ws ); +} + +/** + * Permutation of the panel n at step k + */ +static inline void +chameleon_pzlaswp_panel_permute_batched( struct chameleon_pzlaswp_s *ws, + cham_dir_t dir, + CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int k, + int n, + RUNTIME_option_t *options ) +{ + int m; + int tempkm, tempnn; + int withlacpy; + + void **clargs = malloc( sizeof(char *) ); + *clargs = NULL; + + tempkm = A->get_blkdim( A, k, DIM_m, A->m ); + tempnn = A->get_blkdim( A, n, DIM_n, A->n ); + + /* Extract selected rows into U */ + withlacpy = options->withlacpy; + options->withlacpy = 1; + INSERT_TASK_zlacpy( options, ChamUpperLower, 
tempkm, tempnn, + A(k, n), W(A->myrank, n) ); + options->withlacpy = withlacpy; + + INSERT_TASK_zlaswp_get( options, dir, k*A->mb, tempkm, + ipiv, k, A(k, n), W(A->myrank, n) ); + + for ( m = k + 1; m < A->mt; m++ ) { + INSERT_TASK_zlaswp_batched( options, dir, m*A->mb, tempkm, (void *)ws, ipiv, k, + A(m, n), A(k, n), W(A->myrank, n), clargs ); + } + INSERT_TASK_zlaswp_batched_flush( options, dir, ipiv, k, A(k, n), W(A->myrank, n), clargs ); + + INSERT_TASK_zperm_allreduce_row( options, dir, A, W(A->myrank, n), ipiv, k, k, n, ws ); + + free( clargs ); } static inline void @@ -92,14 +133,19 @@ chameleon_pzlaswp_panel( struct chameleon_pzlaswp_s *ws, } #endif - chameleon_pzlaswp_panel_permute( ws, dir, A, ipiv, k, n, options ); + if ( ws->batch_size_swap == 0 ){ + chameleon_pzlaswp_panel_permute( ws, dir, A, ipiv, k, n, options ); + } + else { + chameleon_pzlaswp_panel_permute_batched( ws, dir, A, ipiv, k, n, options ); + } if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { tempkm = A->get_blkdim( A, k, DIM_m, A->m ); tempnn = A->get_blkdim( A, n, DIM_n, A->n ); INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn, - Wu(A->myrank, n), A(k, n) ); + W(A->myrank, n), A(k, n) ); RUNTIME_data_flush( sequence, A(k, n) ); } (void)reduce; @@ -145,4 +191,3 @@ chameleon_pzlaswp( struct chameleon_pzlaswp_s *ws, } RUNTIME_options_finalize( &options, chamctxt ); } - diff --git a/compute/pzlaswpc.c b/compute/pzlaswpc.c index a3328070bdf6717ed4614f42f9493f8ffdcb36db..a62e28319d0561add56479bec503810611cd1bf0 100644 --- a/compute/pzlaswpc.c +++ b/compute/pzlaswpc.c @@ -18,8 +18,8 @@ */ #include "control/common.h" -#define A(m,n) A, m, n -#define Wc(m,n) &(ws->W), m, n +#define A(m,n) A, m, n +#define W(m,n) &(ws->W), m, n /** * Permutation of the panel n at step k @@ -34,32 +34,75 @@ chameleon_pzlaswpc_panel_permute( struct chameleon_pzlaswp_s *ws, RUNTIME_option_t *options ) { int n; - int tempkn, tempmm; + int tempmm, tempkn; int withlacpy; - tempkn = A->get_blkdim( A, 
k, DIM_n, A->n ); tempmm = A->get_blkdim( A, m, DIM_m, A->m ); + tempkn = A->get_blkdim( A, k, DIM_n, A->n ); /* Extract selected rows into U */ withlacpy = options->withlacpy; options->withlacpy = 1; INSERT_TASK_zlacpy( options, ChamUpperLower, tempmm, tempkn, - A(m, k), Wc(m, A->myrank) ); + A(m, k), W(m, A->myrank) ); options->withlacpy = withlacpy; INSERT_TASK_zlaswpc_get( options, dir, tempkn, k*A->nb, - ipiv, k, A(m, k), Wc(m, A->myrank) ); + ipiv, k, A(m, k), W(m, A->myrank) ); for ( n = k + 1; n < A->nt; n++ ) { /* Extract selected rows into A(k, n) */ INSERT_TASK_zlaswpc_get( options, dir, tempkn, n*A->nb, - ipiv, k, A(m, n), Wc(m, A->myrank) ); + ipiv, k, A(m, n), W(m, A->myrank) ); /* Copy rows from A(k,n) into their final position */ INSERT_TASK_zlaswpc_set( options, dir, tempkn, n*A->nb, - ipiv, k, A(m, k), A(m, n) ); + ipiv, k, A(m, k), A(m, n) ); + } + + INSERT_TASK_zperm_allreduce_col( options, dir, A, W(m, A->myrank), ipiv, k, m, k, ws ); +} + +/** + * Permutation of the panel n at step k + */ +static inline void +chameleon_pzlaswpc_panel_permute_batched( struct chameleon_pzlaswp_s *ws, + cham_dir_t dir, + CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int m, + int k, + RUNTIME_option_t *options ) +{ + int n; + int tempmm, tempkn; + int withlacpy; + + void **clargs = malloc( sizeof(char *) ); + *clargs = NULL; + + tempmm = A->get_blkdim( A, m, DIM_m, A->m ); + tempkn = A->get_blkdim( A, k, DIM_n, A->n ); + + /* Extract selected rows into U */ + withlacpy = options->withlacpy; + options->withlacpy = 1; + INSERT_TASK_zlacpy( options, ChamUpperLower, tempmm, tempkn, + A(m, k), W(m, A->myrank) ); + options->withlacpy = withlacpy; + + INSERT_TASK_zlaswpc_get( options, dir, tempkn, k*A->nb, + ipiv, k, A(m, k), W(m, A->myrank) ); + + for ( n = k + 1; n < A->nt; n++ ) { + INSERT_TASK_zlaswpc_batched( options, dir, n*A->nb, tempkn, (void *)ws, ipiv, k, + A(m, n), A(m, k), W(m, A->myrank), clargs ); } + INSERT_TASK_zlaswpc_batched_flush( options, dir, ipiv, k, A(m, 
k), W(m, A->myrank), clargs ); + + INSERT_TASK_zperm_allreduce_col( options, dir, A, W(m, A->myrank), ipiv, k, m, k, ws ); - INSERT_TASK_zperm_allreduce_col( options, dir, A, Wc(m, A->myrank), ipiv, k, m, k, ws ); + free( clargs ); } static inline void @@ -73,7 +116,7 @@ chameleon_pzlaswpc_panel( struct chameleon_pzlaswp_s *ws, RUNTIME_sequence_t *sequence ) { CHAM_reduce_t *reduce = &(ws->reduce); - int tempkn, tempmm; + int tempmm, tempkn; #if defined(CHAMELEON_USE_MPI) chameleon_get_proc_involved_in_rowpanelk_2dbc( A, m, k, reduce ); @@ -90,14 +133,19 @@ chameleon_pzlaswpc_panel( struct chameleon_pzlaswp_s *ws, } #endif - chameleon_pzlaswpc_panel_permute( ws, dir, A, ipiv, m, k, options ); + if ( ws->batch_size_swap == 0 ){ + chameleon_pzlaswpc_panel_permute( ws, dir, A, ipiv, m, k, options ); + } + else { + chameleon_pzlaswpc_panel_permute_batched( ws, dir, A, ipiv, m, k, options ); + } if ( A->myrank == chameleon_getrankof_2d( A, m, k ) ) { - tempkn = A->get_blkdim( A, k, DIM_n, A->n ); tempmm = A->get_blkdim( A, m, DIM_m, A->m ); + tempkn = A->get_blkdim( A, k, DIM_n, A->n ); INSERT_TASK_zlacpy( options, ChamUpperLower, tempmm, tempkn, - Wc(m, A->myrank), A(m, k) ); + W(m, A->myrank), A(m, k) ); RUNTIME_data_flush( sequence, A(m, k) ); } (void)reduce; @@ -143,4 +191,3 @@ chameleon_pzlaswpc( struct chameleon_pzlaswp_s *ws, } RUNTIME_options_finalize( &options, chamctxt ); } - diff --git a/compute/zgetrf.c b/compute/zgetrf.c index ea1d50bc8c7251ffeb37a0b8c9e0668cfe31792f..8d733241ed174d240078dcb918b3fa3d8c4d0d9c 100644 --- a/compute/zgetrf.c +++ b/compute/zgetrf.c @@ -105,8 +105,9 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A ) ws->batch_size_blas2 = ( ws->batch_size_blas2 > CHAMELEON_BATCH_SIZE ) ? CHAMELEON_BATCH_SIZE : ws->batch_size_blas2; ws->batch_size_blas3 = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_BLAS3", batch_size ); ws->batch_size_blas3 = ( ws->batch_size_blas3 > CHAMELEON_BATCH_SIZE ) ? 
CHAMELEON_BATCH_SIZE : ws->batch_size_blas3; - ws->batch_size_swap = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_SWAP", batch_size ); - ws->batch_size_swap = ( ws->batch_size_swap > CHAMELEON_BATCH_SIZE ) ? CHAMELEON_BATCH_SIZE : ws->batch_size_swap; + + ws->laswp->batch_size_swap = ( ws->laswp->batch_size_swap == 0 ) ? batch_size : ws->laswp->batch_size_swap; + ws->laswp->batch_size_swap = ( ws->laswp->batch_size_swap > CHAMELEON_BATCH_SIZE ) ? CHAMELEON_BATCH_SIZE : ws->laswp->batch_size_swap; ws->ringswitch = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_RINGSWITCH", INT_MAX ); diff --git a/compute/zlaswp.c b/compute/zlaswp.c index f00140efedbd5cc829d73cb581820720bfa0217c..5b0783d9e5ae1b31842919cb38b515c9b32a58d7 100644 --- a/compute/zlaswp.c +++ b/compute/zlaswp.c @@ -85,6 +85,12 @@ CHAMELEON_zlaswp_WS_Alloc( cham_side_t side, const CHAM_desc_t *A ) chameleon_cleanenv( allreduce ); } + ws->batch_size_swap = chameleon_getenv_get_value_int( "CHAMELEON_LASWP_BATCH_SIZE", 0 ); + if ( ws->batch_size_swap > CHAMELEON_BATCH_SIZE ) { + chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_LASWP_BATCH_SIZE must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_LASWP_BATCH_SIZE value\n" ); + ws->batch_size_swap = CHAMELEON_BATCH_SIZE; + } + if ( side == ChamLeft ) { chameleon_desc_init( &(ws->W), CHAMELEON_MAT_ALLOC_TILE, ChamComplexDouble, A->mb, A->nb, A->mb*A->nb, diff --git a/control/compute_z.h b/control/compute_z.h index e747c2ac2203c63266acf4c928ad5a28495e0a86..d7482d0dc69d5910835b2bc81579db7bed04342a 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -44,8 +44,9 @@ struct chameleon_pzgemm_s { * @brief Data structure to handle the LASWP workspaces */ struct chameleon_pzlaswp_s { - CHAM_desc_t W; /**< Workspace used for the row/column permutation. 
*/ - CHAM_reduce_t reduce; /**< Structure for reduction operations */ + CHAM_desc_t W; /**< Workspace used for the row/column permutation. */ + CHAM_reduce_t reduce; /**< Structure for reduction operations */ + int batch_size_swap; /**< Batch size for the permutation */ }; /** @@ -57,7 +58,6 @@ struct chameleon_pzgetrf_s { int ib; /**< Internal blocking parameter */ int batch_size_blas2; /**< Batch size for the blas 2 operations of the panel factorization */ int batch_size_blas3; /**< Batch size for the blas 3 operations of the panel factorization */ - int batch_size_swap; /**< Batch size for the permutation */ int ringswitch; /**< Define when to switch to ring bcast */ CHAM_desc_t U; /**< Workspaces used for the panels permutation in getrf without pivoting */ CHAM_desc_t Up; /**< Workspace used for the panel factorization */ diff --git a/coreblas/compute/core_zlaswp.c b/coreblas/compute/core_zlaswp.c index 0160e5c2e0d6cd0edb8dacb4b772142d79431a3f..b0d6c075055ea23e37a6e9b3cbaaf1ddc3d29216 100644 --- a/coreblas/compute/core_zlaswp.c +++ b/coreblas/compute/core_zlaswp.c @@ -24,8 +24,8 @@ * * @ingroup CORE_CHAMELEON_Complex64_t * - * CORE_zlaswp_get extracts the rows from the tile B that have been selected as - * pivot into the tile A. + * CORE_zlaswp_get extracts the rows from the tile A that have been selected as + * pivot into the tile B. * ******************************************************************************* * @@ -54,7 +54,7 @@ * On entry, a matrix of size ldb-by-n with 0s or already collected * rows. * On exit, B is filled with the selected rows from A, such that for - * each row i, B[i] = A[perm[i]-m0-1]. + * each row i, B[i,:] = A[perm[i]-m0-1,:]. * * @param[in] ldb * The leading dimension of the array B. ldb >= max(1,k). @@ -155,7 +155,7 @@ CORE_zlaswp_get( int m0, int m, int n, int k, * @param[inout] B * On entry, a matrix of size ldb-by-n that may require some pivoted rows. 
* On exit, B is updated with the pivoted rows it needs to receive, such that for - * each row i, A[i] = B[invp[i]-m0-1]. + * each row i, A[i,:] = B[invp[i]-m0-1,:]. * * @param[in] ldb * The leading dimension of the array B. ldb >= max(1,m). diff --git a/coreblas/compute/core_zlaswpc.c b/coreblas/compute/core_zlaswpc.c index 6ba0f627835b08124b61b041a76cced4823baee4..50f6cd684ce2fd24541e6457704259780a0698ec 100644 --- a/coreblas/compute/core_zlaswpc.c +++ b/coreblas/compute/core_zlaswpc.c @@ -23,8 +23,8 @@ * * @ingroup CORE_CHAMELEON_Complex64_t * - * CORE_zlaswpc_get extracts the columns from the tile B that have been selected as - * pivot into the tile A. + * CORE_zlaswpc_get extracts the columns from the tile A that have been selected as + * pivot into the tile B. * ******************************************************************************* * @@ -33,10 +33,10 @@ * belongs to. * * @param[in] m - * The number of rows of the matrix A. + * The number of rows of the matrices A and B. * * @param[in] n - * The number of columns of the matrices A and B. + * The number of columns of the matrix A. * * @param[in] k * The number of columns of the matrix B. This is the number of potential @@ -44,19 +44,19 @@ * * @param[in] A * On entry, the matrix A of dimension lda-by-n where to extract the - * pivot columns if some are selected in the range m0..m0+m. + * pivot columns if some are selected in the range n0..n0+n * * @param[in] lda * The leading dimension of the array A. lda >= max(1,m). * * @param[inout] B - * On entry, a matrix of size ldb-by-n with 0s or already collected + * On entry, a matrix of size ldb-by-k with 0s or already collected * columns. * On exit, B is filled with the selected columns from A, such that for - * each row i, B[i] = A[perm[i]-m0-1]. + * each column i, B[:,i] = A[:,perm[i]-n0-1]. * * @param[in] ldb - * The leading dimension of the array B. ldb >= max(1,k). + * The leading dimension of the array B. ldb >= max(1,m). 
* * @param[in] perm * The permutation array of dimension k. @@ -110,7 +110,6 @@ CORE_zlaswpc_get( int n0, int m, int n, int k, { int idx = perm[i] - n0; - if ( ( idx >= 0 ) && (idx < n ) ) { cblas_zcopy( m, A + idx * lda, 1, @@ -136,10 +135,10 @@ CORE_zlaswpc_get( int n0, int m, int n, int k, * belongs to. * * @param[in] m - * The number of rows of the matrix B. + * The number of rows of the matrices A and B. * * @param[in] n - * The number of columns of the matrices A and B. + * The number of columns of the matrix B. * * @param[in] k * The number of columns of the matrix A. This is the number of potential @@ -150,12 +149,12 @@ CORE_zlaswpc_get( int n0, int m, int n, int k, * pivoted columns. * * @param[in] lda - * The leading dimension of the array A. lda >= max(1,k). + * The leading dimension of the array A. lda >= max(1,m). * * @param[inout] B - * On entry, a matrix of size ldb-by-n that may require some pivoted columns. + * On entry, a matrix of size ldb-by-n that may require some pivoted columns. * On exit, B is updated with the pivoted columns it needs to receive, such that for - * each column i, A[i] = B[invp[i]-m0-1]. + * each column i, A[:,i] = B[:,invp[i]-n0-1]. * * @param[in] ldb * The leading dimension of the array B. ldb >= max(1,m). 
diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index abe3a2c543dfe1e196163f9ae84e8812e97b0bed..916208a605c4182a5272055077429bd4c97178b4 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -209,7 +209,8 @@ void INSERT_TASK_zlaswpc_set( const RUNTIME_option_t *options, cham_dir_t dir, const CHAM_desc_t *tileA, int tileAm, int tileAn, const CHAM_desc_t *tileB, int tileBm, int tileBn ); void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, - int m0, int minmn, + cham_dir_t dir, + int m0, int m, void *ws, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *Am, int Amm, int Amn, @@ -217,10 +218,26 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, const CHAM_desc_t *U, int Um, int Un, void **clargs_ptr ); void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options, + cham_dir_t dir, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *Ak, int Akm, int Akn, const CHAM_desc_t *U, int Um, int Un, void **clargs_ptr ); +void INSERT_TASK_zlaswpc_batched( const RUNTIME_option_t *options, + cham_dir_t dir, + int n0, int n, + void *ws, + const CHAM_ipiv_t *ipiv, int ipivk, + const CHAM_desc_t *An, int Anm, int Ann, + const CHAM_desc_t *Ak, int Akm, int Akn, + const CHAM_desc_t *U, int Um, int Un, + void **clargs_ptr ); +void INSERT_TASK_zlaswpc_batched_flush( const RUNTIME_option_t *options, + cham_dir_t dir, + const CHAM_ipiv_t *ipiv, int ipivk, + const CHAM_desc_t *Ak, int Akm, int Akn, + const CHAM_desc_t *U, int Um, int Un, + void **clargs_ptr ); void INSERT_TASK_zlatro( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int m, int n, int mb, const CHAM_desc_t *A, int Am, int An, diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index 2f1254d2c02993e5b4b96a177e42b4b760fd6020..0d2f52bd30b54580d806cd1bf3158089f3da5153 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -85,6 +85,7 @@ set(CODELETS_ZSRC codelets/codelet_zlaswp.c 
codelets/codelet_zlaswpc.c codelets/codelet_zlaswp_batched.c + codelets/codelet_zlaswpc_batched.c codelets/codelet_zlatro.c codelets/codelet_zlauum.c codelets/codelet_zperm_allreduce.c diff --git a/runtime/openmp/codelets/codelet_zlaswp_batched.c b/runtime/openmp/codelets/codelet_zlaswp_batched.c index 3ce953cf8f888d83b74abe0550dccf381d390df5..f3d922de628f9299434fb63d6c4ba83b3421e6b9 100644 --- a/runtime/openmp/codelets/codelet_zlaswp_batched.c +++ b/runtime/openmp/codelets/codelet_zlaswp_batched.c @@ -19,8 +19,9 @@ #include "chameleon/tasks_z.h" void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, + cham_dir_t dir, int m0, - int minmn, + int m, void *ws, const CHAM_ipiv_t *ipiv, int ipivk, @@ -36,8 +37,9 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, void **clargs_ptr ) { (void)options; + (void)dir; (void)m0; - (void)minmn; + (void)m; (void)ws; (void)ipiv; (void)ipivk; @@ -54,6 +56,7 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, } void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options, + cham_dir_t dir, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *Ak, @@ -65,6 +68,7 @@ void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options, void **clargs_ptr ) { (void)options; + (void)dir; (void)ipiv; (void)ipivk; (void)Ak; diff --git a/runtime/openmp/codelets/codelet_zlaswpc.c b/runtime/openmp/codelets/codelet_zlaswpc.c index eabe24c2a4961a0dcd57b42852d9e2530d85b8c3..08c61b79116f9a0d451ad22589eb644bf0661c60 100644 --- a/runtime/openmp/codelets/codelet_zlaswpc.c +++ b/runtime/openmp/codelets/codelet_zlaswpc.c @@ -2,7 +2,7 @@ * * @file openmp/codelet_zlaswpc.c * - * @copyright 2012-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * @copyright 2025-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** diff --git a/runtime/openmp/codelets/codelet_zlaswpc_batched.c b/runtime/openmp/codelets/codelet_zlaswpc_batched.c new file mode 100644 index 0000000000000000000000000000000000000000..c3cf522ce2f4601b48a44548e9094c2097ba0b63 --- /dev/null +++ b/runtime/openmp/codelets/codelet_zlaswpc_batched.c @@ -0,0 +1,82 @@ +/** + * + * @file openmp/codelet_zlaswpc_batched.c + * + * @copyright 2025-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon OpenMP codelets to apply zlaswp on a panel + * + * @version 1.3.0 + * @author Alycia Lisito + * @author Matteo Marcos + * @date 2024-11-12 + * @precisions normal z -> c d s + * + */ +#include "chameleon_openmp.h" +#include "chameleon/tasks_z.h" + +void INSERT_TASK_zlaswpc_batched( const RUNTIME_option_t *options, + cham_dir_t dir, + int n0, + int n, + void *ws, + const CHAM_ipiv_t *ipiv, + int ipivk, + const CHAM_desc_t *An, + int Anm, + int Ann, + const CHAM_desc_t *Ak, + int Akm, + int Akn, + const CHAM_desc_t *U, + int Um, + int Un, + void **clargs_ptr ) +{ + (void)options; + (void)dir; + (void)n0; + (void)n; + (void)ws; + (void)ipiv; + (void)ipivk; + (void)An; + (void)Anm; + (void)Ann; + (void)Ak; + (void)Akm; + (void)Akn; + (void)U; + (void)Um; + (void)Un; + (void)clargs_ptr; +} + +void INSERT_TASK_zlaswpc_batched_flush( const RUNTIME_option_t *options, + cham_dir_t dir, + const CHAM_ipiv_t *ipiv, + int ipivk, + const CHAM_desc_t *Ak, + int Akm, + int Akn, + const CHAM_desc_t *U, + int Um, + int Un, + void **clargs_ptr ) +{ + (void)options; + (void)dir; + (void)ipiv; + (void)ipivk; + (void)Ak; + (void)Akm; + (void)Akn; + (void)U; + (void)Um; + (void)Un; + (void)clargs_ptr; +} diff --git a/runtime/parsec/codelets/codelet_zlaswp_batched.c b/runtime/parsec/codelets/codelet_zlaswp_batched.c index 646b823aaa926255049ac96e56770023e8b2f08b..db683cded28ce6b36b257e006e76bb9db1fa9b48 100644 --- a/runtime/parsec/codelets/codelet_zlaswp_batched.c +++ 
b/runtime/parsec/codelets/codelet_zlaswp_batched.c @@ -19,8 +19,9 @@ #include "chameleon/tasks_z.h" void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, + cham_dir_t dir, int m0, - int minmn, + int m, void *ws, const CHAM_ipiv_t *ipiv, int ipivk, @@ -36,8 +37,9 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, void **clargs_ptr ) { (void)options; + (void)dir; (void)m0; - (void)minmn; + (void)m; (void)ws; (void)ipiv; (void)ipivk; @@ -54,6 +56,7 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, } void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options, + cham_dir_t dir, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *Ak, @@ -65,6 +68,7 @@ void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options, void **clargs_ptr ) { (void)options; + (void)dir; (void)ipiv; (void)ipivk; (void)Ak; diff --git a/runtime/parsec/codelets/codelet_zlaswpc.c b/runtime/parsec/codelets/codelet_zlaswpc.c index d7d55850738eab5f4201f69e2c280cf7dafff6c5..42a8b03c97a589afd606f31d25de7042734f8f47 100644 --- a/runtime/parsec/codelets/codelet_zlaswpc.c +++ b/runtime/parsec/codelets/codelet_zlaswpc.c @@ -2,7 +2,7 @@ * * @file parsec/codelet_zlaswpc.c * - * @copyright 2012-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * @copyright 2025-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** diff --git a/runtime/parsec/codelets/codelet_zlaswpc_batched.c b/runtime/parsec/codelets/codelet_zlaswpc_batched.c new file mode 100644 index 0000000000000000000000000000000000000000..6336558d6a6a669a6681bf7cbfb17ff1c6aa42e2 --- /dev/null +++ b/runtime/parsec/codelets/codelet_zlaswpc_batched.c @@ -0,0 +1,82 @@ +/** + * + * @file parsec/codelet_zlaswpc_batched.c + * + * @copyright 2025-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. 
+ * + *** + * + * @brief Chameleon Parsec codelets to apply zlaswp on a panel + * + * @version 1.3.0 + * @author Alycia Lisito + * @author Matteo Marcos + * @date 2024-11-12 + * @precisions normal z -> c d s + * + */ +#include "chameleon_parsec.h" +#include "chameleon/tasks_z.h" + +void INSERT_TASK_zlaswpc_batched( const RUNTIME_option_t *options, + cham_dir_t dir, + int n0, + int n, + void *ws, + const CHAM_ipiv_t *ipiv, + int ipivk, + const CHAM_desc_t *An, + int Anm, + int Ann, + const CHAM_desc_t *Ak, + int Akm, + int Akn, + const CHAM_desc_t *U, + int Um, + int Un, + void **clargs_ptr ) +{ + (void)options; + (void)dir; + (void)n0; + (void)n; + (void)ws; + (void)ipiv; + (void)ipivk; + (void)An; + (void)Anm; + (void)Ann; + (void)Ak; + (void)Akm; + (void)Akn; + (void)U; + (void)Um; + (void)Un; + (void)clargs_ptr; +} + +void INSERT_TASK_zlaswpc_batched_flush( const RUNTIME_option_t *options, + cham_dir_t dir, + const CHAM_ipiv_t *ipiv, + int ipivk, + const CHAM_desc_t *Ak, + int Akm, + int Akn, + const CHAM_desc_t *U, + int Um, + int Un, + void **clargs_ptr ) +{ + (void)options; + (void)dir; + (void)ipiv; + (void)ipivk; + (void)Ak; + (void)Akm; + (void)Akn; + (void)U; + (void)Um; + (void)Un; + (void)clargs_ptr; +} diff --git a/runtime/quark/codelets/codelet_zlaswp_batched.c b/runtime/quark/codelets/codelet_zlaswp_batched.c index 2193c0943300f99ed2f0226688dcd3fcf1b0d6db..a285177b717adf9642429e6a902471317f9bf62d 100644 --- a/runtime/quark/codelets/codelet_zlaswp_batched.c +++ b/runtime/quark/codelets/codelet_zlaswp_batched.c @@ -19,8 +19,9 @@ #include "chameleon/tasks_z.h" void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, + cham_dir_t dir, int m0, - int minmn, + int m, void *ws, const CHAM_ipiv_t *ipiv, int ipivk, @@ -36,8 +37,9 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, void **clargs_ptr ) { (void)options; + (void)dir; (void)m0; - (void)minmn; + (void)m; (void)ws; (void)ipiv; (void)ipivk; @@ -54,6 +56,7 @@ void 
INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, } void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options, + cham_dir_t dir, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *Ak, @@ -65,6 +68,7 @@ void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options, void **clargs_ptr ) { (void)options; + (void)dir; (void)ipiv; (void)ipivk; (void)Ak; diff --git a/runtime/quark/codelets/codelet_zlaswpc.c b/runtime/quark/codelets/codelet_zlaswpc.c index 1ef79931f706278e73d3ff628126b03081f4102f..bff7d84200f63add99feb5f2f2544a1ad2540bb8 100644 --- a/runtime/quark/codelets/codelet_zlaswpc.c +++ b/runtime/quark/codelets/codelet_zlaswpc.c @@ -2,7 +2,7 @@ * * @file quark/codelet_zlaswpc.c * - * @copyright 2012-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * @copyright 2025-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** diff --git a/runtime/quark/codelets/codelet_zlaswpc_batched.c b/runtime/quark/codelets/codelet_zlaswpc_batched.c new file mode 100644 index 0000000000000000000000000000000000000000..d5152eababe0d24929a27fef5bb4b9ac615f3b48 --- /dev/null +++ b/runtime/quark/codelets/codelet_zlaswpc_batched.c @@ -0,0 +1,82 @@ +/** + * + * @file quark/codelet_zlaswpc_batched.c + * + * @copyright 2025-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. 
+ * + *** + * + * @brief Chameleon quark codelets to apply zlaswp on a panel + * + * @version 1.3.0 + * @author Alycia Lisito + * @author Matteo Marcos + * @date 2024-11-12 + * @precisions normal z -> c d s + * + */ +#include "chameleon_quark.h" +#include "chameleon/tasks_z.h" + +void INSERT_TASK_zlaswpc_batched( const RUNTIME_option_t *options, + cham_dir_t dir, + int n0, + int n, + void *ws, + const CHAM_ipiv_t *ipiv, + int ipivk, + const CHAM_desc_t *An, + int Anm, + int Ann, + const CHAM_desc_t *Ak, + int Akm, + int Akn, + const CHAM_desc_t *U, + int Um, + int Un, + void **clargs_ptr ) +{ + (void)options; + (void)dir; + (void)n0; + (void)n; + (void)ws; + (void)ipiv; + (void)ipivk; + (void)An; + (void)Anm; + (void)Ann; + (void)Ak; + (void)Akm; + (void)Akn; + (void)U; + (void)Um; + (void)Un; + (void)clargs_ptr; +} + +void INSERT_TASK_zlaswpc_batched_flush( const RUNTIME_option_t *options, + cham_dir_t dir, + const CHAM_ipiv_t *ipiv, + int ipivk, + const CHAM_desc_t *Ak, + int Akm, + int Akn, + const CHAM_desc_t *U, + int Um, + int Un, + void **clargs_ptr ) +{ + (void)options; + (void)dir; + (void)ipiv; + (void)ipivk; + (void)Ak; + (void)Akm; + (void)Akn; + (void)U; + (void)Um; + (void)Un; + (void)clargs_ptr; +} diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt index e7f330d1be0fc38d4e66eb8fa9baf5889ca77dd9..995508105593cf948182355f68eeba307b317969 100644 --- a/runtime/starpu/CMakeLists.txt +++ b/runtime/starpu/CMakeLists.txt @@ -110,9 +110,9 @@ if ( STARPU_FOUND ) #include <starpu.h> int main() { if (STARPU_NONE == 0) - return 1; + return EXIT_FAILURE; else - return 0; + return EXIT_SUCCESS; } ") diff --git a/runtime/starpu/codelets/codelet_zlaswp_batched.c b/runtime/starpu/codelets/codelet_zlaswp_batched.c index 8cc2a3adc593c698f3d79163781f44bd59b92d6e..25b0a3cde6519eb62d6616d46763dfd6941a9f4f 100644 --- a/runtime/starpu/codelets/codelet_zlaswp_batched.c +++ b/runtime/starpu/codelets/codelet_zlaswp_batched.c @@ -11,7 +11,8 @@ * * 
@version 1.3.0 * @author Alycia Lisito - * @date 2024-11-12 + * @author Matteo Marcos + * @date 2025-04-10 * @precisions normal z -> c d s * */ @@ -20,7 +21,7 @@ struct cl_zlaswp_batched_args_s { int tasks_nbr; - int minmn; + int m; int m0[CHAMELEON_BATCH_SIZE]; struct starpu_data_descr handle_mode[CHAMELEON_BATCH_SIZE]; }; @@ -30,21 +31,21 @@ static void cl_zlaswp_batched_cpu_func( void *descr[], void *cl_arg ) { - int i, m0, minmn, *perm, *invp; + int i, m0, m, *permget, *permset; CHAM_tile_t *A, *U, *B; struct cl_zlaswp_batched_args_s *clargs = ( struct cl_zlaswp_batched_args_s * ) cl_arg; - minmn = clargs->minmn; - perm = (int *)STARPU_VECTOR_GET_PTR( descr[0] ); - invp = (int *)STARPU_VECTOR_GET_PTR( descr[1] ); - U = (CHAM_tile_t *) cti_interface_get( descr[2] ); - B = (CHAM_tile_t *) cti_interface_get( descr[3] ); + m = clargs->m; + permget = (int *)STARPU_VECTOR_GET_PTR( descr[0] ); + permset = (int *)STARPU_VECTOR_GET_PTR( descr[1] ); + U = (CHAM_tile_t *) cti_interface_get( descr[2] ); + B = (CHAM_tile_t *) cti_interface_get( descr[3] ); for ( i = 0; i < clargs->tasks_nbr; i++ ) { A = (CHAM_tile_t *) cti_interface_get( descr[ i + 4 ] ); m0 = clargs->m0[ i ]; - TCORE_zlaswp_get( m0, A->m, A->n, minmn, A, U, perm ); - TCORE_zlaswp_set( m0, A->m, A->n, minmn, B, A, invp ); + TCORE_zlaswp_get( m0, A->m, A->n, m, A, U, permget ); + TCORE_zlaswp_set( m0, A->m, A->n, m, B, A, permset ); } } #endif @@ -55,24 +56,18 @@ cl_zlaswp_batched_cpu_func( void *descr[], CODELETS_CPU( zlaswp_batched, cl_zlaswp_batched_cpu_func ) void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, + cham_dir_t dir, int m0, - int minmn, + int m, void *ws, - const CHAM_ipiv_t *ipiv, - int ipivk, - const CHAM_desc_t *Am, - int Amm, - int Amn, - const CHAM_desc_t *Ak, - int Akm, - int Akn, - const CHAM_desc_t *U, - int Um, - int Un, + const CHAM_ipiv_t *ipiv, int ipivk, + const CHAM_desc_t *Am, int Amm, int Amn, + const CHAM_desc_t *Ak, int Akm, int Akn, + const CHAM_desc_t *U, int 
Um, int Un, void **clargs_ptr ) { int task_num = 0; - int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size_swap; + int batch_size = ((struct chameleon_pzlaswp_s *)ws)->batch_size_swap; struct cl_zlaswp_batched_args_s *clargs = *clargs_ptr; if ( Am->get_rankof( Am, Amm, Amn) != Am->myrank ) { return; @@ -81,7 +76,7 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, if( clargs == NULL ) { clargs = malloc( sizeof( struct cl_zlaswp_batched_args_s ) ) ; clargs->tasks_nbr = 0; - clargs->minmn = minmn; + clargs->m = m; *clargs_ptr = clargs; } @@ -92,36 +87,43 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, clargs->tasks_nbr ++; if ( clargs->tasks_nbr == batch_size ) { - INSERT_TASK_zlaswp_batched_flush( options, ipiv, ipivk, Ak, Akm, Akn, U, Um, Un, clargs_ptr ); + INSERT_TASK_zlaswp_batched_flush( options, dir, ipiv, ipivk, Ak, Akm, Akn, U, Um, Un, clargs_ptr ); } } #if defined(CHAMELEON_STARPU_USE_INSERT) void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options, - const CHAM_ipiv_t *ipiv, - int ipivk, - const CHAM_desc_t *Ak, - int Akm, - int Akn, - const CHAM_desc_t *U, - int Um, - int Un, + cham_dir_t dir, + const CHAM_ipiv_t *ipiv, int ipivk, + const CHAM_desc_t *Ak, int Akm, int Akn, + const CHAM_desc_t *U, int Um, int Un, void **clargs_ptr ) { struct cl_zlaswp_batched_args_s *clargs = *clargs_ptr; - int nhandles; + int nhandles; + void *ipiv_handle_get; + void *ipiv_handle_set; if( clargs == NULL ) { return; } + if ( dir == ChamDirForward ){ + ipiv_handle_get = RUNTIME_perm_getaddr( ipiv, ipivk ); + ipiv_handle_set = RUNTIME_invp_getaddr( ipiv, ipivk ); + } + else { + ipiv_handle_get = RUNTIME_invp_getaddr( ipiv, ipivk ); + ipiv_handle_set = RUNTIME_perm_getaddr( ipiv, ipivk ); + } + nhandles = clargs->tasks_nbr; rt_starpu_insert_task( &cl_zlaswp_batched, STARPU_CL_ARGS, clargs, sizeof(struct cl_zlaswp_batched_args_s), - STARPU_R, RUNTIME_perm_getaddr( ipiv, ipivk ), - STARPU_R, RUNTIME_invp_getaddr( 
ipiv, ipivk ), + STARPU_R, ipiv_handle_get, + STARPU_R, ipiv_handle_set, STARPU_RW | STARPU_COMMUTE, RTBLKADDR(U, ChamComplexDouble, Um, Un), STARPU_R, RTBLKADDR(Ak, ChamComplexDouble, Akm, Akn), STARPU_DATA_MODE_ARRAY, clargs->handle_mode, nhandles, @@ -136,24 +138,31 @@ void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options, #else /* defined(CHAMELEON_STARPU_USE_INSERT) */ void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options, - const CHAM_ipiv_t *ipiv, - int ipivk, - const CHAM_desc_t *Ak, - int Akm, - int Akn, - const CHAM_desc_t *U, - int Um, - int Un, + cham_dir_t dir, + const CHAM_ipiv_t *ipiv, int ipivk, + const CHAM_desc_t *Ak, int Akm, int Akn, + const CHAM_desc_t *U, int Um, int Un, void **clargs_ptr ) { - int ret, k; - struct starpu_task *task; + int ret, k; + struct starpu_task *task; struct cl_zlaswp_batched_args_s *myclargs = *clargs_ptr; + void *ipiv_handle_get; + void *ipiv_handle_set; if( myclargs == NULL ) { return; } + if ( dir == ChamDirForward ){ + ipiv_handle_get = RUNTIME_perm_getaddr( ipiv, ipivk ); + ipiv_handle_set = RUNTIME_invp_getaddr( ipiv, ipivk ); + } + else { + ipiv_handle_get = RUNTIME_invp_getaddr( ipiv, ipivk ); + ipiv_handle_set = RUNTIME_perm_getaddr( ipiv, ipivk ); + } + INSERT_TASK_COMMON_PARAMETERS( zlaswp_batched, myclargs->tasks_nbr + 4 ); /* @@ -161,10 +170,10 @@ void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options, */ starpu_cham_exchange_init_params( options, ¶ms, Ak->myrank ); starpu_cham_exchange_handle_before_execution( options, ¶ms, &nbdata, descrs, - RUNTIME_perm_getaddr( ipiv, ipivk ), + ipiv_handle_get, STARPU_R ); starpu_cham_exchange_handle_before_execution( options, ¶ms, &nbdata, descrs, - RUNTIME_invp_getaddr( ipiv, ipivk ), + ipiv_handle_set, STARPU_R ); starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( U, ChamComplexDouble, Um, Un ), STARPU_RW | STARPU_COMMUTE ); diff --git a/runtime/starpu/codelets/codelet_zlaswpc.c 
b/runtime/starpu/codelets/codelet_zlaswpc.c index 6b468fb8b62cb39de982b13ec7f6b6b4ce84037c..ac2ca5108431785ad8faf2609a2401ab0417ff04 100644 --- a/runtime/starpu/codelets/codelet_zlaswpc.c +++ b/runtime/starpu/codelets/codelet_zlaswpc.c @@ -2,7 +2,7 @@ * * @file starpu/codelet_zlaswpc.c * - * @copyright 2012-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * @copyright 2025-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** diff --git a/runtime/starpu/codelets/codelet_zlaswpc_batched.c b/runtime/starpu/codelets/codelet_zlaswpc_batched.c new file mode 100644 index 0000000000000000000000000000000000000000..8f59e7eb33e7564e79239dedcc36860478e1b1d6 --- /dev/null +++ b/runtime/starpu/codelets/codelet_zlaswpc_batched.c @@ -0,0 +1,214 @@ +/** + * + * @file starpu/codelet_zlaswpc_batched.c + * + * @copyright 2025-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon StarPU codelets to apply zlaswp on a row panel + * + * @version 1.3.0 + * @author Alycia Lisito + * @author Matteo Marcos + * @date 2025-04-10 + * @precisions normal z -> c d s + * + */ +#include "chameleon_starpu_internal.h" +#include "runtime_codelet_z.h" + +struct cl_zlaswpc_batched_args_s { + int tasks_nbr; + int n; + int n0[CHAMELEON_BATCH_SIZE]; + struct starpu_data_descr handle_mode[CHAMELEON_BATCH_SIZE]; +}; + +#if !defined(CHAMELEON_SIMULATION) +static void +cl_zlaswpc_batched_cpu_func( void *descr[], + void *cl_arg ) +{ + int i, n0, n, *permget, *permset; + CHAM_tile_t *A, *U, *B; + struct cl_zlaswpc_batched_args_s *clargs = ( struct cl_zlaswpc_batched_args_s * ) cl_arg; + + n = clargs->n; + permget = (int *)STARPU_VECTOR_GET_PTR( descr[0] ); + permset = (int *)STARPU_VECTOR_GET_PTR( descr[1] ); + U = (CHAM_tile_t *) cti_interface_get( descr[2] ); + B = (CHAM_tile_t *) cti_interface_get( descr[3] ); + + for ( i = 0; i < clargs->tasks_nbr; i++ ) { + A = (CHAM_tile_t *) cti_interface_get( 
descr[ i + 4 ] ); + n0 = clargs->n0[ i ]; + TCORE_zlaswpc_get( n0, A->m, A->n, n, A, U, permget ); + TCORE_zlaswpc_set( n0, A->m, A->n, n, B, A, permset ); + } +} +#endif + +/* + * Codelet definition + */ +CODELETS_CPU( zlaswpc_batched, cl_zlaswpc_batched_cpu_func ) + +void INSERT_TASK_zlaswpc_batched( const RUNTIME_option_t *options, + cham_dir_t dir, + int n0, + int n, + void *ws, + const CHAM_ipiv_t *ipiv, int ipivk, + const CHAM_desc_t *An, int Anm, int Ann, + const CHAM_desc_t *Ak, int Akm, int Akn, + const CHAM_desc_t *U, int Um, int Un, + void **clargs_ptr ) +{ + int task_num = 0; + int batch_size = ((struct chameleon_pzlaswp_s *)ws)->batch_size_swap; + struct cl_zlaswpc_batched_args_s *clargs = *clargs_ptr; + if ( An->get_rankof( An, Anm, Ann) != An->myrank ) { + return; + } + + if( clargs == NULL ) { + clargs = malloc( sizeof( struct cl_zlaswpc_batched_args_s ) ) ; + clargs->tasks_nbr = 0; + clargs->n = n; + *clargs_ptr = clargs; + } + + task_num = clargs->tasks_nbr; + clargs->n0[ task_num ] = n0; + clargs->handle_mode[ task_num ].handle = RTBLKADDR(An, CHAMELEON_Complex64_t, Anm, Ann); + clargs->handle_mode[ task_num ].mode = STARPU_RW; + clargs->tasks_nbr ++; + + if ( clargs->tasks_nbr == batch_size ) { + INSERT_TASK_zlaswpc_batched_flush( options, dir, ipiv, ipivk, Ak, Akm, Akn, U, Um, Un, clargs_ptr ); + } +} + +#if defined(CHAMELEON_STARPU_USE_INSERT) + +void INSERT_TASK_zlaswpc_batched_flush( const RUNTIME_option_t *options, + cham_dir_t dir, + const CHAM_ipiv_t *ipiv, int ipivk, + const CHAM_desc_t *Ak, int Akm, int Akn, + const CHAM_desc_t *U, int Um, int Un, + void **clargs_ptr ) +{ + struct cl_zlaswpc_batched_args_s *clargs = *clargs_ptr; + int nhandles; + void *ipiv_handle_get; + void *ipiv_handle_set; + + if( clargs == NULL ) { + return; + } + + if ( dir == ChamDirForward ){ + ipiv_handle_get = RUNTIME_perm_getaddr( ipiv, ipivk ); + ipiv_handle_set = RUNTIME_invp_getaddr( ipiv, ipivk ); + } + else { + ipiv_handle_get = RUNTIME_invp_getaddr( 
ipiv, ipivk ); + ipiv_handle_set = RUNTIME_perm_getaddr( ipiv, ipivk ); + } + + nhandles = clargs->tasks_nbr; + rt_starpu_insert_task( + &cl_zlaswpc_batched, + STARPU_CL_ARGS, clargs, sizeof(struct cl_zlaswpc_batched_args_s), + STARPU_R, ipiv_handle_get, + STARPU_R, ipiv_handle_set, + STARPU_RW | STARPU_COMMUTE, RTBLKADDR(U, ChamComplexDouble, Um, Un), + STARPU_R, RTBLKADDR(Ak, ChamComplexDouble, Akm, Akn), + STARPU_DATA_MODE_ARRAY, clargs->handle_mode, nhandles, + STARPU_PRIORITY, options->priority, + STARPU_EXECUTE_ON_WORKER, options->workerid, + 0 ); + + /* clargs is freed by starpu. */ + *clargs_ptr = NULL; +} + +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ + +void INSERT_TASK_zlaswpc_batched_flush( const RUNTIME_option_t *options, + cham_dir_t dir, + const CHAM_ipiv_t *ipiv, int ipivk, + const CHAM_desc_t *Ak, int Akm, int Akn, + const CHAM_desc_t *U, int Um, int Un, + void **clargs_ptr ) +{ + int ret, k; + struct starpu_task *task; + struct cl_zlaswpc_batched_args_s *myclargs = *clargs_ptr; + void *ipiv_handle_get; + void *ipiv_handle_set; + + if( myclargs == NULL ) { + return; + } + + if ( dir == ChamDirForward ){ + ipiv_handle_get = RUNTIME_perm_getaddr( ipiv, ipivk ); + ipiv_handle_set = RUNTIME_invp_getaddr( ipiv, ipivk ); + } + else { + ipiv_handle_get = RUNTIME_invp_getaddr( ipiv, ipivk ); + ipiv_handle_set = RUNTIME_perm_getaddr( ipiv, ipivk ); + } + + INSERT_TASK_COMMON_PARAMETERS( zlaswpc_batched, myclargs->tasks_nbr + 4 ); + + /* + * Register the data handles, might need to receive perm and invp + */ + starpu_cham_exchange_init_params( options, ¶ms, Ak->myrank ); + starpu_cham_exchange_handle_before_execution( options, ¶ms, &nbdata, descrs, + ipiv_handle_get, + STARPU_R ); + starpu_cham_exchange_handle_before_execution( options, ¶ms, &nbdata, descrs, + ipiv_handle_set, + STARPU_R ); + starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( U, ChamComplexDouble, Um, Un ), + STARPU_RW | STARPU_COMMUTE ); + starpu_cham_register_descr( &nbdata, 
descrs, RTBLKADDR( Ak, ChamComplexDouble, Akm, Akn ), STARPU_R ); + for ( k = 0; k < myclargs->tasks_nbr; k++ ) { + starpu_cham_register_descr( &nbdata, descrs, myclargs->handle_mode[ k ].handle, STARPU_RW ); + } + + task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + task->cl_arg = myclargs; + task->cl_arg_size = sizeof( struct cl_zlaswpc_batched_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, NULL ); + + /* Flops */ + task->flops = 0.; + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zlaswpc_batched", "Failed to submit the task to StarPU" ); + return; + } + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); + + /* clargs is freed by starpu. */ + *clargs_ptr = NULL; + (void)clargs; + (void)cl_name; +} + +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ diff --git a/testing/CTestLists.cmake b/testing/CTestLists.cmake index e04ed9503b376caa599e5d92fa905b53b74a7f2d..145838d04e6b6faa7bf2dccb082196eef73f3192 100644 --- a/testing/CTestLists.cmake +++ b/testing/CTestLists.cmake @@ -113,6 +113,10 @@ if (NOT CHAMELEON_SIMULATION) add_test( test_${cat}_${prec}laswp ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/laswp.in ) add_test( test_${cat}_${prec}getrs ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrs.in ) add_test( test_${cat}_${prec}gesv ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/gesv.in ) + + add_test( test_${cat}_${prec}laswp_batch ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/laswp.in ) + set_tests_properties( test_${cat}_${prec}laswp_batch + PROPERTIES ENVIRONMENT "CHAMELEON_LASWP_BATCH_SIZE=3" ) if ( ${cat} STREQUAL "mpi" ) add_test( test_${cat}_${prec}laswp_ppiv_comm_with_task ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P ${NP} -f input/laswp.in ) add_test( 
test_${cat}_${prec}getrs_ppiv_comm_with_task ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P ${NP} -f input/getrs.in )