diff --git a/CMakeLists.txt b/CMakeLists.txt index f90423ad4ea2e96edb6190820b5c2e5a8c875e2b..44b3fff3f6699f82de1df192d6980aa0f137c7b0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -306,6 +306,9 @@ cmake_dependent_option(CHAMELEON_TESTINGS_VENDOR "Generate testings of the blas option(CHAMELEON_KERNELS_MT "Use multithreaded kernels (e.g. intel MKL MT)" OFF) #------------------------------------------------------------------------------ +# Option for the maximum batch size +set(CHAMELEON_BATCH_SIZE 10 CACHE STRING "Maximum size for the batched kernels") + ############################################################################### # Build dependency HQR library # ################################ diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c index 99152bc04575e72ab83819df54472161b57406f6..e6f3d107a911b0a29ed40fffe204f7f84ad5d259 100644 --- a/compute/pzgetrf.c +++ b/compute/pzgetrf.c @@ -157,6 +157,57 @@ chameleon_pzgetrf_panel_facto_percol( struct chameleon_pzgetrf_s *ws, RUNTIME_ipiv_flushk( options->sequence, ipiv, k ); } +/* + * Factorization of panel k - dynamic scheduling - batched version / stock + */ +static inline void +chameleon_pzgetrf_panel_facto_percol_batched( struct chameleon_pzgetrf_s *ws, + CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int k, + RUNTIME_option_t *options ) +{ + int m, h; + int tempkm, tempkn, tempmm, minmn; + void **clargs = malloc( sizeof(char *) ); + memset( clargs, 0, sizeof(char *) ); + + tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; + tempkn = k == A->nt-1 ? 
A->n-k*A->nb : A->nb; + minmn = chameleon_min( tempkm, tempkn ); + + /* Update the number of column */ + ipiv->n = minmn; + + /* + * Algorithm per column with pivoting (no recursion) + */ + /* Iterate on current panel column */ + /* Since index h scales column h-1, we need to iterate up to minmn (included) */ + for ( h = 0; h <= minmn; h++ ) { + + INSERT_TASK_zgetrf_percol_diag( options, tempkm, tempkn, h, k * A->mb, A(k, k), ipiv ); + + for ( m = k+1; m < A->mt; m++ ) { + tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb; + INSERT_TASK_zgetrf_panel_offdiag_batched( options, tempmm, tempkn, h, m * A->mb, + (void *)ws, A(m, k), clargs, ipiv ); + } + INSERT_TASK_zgetrf_panel_offdiag_batched_flush( options, A, k, clargs, ipiv ); + + if ( h < minmn ) { + /* Reduce globally (between MPI processes) */ + INSERT_TASK_ipiv_reducek( options, ipiv, k, h ); + } + } + + free( clargs ); + + /* Flush temporary data used for the pivoting */ + INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k ); + RUNTIME_ipiv_flushk( options->sequence, ipiv, k ); +} + static inline void chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, @@ -221,6 +272,74 @@ chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws, RUNTIME_ipiv_flushk( options->sequence, ipiv, k ); } +/* + * Factorization of panel k - dynamic scheduling - batched version / stock + */ +static inline void +chameleon_pzgetrf_panel_facto_blocked_batched( struct chameleon_pzgetrf_s *ws, + CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int k, + RUNTIME_option_t *options ) +{ + int m, h, b, nbblock, hmax, j; + int tempkm, tempkn, tempmm, minmn; + void **clargs = malloc( sizeof(char *) * A->p ); + memset( clargs, 0, sizeof(char *) * A->p ); + + tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; + tempkn = k == A->nt-1 ? 
A->n-k*A->nb : A->nb; + minmn = chameleon_min( tempkm, tempkn ); + + /* Update the number of column */ + ipiv->n = minmn; + nbblock = chameleon_ceil( minmn, ws->ib ); + + /* + * Algorithm per column with pivoting (no recursion) + */ + /* Iterate on current panel column */ + /* Since index h scales column h-1, we need to iterate up to minmn (included) */ + for ( b = 0; b < nbblock; b++ ) { + hmax = b == nbblock-1 ? minmn + 1 - b * ws->ib : ws->ib; + + for ( h = 0; h < hmax; h++ ) { + j = h + b * ws->ib; + + INSERT_TASK_zgetrf_panel_blocked_batched( options, tempkm, tempkn, j, k * A->mb, (void *)ws, + A(k, k), Up(k, k), clargs, ipiv ); + + for ( m = k + 1; m < A->mt; m++ ) { + tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb; + INSERT_TASK_zgetrf_panel_blocked_batched( options, tempmm, tempkn, j, m * A->mb, + (void *)ws, A(m, k), Up(k, k), clargs, ipiv ); + } + INSERT_TASK_zgetrf_panel_blocked_batched_flush( options, A, k, + Up(k, k), clargs, ipiv ); + + if ( (b < (nbblock-1)) && (h == hmax-1) ) { + INSERT_TASK_zgetrf_blocked_trsm( + options, + ws->ib, tempkn, b * ws->ib + hmax, ws->ib, + Up(k, k), + ipiv ); + } + + assert( j <= minmn ); + if ( j < minmn ) { + /* Reduce globally (between MPI processes) */ + INSERT_TASK_ipiv_reducek( options, ipiv, k, j ); + } + } + } + + free( clargs ); + + /* Flush temporary data used for the pivoting */ + INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k ); + RUNTIME_ipiv_flushk( options->sequence, ipiv, k ); +} + static inline void chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, @@ -235,11 +354,21 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws, break; case ChamGetrfPPivPerColumn: - chameleon_pzgetrf_panel_facto_percol( ws, A, ipiv, k, options ); + if ( ws->batch_size > 1 ) { + chameleon_pzgetrf_panel_facto_percol_batched( ws, A, ipiv, k, options ); + } + else { + chameleon_pzgetrf_panel_facto_percol( ws, A, ipiv, k, options ); + } break; case ChamGetrfPPiv: 
- chameleon_pzgetrf_panel_facto_blocked( ws, A, ipiv, k, options );
+ if ( ws->batch_size > 1 ) {
+ chameleon_pzgetrf_panel_facto_blocked_batched( ws, A, ipiv, k, options );
+ }
+ else {
+ chameleon_pzgetrf_panel_facto_blocked( ws, A, ipiv, k, options );
+ }
 break;
 case ChamGetrfNoPiv:
diff --git a/compute/zgetrf.c b/compute/zgetrf.c
index a1887f8322b50bc316f69aae38274e777a2a3606..47d98cf7862754bdb691ab326afc7818adf65d3a 100644
--- a/compute/zgetrf.c
+++ b/compute/zgetrf.c
@@ -88,6 +88,16 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A )
 chameleon_cleanenv( algostr );
 }
+ ws->batch_size = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE", 1 );
+ if ( ws->batch_size > CHAMELEON_BATCH_SIZE ) {
+ chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE value\n" );
+ ws->batch_size = CHAMELEON_BATCH_SIZE;
+ }
+ if ( (ws->batch_size > 1) && (CHAMELEON_Comm_size() > 1) ) {
+ chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE is unavailable in distributed, value forced to 1\n" );
+ ws->batch_size = 1;
+ }
+
 /* Allocation of U for permutation of the panels */
 if ( ws->alg == ChamGetrfNoPivPerColumn ) {
 chameleon_desc_init( &(ws->U), CHAMELEON_MAT_ALLOC_TILE,
@@ -96,7 +106,7 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A )
 A->mt, A->nt * A->nb, A->p, A->q,
 NULL, NULL, A->get_rankof_init, A->get_rankof_init_arg );
 }
- else if ( ( ws->alg == ChamGetrfPPiv ) ||
+ else if ( ( ws->alg == ChamGetrfPPiv ) ||
 ( ws->alg == ChamGetrfPPivPerColumn ) )
 {
 chameleon_desc_init( &(ws->U), CHAMELEON_MAT_ALLOC_TILE,
@@ -108,13 +118,14 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A )
 /* Set ib to 1 if per column algorithm */
 if ( ( ws->alg == ChamGetrfNoPivPerColumn ) ||
- ( ws->alg == ChamGetrfPPivPerColumn ) )
+ ( ws->alg == ChamGetrfPPivPerColumn ) ) {
 ws->ib = 1;
 }
 /* Allocation of Up for the
permutation of the diagonal panel per block */ - if ( ws->alg == ChamGetrfPPiv ) { + if ( ws->alg == ChamGetrfPPiv ) + { /* TODO: Should be restricted to diagonal tiles */ /* Possibly to a single handle with a permutation of the ownership */ chameleon_desc_init( &(ws->Up), CHAMELEON_MAT_ALLOC_TILE, @@ -157,7 +168,8 @@ CHAMELEON_zgetrf_WS_Free( void *user_ws ) { chameleon_desc_destroy( &(ws->U) ); } - if ( ws->alg == ChamGetrfPPiv ) { + if ( ws->alg == ChamGetrfPPiv ) + { chameleon_desc_destroy( &(ws->Up) ); } free( ws ); diff --git a/control/compute_z.h b/control/compute_z.h index 2d0923155952c1e26782e2d6c50e4bcb2b6922a5..645018f833f835cccf50ea9f25858d5410e8fce2 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -43,7 +43,8 @@ struct chameleon_pzgemm_s { */ struct chameleon_pzgetrf_s { cham_getrf_t alg; - int ib; /* Internal blocking parameter */ + int ib; /**< Internal blocking parameter */ + int batch_size; /**< Batch size for the panel */ CHAM_desc_t U; CHAM_desc_t Up; }; diff --git a/include/chameleon/config.h.in b/include/chameleon/config.h.in index c9ddebbfbd6f88e546adf73c1ec7dda84448578e..49885a8993781dfd8e0454862f91792c96e21688 100644 --- a/include/chameleon/config.h.in +++ b/include/chameleon/config.h.in @@ -79,6 +79,9 @@ /* chameleon compute */ #cmakedefine CHAMELEON_COPY_DIAG +/* Define the maximum batch size for kernels using it */ +#define CHAMELEON_BATCH_SIZE @CHAMELEON_BATCH_SIZE@ + /* chameleon runtime starpu */ #cmakedefine CHAMELEON_ENABLE_PRUNING_STATS diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index eb855ec34be8d002a9d99ab63276b32c722edb4b..b330ec7d840bb3136f8575e240bed5b8a9bc5847 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -529,6 +529,32 @@ void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options, CHAM_desc_t *U, int Um, int Un, CHAM_ipiv_t *ws ); +void INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options, + int m, int n, int h, int m0, 
+ void *ws,
+ CHAM_desc_t *A, int Am, int An,
+ void **clargs_ptr,
+ CHAM_ipiv_t *ipiv );
+
+void INSERT_TASK_zgetrf_panel_offdiag_batched_flush( const RUNTIME_option_t *options,
+ CHAM_desc_t *A, int An,
+ void **clargs_ptr,
+ CHAM_ipiv_t *ipiv );
+
+void INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options,
+ int m, int n, int h, int m0,
+ void *ws,
+ CHAM_desc_t *A, int Am, int An,
+ CHAM_desc_t *U, int Um, int Un,
+ void **clargs_ptr,
+ CHAM_ipiv_t *ipiv );
+
+void INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options,
+ CHAM_desc_t *A, int An,
+ CHAM_desc_t *U, int Um, int Un,
+ void **clargs_ptr,
+ CHAM_ipiv_t *ipiv );
+
 void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options,
 int m, int n, int h, int ib,
 CHAM_desc_t *U, int Um, int Un,
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index e834403398f8bfc2e370d6b475e26e7d89c9312a..f7203fbe500d517ea64251ea198600944ce9291c 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -68,6 +68,7 @@ set(CODELETS_ZSRC
 codelets/codelet_zgetrf_nopiv.c
 codelets/codelet_zgetrf_nopiv_percol.c
 codelets/codelet_zgetrf_percol.c
+ codelets/codelet_zgetrf_batched.c
 codelets/codelet_zgetrf_blocked.c
 codelets/codelet_zhe2ge.c
 codelets/codelet_zherfb.c
diff --git a/runtime/openmp/codelets/codelet_zgetrf_batched.c b/runtime/openmp/codelets/codelet_zgetrf_batched.c
new file mode 100644
index 0000000000000000000000000000000000000000..c375bc8a976891d801208e639d74ca3d37677293
--- /dev/null
+++ b/runtime/openmp/codelets/codelet_zgetrf_batched.c
@@ -0,0 +1,102 @@
+/**
+ *
+ * @file openmp/codelet_zgetrf_batched.c
+ *
+ * @copyright 2009-2014 The University of Tennessee and The University of
+ * Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ * Univ. Bordeaux. All rights reserved.
+ * + *** + * + * @brief Chameleon zpanel batched OpenMP codelets + * + * @version 1.2.0 + * @comment Codelets to perform batched panel factorization with partial pivoting + * + * @author Alycia Lisito + * @date 2024-04-02 + * @precisions normal z -> c d s + * + */ +#include "chameleon_openmp.h" +#include "chameleon/tasks_z.h" + +void +INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options, + int m, int n, int h, int m0, + void *ws, + CHAM_desc_t *A, int Am, int An, + void **clargs_ptr, + CHAM_ipiv_t *ipiv ) +{ + assert( 0 ); + (void)options; + (void)m; + (void)n; + (void)h; + (void)m0; + (void)A; + (void)Am; + (void)An; + (void)clargs_ptr; + (void)ipiv; +} + +void +INSERT_TASK_zgetrf_panel_offdiag_batched_flush( const RUNTIME_option_t *options, + CHAM_desc_t *A, int An, + void **clargs_ptr, + CHAM_ipiv_t *ipiv ) +{ + assert( 0 ); + (void)options; + (void)A; + (void)An; + (void)clargs_ptr; + (void)ipiv; +} + +void +INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options, + int m, int n, int h, int m0, + void *ws, + CHAM_desc_t *A, int Am, int An, + CHAM_desc_t *U, int Um, int Un, + void **clargs_ptr, + CHAM_ipiv_t *ipiv ) +{ + assert( 0 ); + (void)options; + (void)m; + (void)n; + (void)h; + (void)m0; + (void)ws; + (void)A; + (void)Am; + (void)An; + (void)U; + (void)Um; + (void)Un; + (void)clargs_ptr; + (void)ipiv; +} + +void +INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options, + CHAM_desc_t *A, int An, + CHAM_desc_t *U, int Um, int Un, + void **clargs_ptr, + CHAM_ipiv_t *ipiv ) +{ + assert( 0 ); + (void)options; + (void)A; + (void)An; + (void)U; + (void)Um; + (void)Un; + (void)clargs_ptr; + (void)ipiv; +} diff --git a/runtime/parsec/codelets/codelet_zgetrf_batched.c b/runtime/parsec/codelets/codelet_zgetrf_batched.c new file mode 100644 index 0000000000000000000000000000000000000000..8ee40464e5975238d55903442a93b0477f599b99 --- /dev/null +++ b/runtime/parsec/codelets/codelet_zgetrf_batched.c @@ -0,0 
+1,102 @@ +/** + * + * @file starpu/codelet_zgetrf_batched.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zpanel batched parsec codelets + * + * @version 1.2.0 + * @comment Codelets to perform batched panel factorization with partial pivoting + * + * @author Alycia Lisito + * @date 2024-04-02 + * @precisions normal z -> c d s + * + */ +#include "chameleon_parsec.h" +#include "chameleon/tasks_z.h" + +void +INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options, + int m, int n, int h, int m0, + void *ws, + CHAM_desc_t *A, int Am, int An, + void **clargs_ptr, + CHAM_ipiv_t *ipiv ) +{ + assert( 0 ); + (void)options; + (void)m; + (void)n; + (void)h; + (void)m0; + (void)A; + (void)Am; + (void)An; + (void)clargs_ptr; + (void)ipiv; +} + +void +INSERT_TASK_zgetrf_panel_offdiag_batched_flush( const RUNTIME_option_t *options, + CHAM_desc_t *A, int An, + void **clargs_ptr, + CHAM_ipiv_t *ipiv ) +{ + assert( 0 ); + (void)options; + (void)A; + (void)An; + (void)clargs_ptr; + (void)ipiv; +} + +void +INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options, + int m, int n, int h, int m0, + void *ws, + CHAM_desc_t *A, int Am, int An, + CHAM_desc_t *U, int Um, int Un, + void **clargs_ptr, + CHAM_ipiv_t *ipiv ) +{ + assert( 0 ); + (void)options; + (void)m; + (void)n; + (void)h; + (void)m0; + (void)ws; + (void)A; + (void)Am; + (void)An; + (void)U; + (void)Um; + (void)Un; + (void)clargs_ptr; + (void)ipiv; +} + +void +INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options, + CHAM_desc_t *A, int An, + CHAM_desc_t *U, int Um, int Un, + void **clargs_ptr, + CHAM_ipiv_t *ipiv ) +{ + assert( 0 ); + (void)options; + (void)A; + (void)An; + (void)U; + (void)Um; + (void)Un; + (void)clargs_ptr; + 
(void)ipiv; +} diff --git a/runtime/quark/codelets/codelet_zgetrf_batched.c b/runtime/quark/codelets/codelet_zgetrf_batched.c new file mode 100644 index 0000000000000000000000000000000000000000..d1b689d0cbabd8da168e80ad25577c35fad3b7b3 --- /dev/null +++ b/runtime/quark/codelets/codelet_zgetrf_batched.c @@ -0,0 +1,102 @@ +/** + * + * @file starpu/codelet_zgetrf_batched.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zpanel batched quark codelets + * + * @version 1.2.0 + * @comment Codelets to perform batched panel factorization with partial pivoting + * + * @author Alycia Lisito + * @date 2024-04-02 + * @precisions normal z -> c d s + * + */ +#include "chameleon_quark.h" +#include "chameleon/tasks_z.h" + +void +INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options, + int m, int n, int h, int m0, + void *ws, + CHAM_desc_t *A, int Am, int An, + void **clargs_ptr, + CHAM_ipiv_t *ipiv ) +{ + assert( 0 ); + (void)options; + (void)m; + (void)n; + (void)h; + (void)m0; + (void)A; + (void)Am; + (void)An; + (void)clargs_ptr; + (void)ipiv; +} + +void +INSERT_TASK_zgetrf_panel_offdiag_batched_flush( const RUNTIME_option_t *options, + CHAM_desc_t *A, int An, + void **clargs_ptr, + CHAM_ipiv_t *ipiv ) +{ + assert( 0 ); + (void)options; + (void)A; + (void)An; + (void)clargs_ptr; + (void)ipiv; +} + +void +INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options, + int m, int n, int h, int m0, + void *ws, + CHAM_desc_t *A, int Am, int An, + CHAM_desc_t *U, int Um, int Un, + void **clargs_ptr, + CHAM_ipiv_t *ipiv ) +{ + assert( 0 ); + (void)options; + (void)m; + (void)n; + (void)h; + (void)m0; + (void)ws; + (void)A; + (void)Am; + (void)An; + (void)U; + (void)Um; + (void)Un; + (void)clargs_ptr; + (void)ipiv; +} + +void 
+INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options, + CHAM_desc_t *A, int An, + CHAM_desc_t *U, int Um, int Un, + void **clargs_ptr, + CHAM_ipiv_t *ipiv ) +{ + assert( 0 ); + (void)options; + (void)A; + (void)An; + (void)U; + (void)Um; + (void)Un; + (void)clargs_ptr; + (void)ipiv; +} diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c index bfbbcff9e866c48f97d86601a2938ce39f8b7cc4..4321e152f34ece6f99c50356b022beb8025dbe04 100644 --- a/runtime/starpu/codelets/codelet_zgemm.c +++ b/runtime/starpu/codelets/codelet_zgemm.c @@ -85,7 +85,7 @@ static void cl_zgemm_hip_func( void *descr[], void *cl_arg ) { struct cl_zgemm_args_s *clargs = (struct cl_zgemm_args_s *)cl_arg; - hipblasHandle_t handle = starpu_hipblas_get_local_handle(); + hipblasHandle_t handle = starpu_hipblas_get_local_handle(); CHAM_tile_t *tileA; CHAM_tile_t *tileB; CHAM_tile_t *tileC; diff --git a/runtime/starpu/codelets/codelet_zgetrf_batched.c b/runtime/starpu/codelets/codelet_zgetrf_batched.c new file mode 100644 index 0000000000000000000000000000000000000000..eb9468198a6b0f639a150ac54ebb3e7b48f51a6e --- /dev/null +++ b/runtime/starpu/codelets/codelet_zgetrf_batched.c @@ -0,0 +1,358 @@ +/** + * + * @file starpu/codelet_zgetrf_batched.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. 
+ * + *** + * + * @brief Chameleon zpanel batched StarPU codelets + * + * @version 1.2.0 + * @comment Codelets to perform batched panel factorization with partial pivoting + * + * @author Matthieu Kuhn + * @author Alycia Lisito + * @date 2024-01-11 + * @precisions normal z -> c d s + * + */ +#include "chameleon_starpu.h" +#include "runtime_codelet_z.h" +#include <coreblas/cblas_wrapper.h> + +struct cl_getrf_batched_args_t { + char *cl_name; + int tasks_nbr; + int diag; + int h; + int ib; + int m[CHAMELEON_BATCH_SIZE]; + int n[CHAMELEON_BATCH_SIZE]; + int m0[CHAMELEON_BATCH_SIZE]; + struct starpu_data_descr handle_mode[CHAMELEON_BATCH_SIZE]; +}; + +#if !defined(CHAMELEON_SIMULATION) +static void +cl_zgetrf_panel_offdiag_batched_cpu_func( void *descr[], + void *cl_arg ) +{ + struct cl_getrf_batched_args_t *clargs = (struct cl_getrf_batched_args_t *) cl_arg; + cppi_interface_t *nextpiv = (cppi_interface_t*) descr[0]; + cppi_interface_t *prevpiv = (cppi_interface_t*) descr[1]; + int i, m, n, h, m0, lda; + CHAM_tile_t *tileA; + + nextpiv->h = clargs->h; + + for ( i = 0; i < clargs->tasks_nbr; i++ ) { + tileA = cti_interface_get( descr[ i + 2 ] ); + lda = tileA->ld; + m = clargs->m[ i ]; + n = clargs->n[ i ]; + h = clargs->h; + m0 = clargs->m0[ i ]; + CORE_zgetrf_panel_offdiag( m, n, h, m0, n, CHAM_tile_get_ptr(tileA), lda, + NULL, -1, &( nextpiv->pivot ), &( prevpiv->pivot ) ); + } +} +#endif /* !defined(CHAMELEON_SIMULATION) */ + +CODELETS_CPU( zgetrf_panel_offdiag_batched, cl_zgetrf_panel_offdiag_batched_cpu_func ) + +void +INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options, + int m, int n, int h, int m0, + void *ws, + CHAM_desc_t *A, int Am, int An, + void **clargs_ptr, + CHAM_ipiv_t *ipiv ) +{ + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + int task_num = 0; + int exec = 0; + int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size; + void (*callback)(void*) = NULL; + struct cl_getrf_batched_args_t *clargs = *clargs_ptr; + + /* 
Handle cache */ + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_RW(A, Am, An); + exec = __chameleon_need_exec; + CHAMELEON_END_ACCESS_DECLARATION; + + if ( clargs == NULL ) { + clargs = malloc( sizeof( struct cl_getrf_batched_args_t ) ) ; + clargs->tasks_nbr = 0; + clargs->h = h; + clargs->cl_name = "zgetrf_panel_offdiag_batched"; + + *clargs_ptr = clargs; + } + + task_num = clargs->tasks_nbr; + clargs->m[ task_num ] = m; + clargs->n[ task_num ] = n; + clargs->m0[ task_num ] = m0; + clargs->handle_mode[ task_num ].handle = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); + clargs->handle_mode[ task_num ].mode = STARPU_RW; + clargs->tasks_nbr ++; + /* Refine name */ + clargs->cl_name = chameleon_codelet_name( clargs->cl_name, 1, + A->get_blktile( A, Am, An ) ); + + if ( clargs->tasks_nbr == batch_size ) { + rt_starpu_insert_task( + &cl_zgetrf_panel_offdiag_batched, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_getrf_batched_args_t), + STARPU_REDUX, RUNTIME_pivot_getaddr( ipiv, An, h ), + STARPU_R, RUNTIME_pivot_getaddr( ipiv, An, h-1 ), + STARPU_DATA_MODE_ARRAY, clargs->handle_mode, clargs->tasks_nbr, + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, options->workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, clargs->cl_name, +#endif + 0); + + /* clargs is freed by starpu. 
*/ + *clargs_ptr = NULL; + } +} + +void +INSERT_TASK_zgetrf_panel_offdiag_batched_flush( const RUNTIME_option_t *options, + CHAM_desc_t *A, int An, + void **clargs_ptr, + CHAM_ipiv_t *ipiv ) +{ + void (*callback)(void*) = NULL; + struct cl_getrf_batched_args_t *clargs = *clargs_ptr; + + if ( clargs == NULL ) { + return; + } + + rt_starpu_insert_task( + &cl_zgetrf_panel_offdiag_batched, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_getrf_batched_args_t), + STARPU_REDUX, RUNTIME_pivot_getaddr( ipiv, An, clargs->h ), + STARPU_R, RUNTIME_pivot_getaddr( ipiv, An, clargs->h-1 ), + STARPU_DATA_MODE_ARRAY, clargs->handle_mode, clargs->tasks_nbr, + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, options->workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, clargs->cl_name, +#endif + 0); + + /* clargs is freed by starpu. */ + *clargs_ptr = NULL; +} + +#if !defined(CHAMELEON_SIMULATION) +static void +cl_zgetrf_panel_blocked_batched_cpu_func( void *descr[], + void *cl_arg ) +{ + struct cl_getrf_batched_args_t *clargs = ( struct cl_getrf_batched_args_t * ) cl_arg; + int *ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[clargs->tasks_nbr]); + cppi_interface_t *nextpiv = (cppi_interface_t*) descr[clargs->tasks_nbr + 1]; + cppi_interface_t *prevpiv = (cppi_interface_t*) descr[clargs->tasks_nbr + 2]; + int i, h, ib; + CHAM_tile_t *tileA, *tileU; + CHAMELEON_Complex64_t *U = NULL; + int ldu = -1; + + nextpiv->h = clargs->h; + + h = clargs->h; + ib = clargs->ib; + i = 0; + if ( clargs->diag ) { + if ( h != 0 ) { + tileU = cti_interface_get( descr[ clargs->tasks_nbr + 3 ] ); + U = CHAM_tile_get_ptr( tileU ); + ldu = tileU->ld; + } + tileA = cti_interface_get( descr[ 0 ] ); + nextpiv->has_diag = 1; + CORE_zgetrf_panel_diag( clargs->m[i], clargs->n[i], h, clargs->m0[i], ib, + CHAM_tile_get_ptr( tileA ), tileA->ld, + U, ldu, + ipiv, &(nextpiv->pivot), &(prevpiv->pivot) ); + i++; + } + if ( ( h%ib == 0 ) && ( 
h > 0 ) ) { + tileU = cti_interface_get( descr[ clargs->tasks_nbr + 3 ] ); + U = CHAM_tile_get_ptr( tileU ); + ldu = tileU->ld; + } + else { + U = NULL; + ldu = -1; + } + for ( ; i < clargs->tasks_nbr; i++ ) { + tileA = cti_interface_get( descr[ i ] ); + CORE_zgetrf_panel_offdiag( clargs->m[i], clargs->n[i], h, clargs->m0[i], ib, + CHAM_tile_get_ptr(tileA), tileA->ld, + U, ldu, + &( nextpiv->pivot ), &( prevpiv->pivot ) ); + } +} +#endif /* !defined(CHAMELEON_SIMULATION) */ + +CODELETS_CPU( zgetrf_panel_blocked_batched, cl_zgetrf_panel_blocked_batched_cpu_func ) + +void +INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options, + int m, int n, int h, int m0, + void *ws, + CHAM_desc_t *A, int Am, int An, + CHAM_desc_t *U, int Um, int Un, + void **clargs_ptr, + CHAM_ipiv_t *ipiv ) +{ + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size; + int ib = ((struct chameleon_pzgetrf_s *)ws)->ib; + int task_num = 0; + int exec = 0; + void (*callback)(void*) = NULL; + int accessU, access_npiv, access_ipiv, access_ppiv; + struct cl_getrf_batched_args_t *clargs = *clargs_ptr; + + /* Handle cache */ + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_RW(A, Am, An); + exec = __chameleon_need_exec; + CHAMELEON_END_ACCESS_DECLARATION; + + if ( clargs == NULL ) { + clargs = malloc( sizeof( struct cl_getrf_batched_args_t ) ) ; + clargs->tasks_nbr = 0; + clargs->diag = ( Am == An ); + clargs->ib = ib; + clargs->h = h; + clargs->cl_name = "zgetrf_panel_blocked_batched"; + + *clargs_ptr = clargs; + } + + task_num = clargs->tasks_nbr; + clargs->m[ task_num ] = m; + clargs->n[ task_num ] = n; + clargs->m0[ task_num ] = m0; + clargs->handle_mode[ task_num ].handle = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); + clargs->handle_mode[ task_num ].mode = STARPU_RW; + clargs->tasks_nbr ++; + /* Refine name */ + clargs->cl_name = chameleon_codelet_name( clargs->cl_name, 1, + A->get_blktile( A, Am, An ) ); + + if 
( clargs->tasks_nbr == batch_size ) { + access_npiv = ( clargs->h == ipiv->n ) ? STARPU_R : STARPU_REDUX; + access_ipiv = STARPU_RW; + access_ppiv = STARPU_R; + accessU = STARPU_RW; + if ( clargs->h == 0 ) { + access_ipiv = STARPU_W; + access_ppiv = STARPU_NONE; + accessU = STARPU_NONE; + } + else if ( clargs->h % clargs->ib == 0 ) { + accessU = STARPU_R; + } + else if ( clargs->h % clargs->ib == 1 ) { + accessU = STARPU_W; + } + /* If there isn't a diag task then use offdiag access */ + if ( clargs->diag == 0 ) { + accessU = ((h%ib == 0) && (h > 0)) ? STARPU_R : STARPU_NONE; + } + + rt_starpu_insert_task( + &cl_zgetrf_panel_blocked_batched, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_getrf_batched_args_t), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, options->workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, clargs->cl_name, +#endif + STARPU_DATA_MODE_ARRAY, clargs->handle_mode, clargs->tasks_nbr, + access_ipiv, RUNTIME_ipiv_getaddr( ipiv, An ), + access_npiv, RUNTIME_pivot_getaddr( ipiv, An, h ), + access_ppiv, RUNTIME_pivot_getaddr( ipiv, An, h-1 ), + accessU, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un ), + 0); + + /* clargs is freed by starpu. */ + *clargs_ptr = NULL; + } +} + +void +INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options, + CHAM_desc_t *A, int An, + CHAM_desc_t *U, int Um, int Un, + void **clargs_ptr, + CHAM_ipiv_t *ipiv ) +{ + int accessU, access_npiv, access_ipiv, access_ppiv; + void (*callback)(void*) = NULL; + struct cl_getrf_batched_args_t *clargs = *clargs_ptr; + + if ( clargs == NULL ) { + return; + } + + access_npiv = ( clargs->h == ipiv->n ) ? 
STARPU_R : STARPU_REDUX; + access_ipiv = STARPU_RW; + access_ppiv = STARPU_R; + accessU = STARPU_RW; + if ( clargs->h == 0 ) { + access_ipiv = STARPU_W; + access_ppiv = STARPU_NONE; + accessU = STARPU_NONE; + } + else if ( clargs->h % clargs->ib == 0 ) { + accessU = STARPU_R; + } + else if ( clargs->h % clargs->ib == 1 ) { + accessU = STARPU_W; + } + /* If there isn't a diag task then use offdiag access */ + if ( clargs->diag == 0 ) { + accessU = ((clargs->h%clargs->ib == 0) && (clargs->h > 0)) ? STARPU_R : STARPU_NONE; + } + + rt_starpu_insert_task( + &cl_zgetrf_panel_blocked_batched, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_getrf_batched_args_t), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, options->workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, clargs->cl_name, +#endif + STARPU_DATA_MODE_ARRAY, clargs->handle_mode, clargs->tasks_nbr, + access_ipiv, RUNTIME_ipiv_getaddr( ipiv, An ), + access_npiv, RUNTIME_pivot_getaddr( ipiv, An, clargs->h ), + access_ppiv, RUNTIME_pivot_getaddr( ipiv, An, clargs->h - 1 ), + accessU, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un ), + 0); + + /* clargs is freed by starpu. 
*/ + *clargs_ptr = NULL; +} diff --git a/testing/CTestLists.cmake b/testing/CTestLists.cmake index b429b5c02563d4d34889b94db36ddc4636663619..a1b637f681ed0bb82a981e65cd26310b03b514b7 100644 --- a/testing/CTestLists.cmake +++ b/testing/CTestLists.cmake @@ -94,9 +94,22 @@ if (NOT CHAMELEON_SIMULATION) set_tests_properties( test_${cat}_${prec}getrf_ppivpercol PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=1" ) + if ( ${cat} STREQUAL "shm" ) + add_test( test_${cat}_${prec}getrf_ppivpercol_batch ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf_nopiv.in ) + set_tests_properties( test_${cat}_${prec}getrf_ppivpercol_batch + PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=6" ) + endif() + add_test( test_${cat}_${prec}getrf_ppiv ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf.in ) set_tests_properties( test_${cat}_${prec}getrf_ppiv PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=1" ) + + if ( ${cat} STREQUAL "shm" ) + add_test( test_${cat}_${prec}getrf_ppiv_batch ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf.in ) + set_tests_properties( test_${cat}_${prec}getrf_ppiv_batch + PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=6" ) + endif() + endif() list( REMOVE_ITEM TESTSTMP print gepdf_qr )