diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c index 9a6ba171ac622a59b6fadfb9f3726ae57ddc3254..964097662c7381be1ce69fe30d543881d7eb81e9 100644 --- a/compute/pzgetrf.c +++ b/compute/pzgetrf.c @@ -22,6 +22,7 @@ * */ #include "control/common.h" +#include "include/chameleon/flops.h" #define A(m,n) A, m, n #define U(m,n) &(ws->U), m, n @@ -29,6 +30,31 @@ #define Wu(m,n) &(ws->Wu), m, n #define Wl(m,n) &(ws->Wl), m, n +static inline void +chameleon_pzgetrf_batch_size( struct chameleon_pzgetrf_s *ws, + int m, int nb, int j, int k ) +{ + if ( ws->batch_adaptive == 0 ) { + ws->batch_size = ( ( j % ws->ib ) != 0 ) ? ws->batch_size_blas2 : ws->batch_size_blas3; + return; + } + int task_left = m / nb - k; + int batch_max = chameleon_min( CHAMELEON_BATCH_SIZE, task_left ); + double flops = flops_zgetrf_blocked_offdiag( nb, nb, j, ws->ib ); + ws->batch_size = batch_max; + if ( j != 0 ) { + ws->batch_size = chameleon_min( chameleon_max( ws->flops_min / flops, 1 ), batch_max ); + } + + if ( task_left % ws->batch_size != 0 ) { + ws->batch_size = chameleon_min( chameleon_ceil( task_left, task_left / ws->batch_size ), batch_max ); + } + + // batch_mathieu = chameleon_ceil( task_left, + // chameleon_max( chameleon_ceil( task_left, batch_max ), + // ( task_left * flops ) / ws->flops_th ) ); +} + /* * All the functions below are panel factorization variant. * The parameters are: @@ -302,6 +328,7 @@ chameleon_pzgetrf_panel_facto_blocked_batched( struct chameleon_pzgetrf_s *ws, for ( h = 0; h < hmax; h++ ) { j = h + b * ws->ib; + chameleon_pzgetrf_batch_size( ws, A->m, A->nb, j, k ); for ( m = k; m < A->mt; m++ ) { tempmm = A->get_blkdim( A, m, DIM_m, A->m ); INSERT_TASK_zgetrf_panel_blocked_batched( options, tempmm, tempkn, j, m * A->mb, @@ -558,14 +585,15 @@ chameleon_pzgetrf_panel_permute_backward( struct chameleon_pzgetrf_s *ws, } } +#if defined(CHAMELEON_USE_MPI) static inline void chameleon_pzgetrf_panel_update_ws( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, int k, RUNTIME_option_t *options ) { - CHAM_context_t *chamctxt = chameleon_context_self(); - int m, tempmm, tempkn, tempkm, q; + CHAM_context_t *chamctxt = chameleon_context_self(); + int m, n, tempmm, tempkn, tempkm, p, q, involved, np; int lookahead = chamctxt->lookahead; int P = chameleon_desc_datadist_get_iparam(A, 0); int Q = chameleon_desc_datadist_get_iparam(A, 1); @@ -612,48 +640,40 @@ chameleon_pzgetrf_panel_update_ws( struct chameleon_pzgetrf_s *ws, } tempkm = A->get_blkdim( A, k, DIM_m, A->m ); -#if defined(CHAMELEON_USE_MPI) - { - int n, p, involved; - int np = chameleon_desc_datadist_get_iparam(A, 1) - * chameleon_desc_datadist_get_iparam(A, 0); - - /* Send Akk for replicated trsm */ - if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { - for ( p = 0; p < np; p++ ) { - involved = 0; - for ( n = k+1; n < A->nt; n++ ) { - if ( chameleon_p_involved_in_panelk_2dbc( A, n, p ) ) { - involved = 1; - break; - } - } - if ( involved ) { - INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempkn, - A(k, k), Wu(p, k) ); - } - } - } - else { + np = chameleon_desc_datadist_get_iparam(A, 1) * chameleon_desc_datadist_get_iparam(A, 0); + + /* Send Akk for replicated trsm */ + if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { + for ( p = 0; p < np; p++ ) { involved = 0; for ( n = k+1; n < A->nt; n++ ) { - if ( chameleon_involved_in_panelk_2dbc( A, n ) ) { + if ( chameleon_p_involved_in_panelk_2dbc( A, n, p ) ) { involved = 1; break; } } if ( involved ) { INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempkn, - A(k, k), Wu(A->myrank, k) ); + A(k, k), Wu(p, k) ); } } } -#else - INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempkn, - A(k, k), Wu(A->myrank, k) ); -#endif + else { + involved = 0; + for ( n = k+1; n < A->nt; n++ ) { + if ( chameleon_involved_in_panelk_2dbc( A, n ) ) { + involved = 1; + break; + } + } + if ( involved ) { + INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempkn, + A(k, k), Wu(A->myrank, k) ); + } + } RUNTIME_data_flush( options->sequence, A(k, k) ); } +#endif static inline void chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, @@ -667,11 +687,7 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, const CHAMELEON_Complex64_t mzone = (CHAMELEON_Complex64_t)-1.0; CHAM_context_t *chamctxt = chameleon_context_self(); - int m, tempkm, tempmm, tempnn, rankAmn; - - int lookahead = chamctxt->lookahead; - int myq = A->myrank % chameleon_desc_datadist_get_iparam(A, 1); - int lq = (k % lookahead) * chameleon_desc_datadist_get_iparam(A, 1); + int m, tempkm, tempmm, tempnn; tempkm = A->get_blkdim( A, k, DIM_m, A->m ); tempnn = A->get_blkdim( A, n, DIM_n, A->n ); @@ -679,9 +695,12 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, chameleon_pzgetrf_panel_permute_forward( ws, A, ipiv, k, n, options ); #if defined(CHAMELEON_USE_MPI) - if ( ws->involved ) -#endif - { + int rankAmn, p; + int lookahead = chamctxt->lookahead; + int myq = A->myrank % chameleon_desc_datadist_get_iparam(A, 1); + int lq = (k % lookahead) * chameleon_desc_datadist_get_iparam(A, 1); + + if ( ws->involved ) { INSERT_TASK_ztrsm( options, ChamLeft, ChamLower, ChamNoTrans, ChamUnit, @@ -709,6 +728,31 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn, Wu(A->myrank, n), A(k, n) ); } +#else + INSERT_TASK_ztrsm( + options, + ChamLeft, ChamLower, ChamNoTrans, ChamUnit, + tempkm, tempnn, A->mb, + zone, A( k, k ), + Wu( A->myrank, n ) ); + + + for (m = k+1; m < A->mt; m++) { + tempmm = A->get_blkdim( A, m, DIM_m, A->m ); + + INSERT_TASK_zgemm( + options, + ChamNoTrans, ChamNoTrans, + tempmm, tempnn, A->mb, A->mb, + mzone, A( m, k ), + Wu( A->myrank, n ), + zone, A( m, n ) ); + } + + INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn, + Wu(A->myrank, n), A(k, n) ); + +#endif RUNTIME_data_flush( options->sequence, Wu(A->myrank, n) ); RUNTIME_data_flush( options->sequence, A(k, n) ); @@ -749,7 +793,9 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, } options.forcesub = 0; +#if defined(CHAMELEON_USE_MPI) chameleon_pzgetrf_panel_update_ws( ws, A, k, &options ); +#endif for (n = k+1; n < A->nt; n++) { options.priority = A->nt-n; diff --git a/compute/zgetrf.c b/compute/zgetrf.c index d69abaecb3ea9de4f21b5b34c5789f44f4b1380f..2ea2b15d87569f58ac0f8f75bb525fbc550e76a1 100644 --- a/compute/zgetrf.c +++ b/compute/zgetrf.c @@ -114,6 +114,8 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A ) chameleon_cleanenv( allreduce ); } + ws->batch_size = -1; + ws->batch_adaptive = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_ADAPTIVE", 0 ); batch_size = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE", 0 ); if ( batch_size > CHAMELEON_BATCH_SIZE ) { chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE value\n" ); @@ -124,6 +126,7 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A ) ws->batch_size_blas3 = ( ws->batch_size_blas3 > CHAMELEON_BATCH_SIZE ) ? CHAMELEON_BATCH_SIZE : ws->batch_size_blas3; ws->batch_size_swap = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_SWAP", batch_size ); ws->batch_size_swap = ( ws->batch_size_swap > CHAMELEON_BATCH_SIZE ) ? CHAMELEON_BATCH_SIZE : ws->batch_size_swap; + ws->flops_min = chameleon_max( chameleon_getenv_get_value_int( "CHAMELEON_GETRF_FLOPS_MIN_BATCH", 26e6 ), 1 ); ws->ringswitch = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_RINGSWITCH", INT_MAX ); diff --git a/control/compute_z.h b/control/compute_z.h index 812af3dce918e74926506810d38b2db8fc167e33..09de3d0fe02301b5b4e73da47d14751498d58d65 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -47,9 +47,12 @@ struct chameleon_pzgetrf_s { cham_getrf_t alg; cham_getrf_allreduce_t alg_allreduce; int ib; /**< Internal blocking parameter */ + int batch_adaptive; /**< Whether to use adaptative batch or not */ + int batch_size; /**< Batch size */ int batch_size_blas2; /**< Batch size for the blas 2 operations of the panel factorization */ int batch_size_blas3; /**< Batch size for the blas 3 operations of the panel factorization */ int batch_size_swap; /**< Batch size for the permutation */ + int flops_min; int ringswitch; /**< Define when to switch to ring bcast */ CHAM_desc_t U; CHAM_desc_t Up; /**< Workspace used for the panel factorization */ diff --git a/runtime/starpu/codelets/codelet_zgetrf_batched.c b/runtime/starpu/codelets/codelet_zgetrf_batched.c index 0ff4ed9854228109928e30ae4b34013338a32a5c..f8b88bc733ae549f819132044f4e42cbb44d9a76 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_batched.c +++ b/runtime/starpu/codelets/codelet_zgetrf_batched.c @@ -320,7 +320,7 @@ INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options, #endif struct chameleon_pzgetrf_s *tmp = (struct chameleon_pzgetrf_s *) ws; int ib = tmp->ib; - int batch_size = ( (h % ib) != 0 ) ? tmp->batch_size_blas2 : tmp->batch_size_blas3; + int batch_size = tmp->batch_size; int task_num = 0; struct cl_zgetrf_batched_args_s *clargs = *clargs_ptr;