diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c index b4f1de9ff1848677a1177a609abe3d1c788ef46e..8c41f81d550fa77e1c5bb0fd6d6dc59af3b0dea4 100644 --- a/compute/pzgetrf.c +++ b/compute/pzgetrf.c @@ -518,29 +518,48 @@ chameleon_pzgetrf_panel_update_ws( struct chameleon_pzgetrf_s *ws, CHAM_context_t *chamctxt = chameleon_context_self(); int m, tempmm, tempkn, q; int lookahead = chamctxt->lookahead; - int lq = (k % lookahead) * chameleon_desc_datadist_get_iparam(A, 1); - int myp = A->myrank / chameleon_desc_datadist_get_iparam(A, 1); + int P = chameleon_desc_datadist_get_iparam(A, 0); /* hoisted: modulus applied to the tile row m below */ + int Q = chameleon_desc_datadist_get_iparam(A, 1); /* hoisted: divisor of myrank and number of Wl copies issued per tile row */ + int lq = (k % lookahead) * Q; /* NOTE(review): k % lookahead needs lookahead != 0 -- confirm callers guarantee it */ + int myp = A->myrank / Q; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; - for ( m = k+1; m < A->mt; m++ ) { - if ( m % chameleon_desc_datadist_get_iparam(A, 0) != myp ) continue; - - tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - INSERT_TASK_zlacpy( - options, - ChamUpperLower, tempmm, tempkn, - A( m, k ), - Wl( m, ( k % chameleon_desc_datadist_get_iparam(A, 1) ) + lq ) ); + if ( k >= ws->ringswitch ) { /* ring: A(m,k) is copied into one Wl column, then each further column is copied from the previous one */ + for ( m = k+1; m < A->mt; m++ ) { + if ( ( m % P ) != myp ) continue; - for ( q = 1; q < chameleon_desc_datadist_get_iparam(A, 1); q++ ) { + tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; INSERT_TASK_zlacpy( options, ChamUpperLower, tempmm, tempkn, - Wl( m, ( ( k + q - 1 ) % chameleon_desc_datadist_get_iparam(A, 1) ) + lq ), - Wl( m, ( ( k + q ) % chameleon_desc_datadist_get_iparam(A, 1) ) + lq ) ); + A( m, k ), + Wl( m, ( k % Q ) + lq ) ); + + for ( q = 1; q < Q; q++ ) { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempmm, tempkn, + Wl( m, ( ( k + q - 1 ) % Q ) + lq ), + Wl( m, ( ( k + q ) % Q ) + lq ) ); + } + RUNTIME_data_flush( options->sequence, A(m, k) ); + } + } + else { /* k below ringswitch: every one of the Q Wl columns is copied directly from A(m,k) */ + for ( m = k+1; m < A->mt; m++ ) { + if ( ( m % P ) != myp ) continue; + + tempmm = m == A->mt-1 ?
A->m-m*A->mb : A->mb; + for ( q = 0; q < Q; q++ ) { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempmm, tempkn, + A( m, k ), + Wl( m, ( ( k + q )% Q ) + lq ) ); + } + RUNTIME_data_flush( options->sequence, A(m, k) ); } - RUNTIME_data_flush( options->sequence, A(m, k) ); } } diff --git a/compute/zgetrf.c b/compute/zgetrf.c index 9c3377bda312d3abe14a5d5dce92cbd26e547386..514e89d3e375a38d487efcddf6aee07505660f00 100644 --- a/compute/zgetrf.c +++ b/compute/zgetrf.c @@ -27,6 +27,7 @@ * */ #include "control/common.h" +#include <limits.h> /** ******************************************************************************** @@ -103,6 +104,8 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A ) ws->batch_size = CHAMELEON_BATCH_SIZE; } + ws->ringswitch = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_RINGSWITCH", INT_MAX ); + /* Allocation of U for permutation of the panels */ if ( ws->alg == ChamGetrfNoPivPerColumn ) { chameleon_desc_init( &(ws->U), CHAMELEON_MAT_ALLOC_TILE, diff --git a/control/compute_z.h b/control/compute_z.h index f1002522d6f430c4cbf58251c39c1604526f4dac..b75c303a1131e5c9f142803e66a227fe0c04de91 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -44,12 +44,13 @@ struct chameleon_pzgemm_s { */ struct chameleon_pzgetrf_s { cham_getrf_t alg; - int ib; /**< Internal blocking parameter */ - int batch_size; /**< Batch size for the panel */ + int ib; /**< Internal blocking parameter */ + int batch_size; /**< Batch size for the panel */ + int ringswitch; /**< Panel index k from which the panel update uses the ring copy chain (test is k >= ringswitch); set in CHAMELEON_zgetrf_WS_Alloc from CHAMELEON_GETRF_RINGSWITCH, with INT_MAX passed as the second argument */ CHAM_desc_t U; - CHAM_desc_t Up; /**< Workspace used for the panel factorization */ - CHAM_desc_t Wu; /**< Workspace used for the permutation and update */ - CHAM_desc_t Wl; /**< Workspace used the update */ + CHAM_desc_t Up; /**< Workspace used for the panel factorization */ + CHAM_desc_t Wu; /**< Workspace used for the permutation and update */ + CHAM_desc_t Wl; /**< Workspace used for the update */ int *proc_involved; unsigned int involved; int
np_involved;