diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c index cfe7f507ede116ca6f0385f0730b4fe96abf4891..b4f1de9ff1848677a1177a609abe3d1c788ef46e 100644 --- a/compute/pzgetrf.c +++ b/compute/pzgetrf.c @@ -23,10 +23,11 @@ */ #include "control/common.h" -#define A(m,n) A, m, n -#define U(m,n) &(ws->U), m, n +#define A(m,n) A, m, n +#define U(m,n) &(ws->U), m, n #define Up(m,n) &(ws->Up), m, n #define Wu(m,n) &(ws->Wu), m, n +#define Wl(m,n) &(ws->Wl), m, n /* * All the functions below are panel factorization variant. @@ -389,6 +390,7 @@ chameleon_pzgetrf_panel_permute( struct chameleon_pzgetrf_s *ws, { int m; int tempkm, tempkn, tempnn, minmn; + int withlacpy; chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws ); if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { @@ -409,8 +411,11 @@ chameleon_pzgetrf_panel_permute( struct chameleon_pzgetrf_s *ws, minmn = chameleon_min( tempkm, tempkn ); /* Extract selected rows into U */ + withlacpy = options->withlacpy; + options->withlacpy = 1; INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn, A(k, n), Wu(A->myrank, n) ); + options->withlacpy = withlacpy; /* * perm array is made of size tempkm for the first row especially. @@ -451,6 +456,7 @@ chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws, { int m; int tempkm, tempkn, tempnn, minmn; + int withlacpy; chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws ); if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { @@ -474,8 +480,11 @@ chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws, minmn = chameleon_min( tempkm, tempkn ); /* Extract selected rows into U */ + withlacpy = options->withlacpy; + options->withlacpy = 1; INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn, A(k, n), Wu(A->myrank, n) ); + options->withlacpy = withlacpy; /* * perm array is made of size tempkm for the first row especially. 
@@ -500,6 +509,41 @@ chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws, } } +static inline void +chameleon_pzgetrf_panel_update_ws( struct chameleon_pzgetrf_s *ws, + CHAM_desc_t *A, + int k, + RUNTIME_option_t *options ) +{ + CHAM_context_t *chamctxt = chameleon_context_self(); + int m, tempmm, tempkn, q; + int lookahead = chamctxt->lookahead; + int lq = (k % lookahead) * chameleon_desc_datadist_get_iparam(A, 1); + int myp = A->myrank / chameleon_desc_datadist_get_iparam(A, 1); + + tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; + + for ( m = k+1; m < A->mt; m++ ) { + if ( m % chameleon_desc_datadist_get_iparam(A, 0) != myp ) continue; + + tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempmm, tempkn, + A( m, k ), + Wl( m, ( k % chameleon_desc_datadist_get_iparam(A, 1) ) + lq ) ); + + for ( q = 1; q < chameleon_desc_datadist_get_iparam(A, 1); q++ ) { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempmm, tempkn, + Wl( m, ( ( k + q - 1 ) % chameleon_desc_datadist_get_iparam(A, 1) ) + lq ), + Wl( m, ( ( k + q ) % chameleon_desc_datadist_get_iparam(A, 1) ) + lq ) ); + } + RUNTIME_data_flush( options->sequence, A(m, k) ); + } +} + static inline void chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, @@ -510,9 +554,14 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, { const CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t) 1.0; const CHAMELEON_Complex64_t mzone = (CHAMELEON_Complex64_t)-1.0; + CHAM_context_t *chamctxt = chameleon_context_self(); int m, tempkm, tempmm, tempnn, rankAmn, p; + int lookahead = chamctxt->lookahead; + int myq = A->myrank % chameleon_desc_datadist_get_iparam(A, 1); + int lq = (k % lookahead) * chameleon_desc_datadist_get_iparam(A, 1); + tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempnn = n == A->nt-1 ? 
A->n-n*A->nb : A->nb; @@ -531,6 +580,7 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, tempkm, tempnn, A->mb, zone, A(k, k), Wu(ws->proc_involved[p], n) ); + RUNTIME_data_flush( options->sequence, Wu(ws->proc_involved[p], n) ); } } else if ( ws->involved ) { @@ -543,16 +593,18 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, } for (m = k+1; m < A->mt; m++) { - tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; rankAmn = A->get_rankof( A, m, n ); - INSERT_TASK_zgemm( - options, - ChamNoTrans, ChamNoTrans, - tempmm, tempnn, A->mb, A->mb, - mzone, A(m, k), - Wu(rankAmn, n), - zone, A(m, n) ); + if ( A->myrank == rankAmn ) { + INSERT_TASK_zgemm( + options, + ChamNoTrans, ChamNoTrans, + tempmm, tempnn, A->mb, A->mb, + mzone, Wl( m, myq + lq ), + Wu( A->myrank, n ), + zone, A( m, n ) ); + } } if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { @@ -561,6 +613,7 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, } RUNTIME_data_flush( options->sequence, Wu(A->myrank, n) ); + RUNTIME_data_flush( options->sequence, A(k, k) ); RUNTIME_data_flush( options->sequence, A(k, n) ); } @@ -599,6 +652,8 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, } options.forcesub = 0; + chameleon_pzgetrf_panel_update_ws( ws, A, k, &options ); + for (n = k+1; n < A->nt; n++) { options.priority = A->nt-n; if ( chameleon_involved_in_panelk_2dbc( A, k ) || @@ -615,6 +670,7 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, RUNTIME_iteration_pop( chamctxt ); } + CHAMELEON_Desc_Flush( &(ws->Wl), sequence ); /* Backward pivoting */ if ( ws->batch_size > 0 ) { @@ -657,6 +713,7 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, RUNTIME_perm_flushk( sequence, IPIV, k ); } } + CHAMELEON_Desc_Flush( &(ws->Wu), sequence ); /* Initialize IPIV with default values if needed */ if ( (ws->alg == ChamGetrfNoPivPerColumn) || diff --git a/compute/zgetrf.c b/compute/zgetrf.c index 
9dabf735d3280d77f92ea30388a291a2db8cb861..9c3377bda312d3abe14a5d5dce92cbd26e547386 100644 --- a/compute/zgetrf.c +++ b/compute/zgetrf.c @@ -57,6 +57,7 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A ) { CHAM_context_t *chamctxt; struct chameleon_pzgetrf_s *ws; + int lookahead; chamctxt = chameleon_context_self(); if ( chamctxt == NULL ) { @@ -127,6 +128,12 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A ) A->mb * chameleon_desc_datadist_get_iparam(A, 0) * chameleon_desc_datadist_get_iparam(A, 1), A->n, 0, 0, A->mb * chameleon_desc_datadist_get_iparam(A, 0) * chameleon_desc_datadist_get_iparam(A, 1), A->n, chameleon_desc_datadist_get_iparam(A, 0) * chameleon_desc_datadist_get_iparam(A, 1), 1, NULL, NULL, NULL, A->get_rankof_init_arg ); + lookahead = chamctxt->lookahead; + chameleon_desc_init( &(ws->Wl), CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, A->mb, A->nb, (A->mb * A->nb), + A->mt * A->mb, A->nb * chameleon_desc_datadist_get_iparam(A, 1) * lookahead, 0, 0, + A->mt * A->mb, A->nb * chameleon_desc_datadist_get_iparam(A, 1) * lookahead, chameleon_desc_datadist_get_iparam(A, 0), chameleon_desc_datadist_get_iparam(A, 1), + NULL, NULL, A->get_rankof_init, A->get_rankof_init_arg ); } /* Set ib to 1 if per column algorithm */ @@ -195,6 +202,7 @@ CHAMELEON_zgetrf_WS_Free( void *user_ws ) ( ws->alg == ChamGetrfPPivPerColumn ) ) { chameleon_desc_destroy( &(ws->Wu) ); + chameleon_desc_destroy( &(ws->Wl) ); } free( ws ); } @@ -394,6 +402,7 @@ CHAMELEON_zgetrf_Tile( CHAM_desc_t *A, CHAM_ipiv_t *IPIV ) ws = CHAMELEON_zgetrf_WS_Alloc( A ); CHAMELEON_zgetrf_Tile_Async( A, IPIV, ws, sequence, &request ); CHAMELEON_Desc_Flush( A, sequence ); + CHAMELEON_Ipiv_Flush( IPIV, sequence ); chameleon_sequence_wait( chamctxt, sequence ); CHAMELEON_zgetrf_WS_Free( ws ); diff --git a/control/compute_z.h b/control/compute_z.h index df4b18d79010af05a0d5f482b7e9d9390442c29a..f1002522d6f430c4cbf58251c39c1604526f4dac 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -49,6 
+49,7 @@ struct chameleon_pzgetrf_s { CHAM_desc_t U; CHAM_desc_t Up; /**< Workspace used for the panel factorization */ CHAM_desc_t Wu; /**< Workspace used for the permutation and update */ + CHAM_desc_t Wl; /**< Workspace used for the update */ int *proc_involved; unsigned int involved; int np_involved;