diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c index 706f9fbcb92e9b65c410d67cf0372ad9fa0e3755..d95b415d9bbe5473f38be0d3b304c7c3c898adf1 100644 --- a/compute/pzgetrf.c +++ b/compute/pzgetrf.c @@ -447,6 +447,58 @@ chameleon_pzgetrf_panel_permute( struct chameleon_pzgetrf_s *ws, } } +/* Batched counterpart of chameleon_pzgetrf_panel_permute: for tile (k, n) it inserts a zlacpy into U, a zlaswp_get, one zlaswp_batched per tile row m in k+1 .. A->mt-1, a zlaswp_batched_flush, and a zlacpy back into A. Work is done only when ws->alg is ChamGetrfPPiv or ChamGetrfPPivPerColumn; any other value reaches the empty default case. */ static inline void +chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws, + CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int k, + int n, + RUNTIME_option_t *options ) +{ + switch( ws->alg ) { + case ChamGetrfPPiv: + chameleon_attr_fallthrough; + case ChamGetrfPPivPerColumn: + { + int m; + int tempkm, tempkn, tempnn, minmn; + void **clargs = malloc( sizeof(char *) ); + *clargs = NULL; /* NOTE(review): the malloc result is dereferenced without a NULL check, and the slot is sized with the size of a char pointer although it stores a void pointer. The same slot is handed to every batched insert below and to the flush. */ + + /* Tiles at index mt-1 or nt-1 use the remaining extent of the matrix instead of the full mb or nb. */ tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; + tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; + tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + minmn = chameleon_min( tempkm, tempkn ); + + /* Extract selected rows into U */ + INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn, + A(k, n), U(k, n) ); + + /* + * perm array is made of size tempkm for the first row especially. + * Otherwise, the final copy back to the tile may copy only a partial tile + */ + INSERT_TASK_zlaswp_get( options, k*A->mb, tempkm, + ipiv, k, A(k, n), U(k, n) ); + + /* One batched insert per tile row after k, each passing the shared clargs slot. */ for(m=k+1; m<A->mt; m++){ + INSERT_TASK_zlaswp_batched( options, m*A->mb, minmn, k, m, n, (void *)ws, + ipiv, k, A, &(ws->U), clargs ); + } + /* Presumably submits whatever is still pending behind clargs; TODO confirm against INSERT_TASK_zlaswp_batched_flush. */ INSERT_TASK_zlaswp_batched_flush( options, k, n, ipiv, k, A, &(ws->U), clargs ); + + /* Copy the U(k, n) tile back over A(k, n), mirroring the first zlacpy with source and destination swapped. */ INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn, + U(k, n), A(k, n) ); + + RUNTIME_data_flush( options->sequence, U(k, n) ); + /* Releases only the one-pointer slot allocated at the top of this case. */ free( clargs ); + } + break; + default: + ; + } +} + static inline void chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, @@ -463,7 +515,12 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempnn = n == A->nt-1 ? 
A->n-n*A->nb : A->nb; - chameleon_pzgetrf_panel_permute( ws, A, ipiv, k, n, options ); + /* A positive ws->batch_size selects the batched permutation; otherwise the non-batched chameleon_pzgetrf_panel_permute is called as before. */ if ( ws->batch_size > 0 ) { + chameleon_pzgetrf_panel_permute_batched( ws, A, ipiv, k, n, options ); + } + else { + chameleon_pzgetrf_panel_permute( ws, A, ipiv, k, n, options ); + } INSERT_TASK_ztrsm( options, @@ -536,11 +593,21 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, } /* Backward pivoting */ - for (k = 1; k < min_mnt; k++) { - for (n = 0; n < k; n++) { - chameleon_pzgetrf_panel_permute( ws, A, IPIV, k, n, &options ); + /* NOTE(review): both branches below run the same k and n loops and the same RUNTIME_perm_flushk call; only the permute routine differs. */ if ( ws->batch_size > 0 ) { + for (k = 1; k < min_mnt; k++) { + for (n = 0; n < k; n++) { + chameleon_pzgetrf_panel_permute_batched( ws, A, IPIV, k, n, &options ); + } + RUNTIME_perm_flushk( sequence, IPIV, k ); + } + } + else { + for (k = 1; k < min_mnt; k++) { + for (n = 0; n < k; n++) { + chameleon_pzgetrf_panel_permute( ws, A, IPIV, k, n, &options ); + } + RUNTIME_perm_flushk( sequence, IPIV, k ); } - RUNTIME_perm_flushk( sequence, IPIV, k ); } /* Initialize IPIV with default values if needed */