/** * * @file pzgetrf.c * * @copyright 2009-2014 The University of Tennessee and The University of * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * * @brief Chameleon zgetrf parallel algorithm * * @version 1.3.0 * @author Omar Zenati * @author Mathieu Faverge * @author Emmanuel Agullo * @author Matthieu Kuhn * @author Alycia Lisito * @date 2024-03-16 * @precisions normal z -> s d c * */ #include "control/common.h" #define A(m,n) A, m, n #define U(m,n) &(ws->U), m, n #define Up(m,n) &(ws->Up), m, n /* * All the functions below are panel factorization variant. * The parameters are: * @param[inout] ws * The data structure associated to the algorithm that holds all extra * information that may be needed for LU factorization * * @param[inout] A * The descriptor of the full matrix A (not just the panel) * * @param[inout] ipiv * The descriptor of the pivot array associated to A. * * @param[in] k * The index of the column to factorize * * @param[inout] options * The runtime options data structure to pass through all insert_task calls. */ static inline void chameleon_pzgetrf_panel_facto_nopiv( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, CHAM_ipiv_t *ipiv, int k, RUNTIME_option_t *options ) { const CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t) 1.0; int m, tempkm, tempkn, tempmm; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; /* * Algorithm per block without pivoting */ INSERT_TASK_zgetrf_nopiv( options, tempkm, tempkn, ws->ib, A->mb, A(k, k), 0); for (m = k+1; m < A->mt; m++) { tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb; INSERT_TASK_ztrsm( options, ChamRight, ChamUpper, ChamNoTrans, ChamNonUnit, tempmm, tempkn, A->mb, zone, A(k, k), A(m, k) ); } } static inline void chameleon_pzgetrf_panel_facto_nopiv_percol( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, CHAM_ipiv_t *ipiv, int k, RUNTIME_option_t *options ) { int m, h; int tempkm, tempkn, tempmm, minmn; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; minmn = chameleon_min( tempkm, tempkn ); /* * Algorithm per column without pivoting */ for(h=0; h<minmn; h++){ INSERT_TASK_zgetrf_nopiv_percol_diag( options, tempkm, tempkn, h, A( k, k ), U( k, k ), A->mb * k ); for (m = k+1; m < A->mt; m++) { tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb; INSERT_TASK_zgetrf_nopiv_percol_trsm( options, tempmm, tempkn, h, A( m, k ), U( k, k ) ); } } RUNTIME_data_flush( options->sequence, U(k, k) ); } static inline void chameleon_pzgetrf_panel_facto_percol( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, CHAM_ipiv_t *ipiv, int k, RUNTIME_option_t *options ) { int m, h; int tempkm, tempkn, tempmm, minmn; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; minmn = chameleon_min( tempkm, tempkn ); /* Update the number of column */ ipiv->n = minmn; /* * Algorithm per column with pivoting */ for (h=0; h<=minmn; h++){ INSERT_TASK_zgetrf_percol_diag( options, tempkm, tempkn, h, k * A->mb, A(k, k), ipiv ); for (m = k+1; m < A->mt; m++) { tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb; INSERT_TASK_zgetrf_percol_offdiag( options, tempmm, tempkn, h, m * A->mb, A(m, k), ipiv ); } /* Reduce globally (between MPI processes) */ INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, h, tempkn ); } /* Flush temporary data used for the pivoting */ INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k ); RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank ); } /* * Factorization of panel k - dynamic scheduling - batched version / stock */ static inline void chameleon_pzgetrf_panel_facto_percol_batched( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, CHAM_ipiv_t *ipiv, int k, RUNTIME_option_t *options ) { int m, h; int tempkm, tempkn, tempmm, minmn; void **clargs = malloc( sizeof(char *) ); memset( clargs, 0, sizeof(char *) ); tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; minmn = chameleon_min( tempkm, tempkn ); /* Update the number of column */ ipiv->n = minmn; /* * Algorithm per column with pivoting (no recursion) */ /* Iterate on current panel column */ /* Since index h scales column h-1, we need to iterate up to minmn (included) */ for ( h = 0; h <= minmn; h++ ) { INSERT_TASK_zgetrf_percol_diag( options, tempkm, tempkn, h, k * A->mb, A(k, k), ipiv ); for ( m = k+1; m < A->mt; m++ ) { tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb; INSERT_TASK_zgetrf_panel_offdiag_batched( options, tempmm, tempkn, h, m * A->mb, (void *)ws, A(m, k), clargs, ipiv ); } INSERT_TASK_zgetrf_panel_offdiag_batched_flush( options, A, k, clargs, ipiv ); INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, h, tempkn ); } free( clargs ); /* Flush temporary data used for the pivoting */ INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k ); RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank ); } static inline void chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, CHAM_ipiv_t *ipiv, int k, RUNTIME_option_t *options ) { int m, h, b, nbblock; int tempkm, tempkn, tempmm, minmn; if ( ! ws->involved ) { return; } tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; minmn = chameleon_min( tempkm, tempkn ); /* Update the number of column */ ipiv->n = minmn; nbblock = chameleon_ceil( minmn, ws->ib ); /* * Algorithm per column with pivoting */ for (b=0; b<nbblock; b++){ int hmax = b == nbblock-1 ? minmn + 1 - b * ws->ib : ws->ib; for (h=0; h<hmax; h++){ int j = h + b * ws->ib; INSERT_TASK_zgetrf_blocked_diag( options, tempkm, tempkn, j, k * A->mb, ws->ib, A(k, k), Up(k, k), ipiv ); for (m = k+1; m < A->mt; m++) { tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb; INSERT_TASK_zgetrf_blocked_offdiag( options, tempmm, tempkn, j, m * A->mb, ws->ib, A(m, k), Up(k, k), ipiv ); } assert( j <= minmn ); /* Reduce globally (between MPI processes) */ INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, j, tempkn ); if ( ( b < (nbblock-1) ) && ( h == hmax-1 ) ) { INSERT_TASK_zgetrf_blocked_trsm( options, ws->ib, tempkn, j+1, ws->ib, Up(k, k), ipiv ); } } } RUNTIME_data_flush( options->sequence, Up(k, k) ); /* Flush temporary data used for the pivoting */ INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k ); RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank ); } /* * Factorization of panel k - dynamic scheduling - batched version / stock */ static inline void chameleon_pzgetrf_panel_facto_blocked_batched( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, CHAM_ipiv_t *ipiv, int k, RUNTIME_option_t *options ) { int m, h, b, nbblock, hmax, j; int tempkm, tempkn, tempmm, minmn; void **clargs = malloc( sizeof(char *) ); memset( clargs, 0, sizeof(char *) ); tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; minmn = chameleon_min( tempkm, tempkn ); /* Update the number of column */ ipiv->n = minmn; nbblock = chameleon_ceil( minmn, ws->ib ); /* * Algorithm per column with pivoting (no recursion) */ /* Iterate on current panel column */ /* Since index h scales column h-1, we need to iterate up to minmn (included) */ for ( b = 0; b < nbblock; b++ ) { hmax = b == nbblock-1 ? minmn + 1 - b * ws->ib : ws->ib; for ( h = 0; h < hmax; h++ ) { j = h + b * ws->ib; for ( m = k; m < A->mt; m++ ) { tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb; INSERT_TASK_zgetrf_panel_blocked_batched( options, tempmm, tempkn, j, m * A->mb, (void *)ws, A(m, k), Up(k, k), clargs, ipiv ); } INSERT_TASK_zgetrf_panel_blocked_batched_flush( options, A, k, Up(k, k), clargs, ipiv ); assert( j <= minmn ); /* Reduce globally (between MPI processes) */ INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, j, tempkn ); if ( (b < (nbblock-1)) && (h == hmax-1) ) { INSERT_TASK_zgetrf_blocked_trsm( options, ws->ib, tempkn, b * ws->ib + hmax, ws->ib, Up(k, k), ipiv ); } } } free( clargs ); /* Flush temporary data used for the pivoting */ INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k ); RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank ); } static inline void chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, CHAM_ipiv_t *ipiv, int k, RUNTIME_option_t *options ) { #if defined ( CHAMELEON_USE_MPI ) int *proc_involved = malloc( sizeof( int ) * chameleon_min( A->p, A->mt - k) ); int b; /* 2DBC only */ ws->involved = 0; for ( b = k; (b < A->mt) && ((b-k) < A->p); b ++ ) { int rank = chameleon_getrankof_2d( A, b, k ); proc_involved[ b-k ] = rank; if ( rank == A->myrank ) { ws->involved = 1; } } ws->proc_involved = proc_involved; if ( ws->involved == 0 ) { free( proc_involved ); return; } #endif /* TODO: Should be replaced by a function pointer */ switch( ws->alg ) { case ChamGetrfNoPivPerColumn: chameleon_pzgetrf_panel_facto_nopiv_percol( ws, A, ipiv, k, options ); break; case ChamGetrfPPivPerColumn: if ( ws->batch_size > 0 ) { chameleon_pzgetrf_panel_facto_percol_batched( ws, A, ipiv, k, options ); } else { chameleon_pzgetrf_panel_facto_percol( ws, A, ipiv, k, options ); } break; case ChamGetrfPPiv: if ( ws->batch_size > 0 ) { chameleon_pzgetrf_panel_facto_blocked_batched( ws, A, ipiv, k, options ); } else { chameleon_pzgetrf_panel_facto_blocked( ws, A, ipiv, k, options ); } break; case ChamGetrfNoPiv: chameleon_attr_fallthrough; default: chameleon_pzgetrf_panel_facto_nopiv( ws, A, ipiv, k, options ); } #if defined ( CHAMELEON_USE_MPI ) free( proc_involved ); #endif } /** * Permutation of the panel n at step k */ static inline void chameleon_pzgetrf_panel_permute( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, CHAM_ipiv_t *ipiv, int k, int n, RUNTIME_option_t *options ) { switch( ws->alg ) { case ChamGetrfPPiv: chameleon_attr_fallthrough; case ChamGetrfPPivPerColumn: { int m; int tempkm, tempkn, tempnn, minmn; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; minmn = chameleon_min( tempkm, tempkn ); /* Extract selected rows into U */ INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn, A(k, n), U(k, n) ); /* * perm array is made of size tempkm for the first row especially. * Otherwise, the final copy back to the tile may copy only a partial tile */ INSERT_TASK_zlaswp_get( options, k*A->mb, tempkm, ipiv, k, A(k, n), U(k, n) ); for(m=k+1; m<A->mt; m++){ /* Extract selected rows into A(k, n) */ INSERT_TASK_zlaswp_get( options, m*A->mb, minmn, ipiv, k, A(m, n), U(k, n) ); /* Copy rows from A(k,n) into their final position */ INSERT_TASK_zlaswp_set( options, m*A->mb, minmn, ipiv, k, A(k, n), A(m, n) ); } INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn, U(k, n), A(k, n) ); RUNTIME_data_flush( options->sequence, U(k, n) ); } break; default: ; } } static inline void chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, CHAM_ipiv_t *ipiv, int k, int n, RUNTIME_option_t *options ) { const CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t) 1.0; const CHAMELEON_Complex64_t mzone = (CHAMELEON_Complex64_t)-1.0; int m, tempkm, tempmm, tempnn; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; chameleon_pzgetrf_panel_permute( ws, A, ipiv, k, n, options ); INSERT_TASK_ztrsm( options, ChamLeft, ChamLower, ChamNoTrans, ChamUnit, tempkm, tempnn, A->mb, zone, A(k, k), A(k, n) ); for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; INSERT_TASK_zgemm( options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, A->mb, A->mb, mzone, A(m, k), A(k, n), zone, A(m, n) ); } RUNTIME_data_flush( options->sequence, A(k, n) ); } /** * Parallel tile LU factorization with no pivoting - dynamic scheduling */ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, CHAM_ipiv_t *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_context_t *chamctxt; RUNTIME_option_t options; int k, m, n; int min_mnt = chameleon_min( A->mt, A->nt ); chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) { return; } RUNTIME_options_init( &options, chamctxt, sequence, request ); for (k = 0; k < min_mnt; k++) { RUNTIME_iteration_push( chamctxt, k ); options.priority = A->nt; /* * Do the panel factorization only if the current proc contributes in the * block column k. */ options.forcesub = chameleon_involved_in_panelk_2dbc( A, k ); if ( chameleon_involved_in_panelk_2dbc( A, k ) ) { chameleon_pzgetrf_panel_facto( ws, A, IPIV, k, &options ); } options.forcesub = 0; for (n = k+1; n < A->nt; n++) { options.priority = A->nt-n; chameleon_pzgetrf_panel_update( ws, A, IPIV, k, n, &options ); } /* Flush panel k */ for (m = k; m < A->mt; m++) { RUNTIME_data_flush( sequence, A(m, k) ); } RUNTIME_iteration_pop( chamctxt ); } /* Backward pivoting */ for (k = 1; k < min_mnt; k++) { for (n = 0; n < k; n++) { chameleon_pzgetrf_panel_permute( ws, A, IPIV, k, n, &options ); } RUNTIME_perm_flushk( sequence, IPIV, k ); } /* Initialize IPIV with default values if needed */ if ( (ws->alg == ChamGetrfNoPivPerColumn) || (ws->alg == ChamGetrfNoPiv ) ) { INSERT_TASK_ipiv_init( &options, IPIV ); } RUNTIME_options_finalize( &options, chamctxt ); }