diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c index 368a3d46f26124035b35fa9be1deba42619ca0a8..1060432565ee5d581cd82bc74da7c53df0829383 100644 --- a/compute/pzgetrf.c +++ b/compute/pzgetrf.c @@ -24,50 +24,6 @@ #define A(m,n) A, m, n #define U(m,n) &(ws->U), m, n -#define IPIV(m) IPIV, m, 1 - -/* - * Static variable to know how to handle the data within the kernel - * This assumes that only one runtime is enabled at a time. - */ -static RUNTIME_id_t zgetrf_runtime_id = RUNTIME_SCHED_STARPU; - -static inline int -zgetrf_ipiv_init( const CHAM_desc_t *descIPIV, - cham_uplo_t uplo, int m, int n, - CHAM_tile_t *tileIPIV, void *op_args ) -{ - int *IPIV; - (void)op_args; - - if ( zgetrf_runtime_id == RUNTIME_SCHED_PARSEC ) { - IPIV = (int*)tileIPIV; - } - else { - IPIV = CHAM_tile_get_ptr( tileIPIV ); - } - -#if !defined(CHAMELEON_SIMULATION) - { - int tempmm = m == descIPIV->mt-1 ? descIPIV->m - m * descIPIV->mb : descIPIV->mb; - int i; - - for( i=0; i<tempmm; i++ ) { - IPIV[i] = m * descIPIV->mb + i + 1; - } - } -#endif - - return 0; -} - -static inline void -chameleon_pzgetrf_ipiv_init( CHAM_desc_t *IPIV, - RUNTIME_sequence_t *sequence, - RUNTIME_request_t *request ) -{ - chameleon_pmap( ChamW, ChamUpperLower, IPIV, zgetrf_ipiv_init, NULL, sequence, request ); -} /* * All the functions below are panel factorization variant. @@ -79,10 +35,10 @@ chameleon_pzgetrf_ipiv_init( CHAM_desc_t *IPIV, * @param[inout] A * The descriptor of the full matrix A (not just the panel) * - * @param[in] k - * The index of the column to factorize + * @param[inout] ipiv + * The descriptor of the pivot array associated to A. * - * @param[in] ib + * @param[in] k * The index of the column to factorize * * @param[inout] options @@ -91,6 +47,7 @@ chameleon_pzgetrf_ipiv_init( CHAM_desc_t *IPIV, static inline void chameleon_pzgetrf_panel_facto_nopiv( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, int k, RUNTIME_option_t *options ) { @@ -122,6 +79,7 @@ chameleon_pzgetrf_panel_facto_nopiv( struct chameleon_pzgetrf_s *ws, static inline void chameleon_pzgetrf_panel_facto_nopiv_percol( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, int k, RUNTIME_option_t *options ) { @@ -151,18 +109,79 @@ chameleon_pzgetrf_panel_facto_nopiv_percol( struct chameleon_pzgetrf_s *ws, RUNTIME_data_flush( options->sequence, U(k, k) ); } +static inline void +chameleon_pzgetrf_panel_facto_percol( struct chameleon_pzgetrf_s *ws, + CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int k, + RUNTIME_option_t *options ) +{ + int m, h; + int tempkm, tempkn, minmn; + + tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; + tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; + minmn = chameleon_min( tempkm, tempkn ); + + /* Update the number of column */ + ipiv->n = minmn; + + /* + * Algorithm per column with pivoting + */ + for (h=0; h<=minmn; h++){ + + INSERT_TASK_zgetrf_percol_diag( + options, + h, k * A->mb, + A(k, k), + ipiv ); + + for (m = k+1; m < A->mt; m++) { + //tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + + INSERT_TASK_zgetrf_percol_offdiag( + options, + h, m * A->mb, + A(m, k), + ipiv ); + } + + if ( h < minmn ) { + /* Reduce globally (between MPI processes) */ + RUNTIME_ipiv_reducek( options, ipiv, k, h ); + } + } + + /* Flush temporary data used for the pivoting */ + RUNTIME_ipiv_flushk( options->sequence, ipiv, k ); +} + static inline void chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, int k, RUNTIME_option_t *options ) { /* TODO: Should be replaced by a function pointer */ - if ( ws->alg == ChamGetrfNoPivPerColumn ) { - chameleon_pzgetrf_panel_facto_nopiv_percol( ws, A, k, options ); - } - else { - chameleon_pzgetrf_panel_facto_nopiv( ws, A, k, options ); + switch( ws->alg ) { + case ChamGetrfNoPivPerColumn: + chameleon_pzgetrf_panel_facto_nopiv_percol( ws, A, ipiv, k, options ); + break; + + case ChamGetrfPPivPerColumn: + chameleon_pzgetrf_panel_facto_percol( ws, A, ipiv, k, options ); + break; + + case ChamGetrfPPiv: + chameleon_pzgetrf_panel_facto_percol( ws, A, ipiv, k, options ); + break; + + case ChamGetrfNoPiv: + chameleon_attr_fallthrough; + default: + chameleon_pzgetrf_panel_facto_nopiv( ws, A, ipiv, k, options ); } } @@ -227,7 +246,7 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, */ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, - CHAM_desc_t *IPIV, + CHAM_ipiv_t *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { @@ -243,14 +262,11 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, } RUNTIME_options_init( &options, chamctxt, sequence, request ); - /* Initialize IPIV */ - chameleon_pzgetrf_ipiv_init( IPIV, sequence, request ); - for (k = 0; k < min_mnt; k++) { RUNTIME_iteration_push( chamctxt, k ); options.priority = A->nt; - chameleon_pzgetrf_panel_facto( ws, A, k, &options ); + chameleon_pzgetrf_panel_facto( ws, A, IPIV, k, &options ); for (n = k+1; n < A->nt; n++) { options.priority = A->nt-n; @@ -272,5 +288,12 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, } } + /* Initialize IPIV */ + if ( (ws->alg == ChamGetrfNoPivPerColumn) || + (ws->alg == ChamGetrfNoPiv ) ) + { + RUNTIME_ipiv_init( IPIV ); + } + RUNTIME_options_finalize( &options, chamctxt ); } diff --git a/compute/zgetrf.c b/compute/zgetrf.c index 9d1b19806343d54840eb2da435cdeb7bb5939cb5..73c810be2c1f3294583b6599aff49e726f8f049d 100644 --- a/compute/zgetrf.c +++ b/compute/zgetrf.c @@ -68,15 +68,21 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A ) { char *algostr = chameleon_getenv( "CHAMELEON_GETRF_ALGO" ); - if ( algostr ) { - if ( strcasecmp( algostr, "nopiv" ) ) { + if ( algostr != NULL ) { + if ( strcasecmp( algostr, "nopiv" ) == 0 ) { ws->alg = ChamGetrfNoPiv; } else if ( strcasecmp( algostr, "nopivpercolumn" ) == 0 ) { ws->alg = ChamGetrfNoPivPerColumn; } + else if ( strcasecmp( algostr, "ppiv" ) == 0 ) { + ws->alg = ChamGetrfPPiv; + } + else if ( strcasecmp( algostr, "ppivpercolumn" ) == 0 ) { + ws->alg = ChamGetrfPPivPerColumn; + } else { - fprintf( stderr, "ERROR: CHAMELEON_GETRF_ALGO is not one of NoPiv, NoPivPerColumn => Switch back to NoPiv\n" ); + chameleon_error( "CHAMELEON_zgetrf_WS_Alloc", "CHAMELEON_GETRF_ALGO is not one of NoPiv, NoPivPerColumn, PPiv, PPivPerColumn => Switch back to NoPiv\n" ); } } chameleon_cleanenv( algostr ); @@ -90,6 +96,13 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A ) NULL, NULL, A->get_rankof_init, A->get_rankof_init_arg ); } + /* Set ib to 1 if per column algorithm */ + if ( ( ws->alg == ChamGetrfNoPivPerColumn ) || + ( ws->alg == ChamGetrfPPivPerColumn ) ) + { + ws->ib = 1; + } + return ws; } @@ -123,7 +136,6 @@ CHAMELEON_zgetrf_WS_Free( void *user_ws ) free( ws ); } -#if defined(NOT_AVAILABLE_YET) /** ******************************************************************************** * @@ -149,6 +161,11 @@ CHAMELEON_zgetrf_WS_Free( void *user_ws ) * @param[in] LDA * The leading dimension of the array A. LDA >= max(1,M). * + * @param[out] IPIV + * Integer array of dimension min(M,N). + * The pivot indices; for 1 <= i <= min(M,N), row i of the + * matrix was interchanged with row IPIV(i). + * ******************************************************************************* * * @retval CHAMELEON_SUCCESS successful exit @@ -173,10 +190,11 @@ CHAMELEON_zgetrf( int M, int N, CHAMELEON_Complex64_t *A, int LDA, int *IPIV ) int NB; int status; CHAM_desc_t descAl, descAt; + CHAM_ipiv_t descIPIV; CHAM_context_t *chamctxt; RUNTIME_sequence_t *sequence = NULL; RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; - void *ws; + struct chameleon_pzgetrf_s *ws; chamctxt = chameleon_context_self(); if ( chamctxt == NULL ) { @@ -219,25 +237,35 @@ CHAMELEON_zgetrf( int M, int N, CHAMELEON_Complex64_t *A, int LDA, int *IPIV ) /* Allocate workspace for partial pivoting */ ws = CHAMELEON_zgetrf_WS_Alloc( &descAt ); + if ( ws->alg == ChamGetrfPPivPerColumn ) { + chameleon_ipiv_init( &descIPIV, &descAt, IPIV ); + } + /* Call the tile interface */ - CHAMELEON_zgetrf_Tile_Async( &descAt, ws, sequence, &request ); + CHAMELEON_zgetrf_Tile_Async( &descAt, &descIPIV, ws, sequence, &request ); /* Submit the matrix conversion back */ chameleon_ztile2lap( chamctxt, &descAl, &descAt, ChamDescInout, ChamUpperLower, sequence, &request ); + if ( ws->alg == ChamGetrfPPivPerColumn ) { + RUNTIME_ipiv_gather( &descIPIV, IPIV, 0 ); + } chameleon_sequence_wait( chamctxt, sequence ); /* Cleanup the temporary data */ CHAMELEON_zgetrf_WS_Free( ws ); chameleon_ztile2lap_cleanup( chamctxt, &descAl, &descAt ); + if ( ws->alg == ChamGetrfPPivPerColumn ) { + chameleon_ipiv_destroy( &descIPIV ); + } + status = sequence->status; chameleon_sequence_destroy( chamctxt, sequence ); return status; } -#endif /** ******************************************************************************** @@ -255,12 +283,19 @@ CHAMELEON_zgetrf( int M, int N, CHAMELEON_Complex64_t *A, int LDA, int *IPIV ) * On entry, the M-by-N matrix to be factored. * On exit, the tile factors L and U from the factorization. * + * @param[in,out] IPIV + * On entry, ipiv descriptor associated to A and created with + * CHAMELEON_Ipiv_Create(). + * On exit, it contains the pivot indices associated to the PLU + * factorization of A. + * ******************************************************************************* * * @retval CHAMELEON_SUCCESS successful exit - * @retval >0 if i, U(i,i) is exactly zero. The factorization has been completed, - * but the factor U is exactly singular, and division by zero will occur - * if it is used to solve a system of equations. + * @retval >0 if i, U(i,i) is exactly zero. The factorization has been + * completed, but the factor U is exactly singular, and division + * by zero will occur if it is used to solve a system of + * equations. * ******************************************************************************* * @@ -273,7 +308,7 @@ CHAMELEON_zgetrf( int M, int N, CHAMELEON_Complex64_t *A, int LDA, int *IPIV ) * */ int -CHAMELEON_zgetrf_Tile( CHAM_desc_t *A, CHAM_desc_t *IPIV ) +CHAMELEON_zgetrf_Tile( CHAM_desc_t *A, CHAM_ipiv_t *IPIV ) { CHAM_context_t *chamctxt; RUNTIME_sequence_t *sequence = NULL; @@ -317,9 +352,10 @@ CHAMELEON_zgetrf_Tile( CHAM_desc_t *A, CHAM_desc_t *IPIV ) * On exit, the tile factors L and U from the factorization. * * @param[in,out] IPIV - * On entry, the descriptor of an min(M, N)-by-1 matrix that may not - * have been initialized. - * On exit, the pivot vector generated during the factorization. + * On entry, ipiv descriptor associated to A and created with + * CHAMELEON_Ipiv_Create(). + * On exit, it contains the pivot indices associated to the PLU + * factorization of A. * * @param[in,out] user_ws * The opaque pointer to pre-allocated getrf workspace through @@ -346,7 +382,7 @@ CHAMELEON_zgetrf_Tile( CHAM_desc_t *A, CHAM_desc_t *IPIV ) */ int CHAMELEON_zgetrf_Tile_Async( CHAM_desc_t *A, - CHAM_desc_t *IPIV, + CHAM_ipiv_t *IPIV, void *user_ws, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) @@ -384,10 +420,6 @@ CHAMELEON_zgetrf_Tile_Async( CHAM_desc_t *A, chameleon_error( "CHAMELEON_zgetrf_Tile", "invalid first descriptor" ); return chameleon_request_fail( sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE ); } - if ( chameleon_desc_check( IPIV ) != CHAMELEON_SUCCESS ) { - chameleon_error( "CHAMELEON_zgetrf_Tile", "invalid second descriptor" ); - return chameleon_request_fail( sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE ); - } /* Check input arguments */ if ( A->nb != A->mb ) { @@ -398,10 +430,6 @@ CHAMELEON_zgetrf_Tile_Async( CHAM_desc_t *A, chameleon_error( "CHAMELEON_zgetrf_Tile", "IPIV tiles must have the number of rows as tiles of A" ); return chameleon_request_fail( sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE ); } - if ( IPIV->nb != 1 ) { - chameleon_error( "CHAMELEON_zgetrf_Tile", "IPIV tiles must be vectore with only one column per tile" ); - return chameleon_request_fail( sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE ); - } if ( user_ws == NULL ) { ws = CHAMELEON_zgetrf_WS_Alloc( A ); diff --git a/control/compute_z.h b/control/compute_z.h index 06eae17b508c012918bbd011bad9cbb25a7bb7d4..9032c20f24666d4e751b1e422e9afd07b41d047f 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -22,7 +22,7 @@ * @author Alycia Lisito * @author Matthieu Kuhn * @author Lionel Eyraud-Dubois - * @date 2023-07-06 + * @date 2023-08-22 * @precisions normal z -> c d s * */ @@ -94,7 +94,7 @@ void chameleon_pzgepdf_qdwh( cham_mtxtype_t trans, CHAM_desc_t *descU, CHAM_desc void chameleon_pzgepdf_qr( int genD, int doqr, int optid, const libhqr_tree_t *qrtreeT, const libhqr_tree_t *qrtreeB, CHAM_desc_t *A1, CHAM_desc_t *TS1, CHAM_desc_t *TT1, CHAM_desc_t *D1, CHAM_desc_t *Q1, CHAM_desc_t *A2, CHAM_desc_t *TS2, CHAM_desc_t *TT2, CHAM_desc_t *D2, CHAM_desc_t *Q2, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); -void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, CHAM_desc_t *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); +void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, CHAM_ipiv_t *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); void chameleon_pzgetrf_incpiv(CHAM_desc_t *A, CHAM_desc_t *L, CHAM_desc_t *D, int *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzgetrf_nopiv(CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzgetrf_reclap(CHAM_desc_t *A, int *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); diff --git a/include/chameleon/chameleon_z.h b/include/chameleon/chameleon_z.h index fa5f069e6057cb0f79eb47a7f0e2bbc38777a14b..2362d8b1a8ff0b1e8b4ff96b26a795d71f25761d 100644 --- a/include/chameleon/chameleon_z.h +++ b/include/chameleon/chameleon_z.h @@ -23,7 +23,7 @@ * @author Florent Pruvost * @author Alycia Lisito * @author Matthieu Kuhn - * @date 2023-07-06 + * @date 2023-08-22 * @precisions normal z -> c d s * */ @@ -135,7 +135,7 @@ int CHAMELEON_zgesvd_Tile(cham_job_t jobu, cham_job_t jobvt, CHAM_desc_t *A, dou //int CHAMELEON_zgetrf_Tile(CHAM_desc_t *A, int *IPIV); int CHAMELEON_zgetrf_incpiv_Tile(CHAM_desc_t *A, CHAM_desc_t *L, int *IPIV); int CHAMELEON_zgetrf_nopiv_Tile(CHAM_desc_t *A); -int CHAMELEON_zgetrf_Tile( CHAM_desc_t *A, CHAM_desc_t *IPIV ); +int CHAMELEON_zgetrf_Tile( CHAM_desc_t *A, CHAM_ipiv_t *IPIV ); //int CHAMELEON_zgetri_Tile(CHAM_desc_t *A, int *IPIV); //int CHAMELEON_zgetrs_Tile(cham_trans_t trans, CHAM_desc_t *A, int *IPIV, CHAM_desc_t *B); int CHAMELEON_zgetrs_incpiv_Tile(CHAM_desc_t *A, CHAM_desc_t *L, int *IPIV, CHAM_desc_t *B); @@ -216,7 +216,7 @@ int CHAMELEON_zgesvd_Tile_Async(cham_job_t jobu, cham_job_t jobvt, CHAM_desc_t * //int CHAMELEON_zgetrf_Tile_Async(CHAM_desc_t *A, int *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgetrf_incpiv_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *L, int *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgetrf_nopiv_Tile_Async(CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); -int CHAMELEON_zgetrf_Tile_Async( CHAM_desc_t *A, CHAM_desc_t *IPIV, void *ws, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); +int CHAMELEON_zgetrf_Tile_Async( CHAM_desc_t *A, CHAM_ipiv_t *IPIV, void *ws, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); //int CHAMELEON_zgetri_Tile_Async(CHAM_desc_t *A, int *IPIV, CHAM_desc_t *W, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); //int CHAMELEON_zgetrs_Tile_Async(cham_trans_t trans, CHAM_desc_t *A, int *IPIV, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgetrs_incpiv_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *L, int *IPIV, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);