diff --git a/ChangeLog b/ChangeLog index 545761d09776fcc7b48035e90992eff90d4b6227..4d53148cf1e02349cbb1c7b81ae9eb16c7e9705e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,7 @@ chameleon-1.4.0 ------------------------------------------------------------------------ + - Add the laswp driver and testing. Be careful, the interface does not follow the lapack API to propose the op(P) \times A, or A \times op(P) operation with op(P) equal to P or P^{-1} + - Add the gesv driver to perform LU factorization and solve with its associated testing. - StarPU: Update the minimum requirement from 1.3 to 1.4 - StarPU: When using starpu > 1.4.8, use the new distributed submit interface in the codelets instead of the classical insert task interface. - ci: use -Werror to prevent from adding warning to the code diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt index 5626c253c38ebd7d6b2e9357f565174879be2ba9..573921e49b789ddb19e9379e845f4168b718d9f1 100644 --- a/compute/CMakeLists.txt +++ b/compute/CMakeLists.txt @@ -28,7 +28,8 @@ # @author Loris Lucido # @author Matthieu Kuhn # @author Ana Hourcau -# @date 2024-09-18 +# @author Matteo Marcos +# @date 2025-03-24 # ### @@ -150,7 +151,7 @@ set(ZSRC zgepdf_qr.c zgeqrs.c zgeqrs_param.c - #zgesv.c + zgesv.c zgesv_incpiv.c zgesv_nopiv.c #zgetrf.c @@ -159,6 +160,7 @@ set(ZSRC zgetrf.c zgetrs_incpiv.c zgetrs_nopiv.c + zgetrs.c zlacpy.c zlange.c zlanhe.c @@ -219,7 +221,7 @@ set(ZSRC #pzhetrd_hb2ht.c pzhetrd_he2hb.c #pzlarft_blgtrd.c - #pzlaswp.c + pzlaswp.c #pzlaswpc.c #pztrsmrv.c #pzunmqr_blgtrd.c @@ -237,7 +239,7 @@ set(ZSRC #zhegv.c #zhegvd.c zhetrd.c - #zlaswp.c + zlaswp.c #zlaswpc.c #ztrsmrv.c ################## diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c index 9a6ba171ac622a59b6fadfb9f3726ae57ddc3254..448bb4309a50a8eb7d7c0b5d46bed5569688fab2 100644 --- a/compute/pzgetrf.c +++ b/compute/pzgetrf.c @@ -17,7 +17,8 @@ * @author Emmanuel Agullo * @author Matthieu Kuhn * @author Alycia Lisito - * @date 2025-01-24 + * @author 
Matteo Marcos + * @date 2025-03-24 * @precisions normal z -> s d c * */ @@ -154,7 +155,7 @@ chameleon_pzgetrf_panel_facto_percol( struct chameleon_pzgetrf_s *ws, } /* Flush temporary data used for the pivoting */ - INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k ); + INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, 0, A->m, ipiv, k ); RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank ); } @@ -202,7 +203,7 @@ chameleon_pzgetrf_panel_facto_percol_batched( struct chameleon_pzgetrf_s *ws, free( clargs ); /* Flush temporary data used for the pivoting */ - INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k ); + INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, 0, A->m, ipiv, k ); RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank ); } @@ -264,7 +265,7 @@ chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws, RUNTIME_data_flush( options->sequence, Up(k, k) ); /* Flush temporary data used for the pivoting */ - INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k ); + INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, 0, A->m, ipiv, k ); RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank ); } @@ -327,7 +328,7 @@ chameleon_pzgetrf_panel_facto_blocked_batched( struct chameleon_pzgetrf_s *ws, free( clargs ); /* Flush temporary data used for the pivoting */ - INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k ); + INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, 0, A->m, ipiv, k ); RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank ); } @@ -410,19 +411,19 @@ chameleon_pzgetrf_panel_permute( struct chameleon_pzgetrf_s *ws, * perm array is made of size tempkm for the first row especially. 
* Otherwise, the final copy back to the tile may copy only a partial tile */ - INSERT_TASK_zlaswp_get( options, k*A->mb, tempkm, + INSERT_TASK_zlaswp_get( options, ChamDirForward, k*A->mb, tempkm, ipiv, k, A(k, n), Wu(A->myrank, n) ); for(m=k+1; m<A->mt; m++){ /* Extract selected rows into A(k, n) */ - INSERT_TASK_zlaswp_get( options, m*A->mb, minmn, + INSERT_TASK_zlaswp_get( options, ChamDirForward, m*A->mb, minmn, ipiv, k, A(m, n), Wu(A->myrank, n) ); /* Copy rows from A(k,n) into their final position */ - INSERT_TASK_zlaswp_set( options, m*A->mb, minmn, + INSERT_TASK_zlaswp_set( options, ChamDirForward, m*A->mb, minmn, ipiv, k, A(k, n), A(m, n) ); } - INSERT_TASK_zperm_allreduce( options, A, Wu(A->myrank, n), ipiv, k, k, n, ws ); + INSERT_TASK_zperm_allreduce( options, ChamDirForward, A, Wu(A->myrank, n), ipiv, k, k, n, ws ); } break; default: @@ -465,7 +466,7 @@ chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws, * perm array is made of size tempkm for the first row especially. 
* Otherwise, the final copy back to the tile may copy only a partial tile */ - INSERT_TASK_zlaswp_get( options, k*A->mb, tempkm, + INSERT_TASK_zlaswp_get( options, ChamDirForward, k*A->mb, tempkm, ipiv, k, A(k, n), Wu(A->myrank, n) ); for(m=k+1; m<A->mt; m++){ @@ -474,7 +475,7 @@ chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws, } INSERT_TASK_zlaswp_batched_flush( options, ipiv, k, A(k, n), Wu(A->myrank, n), clargs ); - INSERT_TASK_zperm_allreduce( options, A, Wu(A->myrank, n), ipiv, k, k, n, ws ); + INSERT_TASK_zperm_allreduce( options, ChamDirForward, A, Wu(A->myrank, n), ipiv, k, k, n, ws ); free( clargs ); } @@ -495,8 +496,8 @@ chameleon_pzgetrf_panel_permute_forward( struct chameleon_pzgetrf_s *ws, #if defined(CHAMELEON_USE_MPI) chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws ); if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { - INSERT_TASK_zperm_allreduce_send_perm( options, ipiv, k, A->myrank, ws->np_involved, ws->proc_involved ); - INSERT_TASK_zperm_allreduce_send_invp( options, ipiv, k, A, k, n ); + INSERT_TASK_zperm_allreduce_send_perm( options, ChamDirForward, ipiv, k, A->myrank, ws->np_involved, ws->proc_involved ); + INSERT_TASK_zperm_allreduce_send_invp( options, ChamDirForward, ipiv, k, A, k, n ); } if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { INSERT_TASK_zperm_allreduce_send_A( options, A, k, n, A->myrank, ws->np_involved, ws->proc_involved ); @@ -529,8 +530,8 @@ chameleon_pzgetrf_panel_permute_backward( struct chameleon_pzgetrf_s *ws, #if defined(CHAMELEON_USE_MPI) chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws ); if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { - INSERT_TASK_zperm_allreduce_send_perm( options, ipiv, k, A->myrank, ws->np_involved, ws->proc_involved ); - INSERT_TASK_zperm_allreduce_send_invp( options, ipiv, k, A, k, n ); + INSERT_TASK_zperm_allreduce_send_perm( options, ChamDirForward, ipiv, k, A->myrank, ws->np_involved, ws->proc_involved ); + 
INSERT_TASK_zperm_allreduce_send_invp( options, ChamDirForward, ipiv, k, A, k, n );
     }
     if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) {
         INSERT_TASK_zperm_allreduce_send_A( options, A, k, n, A->myrank, ws->np_involved, ws->proc_involved );
diff --git a/compute/pzlaswp.c b/compute/pzlaswp.c
new file mode 100644
index 0000000000000000000000000000000000000000..0d4aa8694654dad2b79b8108b52061a696b70ce9
--- /dev/null
+++ b/compute/pzlaswp.c
@@ -0,0 +1,146 @@
+/**
+ *
+ * @file pzlaswp.c
+ *
+ * @copyright 2025-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zlaswp parallel algorithm
+ *
+ * @version 1.3.0
+ * @comment This file has been automatically generated
+ *          from Plasma 2.5.0 for CHAMELEON 0.9.2
+ * @author Alycia Lisito
+ * @author Matteo Marcos
+ * @date 2025-03-24
+ * @precisions normal z -> s d c
+ *
+ */
+#include "control/common.h"
+
+#define A(m,n) A, m, n
+#define Wu(m,n) &(ws->Wu), m, n
+
+/**
+ * Permutation of the panel n at step k: submits the zlacpy of A(k, n) into Wu(myrank, n), the zlaswp get/set tasks for tile rows k..mt-1, then the perm allreduce.
+ */
+static inline void
+chameleon_pzlaswp_panel_permute( struct chameleon_pzgetrf_s *ws,
+                                 cham_dir_t                  dir,
+                                 CHAM_desc_t                *A,
+                                 CHAM_ipiv_t                *ipiv,
+                                 int                         k,
+                                 int                         n,
+                                 RUNTIME_option_t           *options )
+{
+    int m;
+    int tempkm, tempnn;
+    int withlacpy;
+
+    tempkm = A->get_blkdim( A, k, DIM_m, A->m );
+    tempnn = A->get_blkdim( A, n, DIM_n, A->n );
+
+    /* Extract selected rows into U: A(k, n) is first copied into Wu(myrank, n), with options->withlacpy forced to 1 for this task */
+    withlacpy = options->withlacpy;
+    options->withlacpy = 1;
+    INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn,
+                        A(k, n), Wu(A->myrank, n) );
+    options->withlacpy = withlacpy; /* restore the caller's setting */
+
+    INSERT_TASK_zlaswp_get( options, dir, k*A->mb, tempkm,
+                            ipiv, k, A(k, n), Wu(A->myrank, n) );
+
+    for ( m = k + 1; m < A->mt; m++ ) {
+        /* Extract selected rows into A(k, n) (NOTE(review): the task is given A(m, n) and Wu(myrank, n) - confirm the wording) */
+        INSERT_TASK_zlaswp_get( options, dir, m*A->mb, tempkm,
+                                ipiv, k, A(m, n), Wu(A->myrank, n) );
+        /* Copy rows from A(k,n) into their final position in A(m, n) */
+        INSERT_TASK_zlaswp_set( options, dir, m*A->mb, tempkm,
+                                ipiv, k, A(k, n), A(m, n) );
+    }
+
+    INSERT_TASK_zperm_allreduce( options, dir, A, Wu(A->myrank, n), ipiv, k, k, n, ws );
+}
+/* Submits the permutation of tile column n at step k, then copies Wu(myrank, n) back into A(k, n) on the rank owning A(k, n). */
+static inline void
+chameleon_pzlaswp_panel( struct chameleon_pzgetrf_s *ws,
+                         cham_dir_t                  dir,
+                         CHAM_desc_t                *A,
+                         CHAM_ipiv_t                *ipiv,
+                         int                         k,
+                         int                         n,
+                         RUNTIME_option_t           *options,
+                         RUNTIME_sequence_t         *sequence )
+{
+    int tempkm, tempnn;
+
+#if defined(CHAMELEON_USE_MPI)
+    chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws );
+    if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) {
+        INSERT_TASK_zperm_allreduce_send_perm( options, dir, ipiv, k, A->myrank, ws->np_involved, ws->proc_involved );
+        INSERT_TASK_zperm_allreduce_send_invp( options, dir, ipiv, k, A, k, n );
+    }
+    if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) {
+        INSERT_TASK_zperm_allreduce_send_A( options, A, k, n, A->myrank, ws->np_involved, ws->proc_involved );
+    }
+
+    if ( !ws->involved ) { /* with MPI, a rank whose ws->involved flag is unset submits nothing more */
+        return;
+    }
+#endif
+
+    chameleon_pzlaswp_panel_permute( ws, dir, A, ipiv, k, n, options );
+
+    if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) {
+
+        tempkm = A->get_blkdim( A, k, DIM_m, A->m );
+        tempnn = A->get_blkdim( A, n, DIM_n, A->n );
+        INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn,
+                            Wu(A->myrank, n), A(k, n) );
+        RUNTIME_data_flush( sequence, A(k, n) );
+    }
+}
+/* Submits chameleon_pzlaswp_panel() for every (k, n): k ascending when dir is ChamDirForward, descending otherwise. */
+void
+chameleon_pzlaswp( struct chameleon_pzgetrf_s *ws,
+                   cham_dir_t                  dir,
+                   CHAM_desc_t                *A,
+                   CHAM_ipiv_t                *IPIV,
+                   RUNTIME_sequence_t         *sequence,
+                   RUNTIME_request_t          *request )
+{
+    CHAM_context_t  *chamctxt;
+    RUNTIME_option_t options;
+
+    int n, k;
+
+    chamctxt = chameleon_context_self();
+    if ( sequence->status != CHAMELEON_SUCCESS ) { /* sequence already in error: nothing is submitted */
+        return;
+    }
+    RUNTIME_options_init( &options, chamctxt, sequence, request );
+
+    if ( dir == ChamDirForward ) {
+        for ( k = 0; k < A->mt; k++ ) {
+            for ( n = 0; n < A->nt; n++ ) {
+                options.priority = A->nt-n; /* the priority value decreases as n grows */
+
+                chameleon_pzlaswp_panel( ws, dir, A, IPIV, k, n, &options, sequence );
+            }
+            RUNTIME_perm_flushk( sequence, IPIV, k );
+        }
+    }
+    else { /* any other dir value: row panels are visited from the last one down to the first */
+        for ( k = A->mt - 1; k > -1; k-- ) {
+            for ( n = 0; n < A->nt; n++ ) {
+                options.priority = A->nt-n;
+                chameleon_pzlaswp_panel( ws, dir, A, IPIV, k, n, &options, sequence );
+            }
+            RUNTIME_perm_flushk( sequence, IPIV, k );
+        }
+    }
+    RUNTIME_options_finalize( &options, chamctxt );
+}
+
diff --git a/compute/zgesv.c b/compute/zgesv.c
new file mode 100644
index 0000000000000000000000000000000000000000..1b657bfac45d684e260f0096fc4192256d65bdba
--- /dev/null
+++ b/compute/zgesv.c
@@ -0,0 +1,387 @@
+/**
+ *
+ * @file zgesv.c
+ *
+ * @copyright 2025-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zgesv wrappers
+ *
+ * @version 1.3.0
+ * @author Matteo Marcos
+ * @date 2025-03-24
+ * @precisions normal z -> s d c
+ *
+ */
+#include "control/common.h"
+
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t
+ *
+ * @brief Computes the solution to a system of linear equations A * X = B,
+ * where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+ *
+ * The tile LU decomposition with partial tile pivoting and row interchanges is used to factor A.
+ * The factored form of A is then used to solve the system of equations A * X = B.
+ *
+ *******************************************************************************
+ *
+ * @param[in] N
+ *          The number of linear equations, i.e., the order of the matrix A. N >= 0.
+ *
+ * @param[in] NRHS
+ *          The number of right hand sides, i.e., the number of columns of the matrix B.
+ *          NRHS >= 0.
+ *
+ * @param[in,out] A
+ *          On entry, the N-by-N coefficient matrix A.
+ *          On exit, the tile L and U factors from the factorization (not equivalent to LAPACK).
+ *
+ * @param[in] LDA
+ *          The leading dimension of the array A. LDA >= max(1,N).
+ *
+ * @param[out] IPIV
+ *          On exit, the pivot indices that define the permutations (not equivalent to LAPACK).
+ *
+ * @param[in,out] B
+ *          On entry, the N-by-NRHS matrix of right hand side matrix B.
+ *          On exit, if return value = 0, the N-by-NRHS solution matrix X.
+ *
+ * @param[in] LDB
+ *          The leading dimension of the array B. LDB >= max(1,N).
+ *
+ *******************************************************************************
+ *
+ * @retval CHAMELEON_SUCCESS successful exit
+ * @retval <0 if -i, the i-th argument had an illegal value
+ * @retval >0 if i, U(i,i) is exactly zero. The factorization has been completed,
+ *               but the factor U is exactly singular, so the solution could not be computed.
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zgesv_Tile
+ * @sa CHAMELEON_zgesv_Tile_Async
+ * @sa CHAMELEON_cgesv
+ * @sa CHAMELEON_dgesv
+ * @sa CHAMELEON_sgesv
+ *
+ */
+int CHAMELEON_zgesv( int N, int NRHS,
+                     CHAMELEON_Complex64_t *A, int LDA,
+                     int *IPIV,
+                     CHAMELEON_Complex64_t *B, int LDB )
+{
+    int                 NB;
+    int                 status;
+    CHAM_context_t     *chamctxt;
+    CHAM_ipiv_t         descIPIV;
+    RUNTIME_sequence_t *sequence = NULL;
+    RUNTIME_request_t   request = RUNTIME_REQUEST_INITIALIZER;
+    CHAM_desc_t         descAl, descAt;
+    CHAM_desc_t         descBl, descBt;
+    struct chameleon_pzgetrf_s *wsA, *wsB;
+
+    chamctxt = chameleon_context_self();
+    if ( chamctxt == NULL ) {
+        chameleon_error( "CHAMELEON_zgesv", "CHAMELEON not initialized" );
+        return CHAMELEON_ERR_NOT_INITIALIZED;
+    }
+    /* Check input arguments: on failure, return minus the position of the faulty argument */
+    if ( N < 0 ) {
+        chameleon_error( "CHAMELEON_zgesv", "illegal value of N" );
+        return -1;
+    }
+    if ( NRHS < 0 ) {
+        chameleon_error( "CHAMELEON_zgesv", "illegal value of NRHS" );
+        return -2;
+    }
+    if ( LDA < chameleon_max( 1, N ) ) {
+        chameleon_error( "CHAMELEON_zgesv", "illegal value of LDA" );
+        return -4;
+    }
+    if ( LDB < chameleon_max( 1, N ) ) {
+        chameleon_error( "CHAMELEON_zgesv", "illegal value of LDB" );
+        return -7; /* LDB is the 7th argument (N, NRHS, A, LDA, IPIV, B, LDB) */
+    }
+    /* Quick return */
+    if ( chameleon_min( N, NRHS ) == 0 ) {
+        return CHAMELEON_SUCCESS;
+    }
+
+    /* Tune NB & IB depending on M, N & NRHS; Set NBNB */
+    status = chameleon_tune( CHAMELEON_FUNC_ZGESV, N, N, NRHS );
+    if ( status != CHAMELEON_SUCCESS ) {
+        chameleon_error( "CHAMELEON_zgesv", "chameleon_tune() failed" );
+        return status;
+    }
+
+    /* Set NT & NTRHS */
+    NB = CHAMELEON_NB;
+
+    chameleon_sequence_create( chamctxt, &sequence );
+
+    /* Submit the matrix conversion */
+    chameleon_zlap2tile( chamctxt, &descAl, &descAt, ChamDescInout, ChamUpperLower,
+                         A, NB, NB, LDA, N, N, N, sequence, &request );
+    chameleon_zlap2tile( chamctxt, &descBl, &descBt, ChamDescInout, ChamUpperLower,
+                         B, NB, NB, LDB, NRHS, N, NRHS, sequence, &request );
+
+    /* Allocate workspace for partial pivoting */
+    wsA = CHAMELEON_zgetrf_WS_Alloc( &descAt );
+    wsB = CHAMELEON_zgetrf_WS_Alloc( &descBt );
+
+    if ( ( wsA->alg == ChamGetrfPPivPerColumn ) ||
+         ( wsA->alg == ChamGetrfPPiv ) )
+    {
+        chameleon_ipiv_init( &descIPIV, &descAt, IPIV );
+    }
+
+    /* Call the tile interface (NOTE(review): descIPIV is passed uninitialized for the other algorithms - confirm it is never read then) */
+    CHAMELEON_zgesv_Tile_Async( &descAt, &descIPIV, &descBt, wsA, wsB, sequence, &request );
+
+    /* Submit the matrix conversion back */
+    chameleon_ztile2lap( chamctxt, &descAl, &descAt,
+                         ChamDescInout, ChamUpperLower, sequence, &request );
+    chameleon_ztile2lap( chamctxt, &descBl, &descBt,
+                         ChamDescInout, ChamUpperLower, sequence, &request );
+
+    if ( ( wsA->alg == ChamGetrfPPivPerColumn ) ||
+         ( wsA->alg == ChamGetrfPPiv ) )
+    {
+        RUNTIME_ipiv_gather( sequence, &descIPIV, IPIV, 0 );
+    }
+
+    chameleon_sequence_wait( chamctxt, sequence );
+
+    /* Release the ipiv descriptor, only when it was initialized above */
+    if ( ( wsA->alg == ChamGetrfPPivPerColumn ) ||
+         ( wsA->alg == ChamGetrfPPiv ) )
+    {
+        chameleon_ipiv_destroy( &descIPIV, &descAt );
+    }
+
+    /* Cleanup the workspaces and the temporary tile descriptors */
+    CHAMELEON_zgetrf_WS_Free( wsA );
+    CHAMELEON_zgetrf_WS_Free( wsB );
+    chameleon_ztile2lap_cleanup( chamctxt, &descAl, &descAt );
+    chameleon_ztile2lap_cleanup( chamctxt, &descBl, &descBt );
+
+    status = sequence->status;
+    chameleon_sequence_destroy( chamctxt, sequence );
+    return status;
+}
+
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t_Tile
+ *
+ * @brief Solves a system of linear equations using the tile LU factorization.
+ * Tile equivalent of CHAMELEON_zgesv().
+ *
+ * Operates on matrices stored by tiles.
+ * All matrices are passed through descriptors.
+ * All dimensions are taken from the descriptors.
+ *
+ *******************************************************************************
+ *
+ * @param[in,out] A
+ *          On entry, the N-by-N coefficient matrix A.
+ *          On exit, the tile L and U factors from the factorization (not equivalent to LAPACK).
+ *
+ * @param[in,out] IPIV
+ *          On entry, ipiv descriptor associated to A and created with
+ *          CHAMELEON_Ipiv_Create().
+ *          On exit, it contains the pivot indices associated to the PLU
+ *          factorization of A.
+ *
+ * @param[in,out] B
+ *          On entry, the N-by-NRHS matrix of right hand side matrix B.
+ *          On exit, if return value = 0, the N-by-NRHS solution matrix X.
+ *
+ *
+ *******************************************************************************
+ *
+ * @retval CHAMELEON_SUCCESS successful exit
+ * @retval >0 if i, U(i,i) is exactly zero. The factorization has been completed,
+ *               but the factor U is exactly singular, so the solution could not be computed.
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zgesv
+ * @sa CHAMELEON_zgesv_Tile_Async
+ * @sa CHAMELEON_cgesv_Tile
+ * @sa CHAMELEON_dgesv_Tile
+ * @sa CHAMELEON_sgesv_Tile
+ * @sa CHAMELEON_zcgesv_Tile
+ *
+ */
+int CHAMELEON_zgesv_Tile( CHAM_desc_t *A, CHAM_ipiv_t *IPIV, CHAM_desc_t *B )
+{
+    RUNTIME_request_t   req = RUNTIME_REQUEST_INITIALIZER;
+    RUNTIME_sequence_t *seq = NULL;
+    CHAM_context_t     *ctx = chameleon_context_self();
+    void               *lu_ws, *rhs_ws;
+    int                 info;
+
+    /* Nothing can be submitted without an initialized context */
+    if ( ctx == NULL ) {
+        chameleon_fatal_error( "CHAMELEON_zgesv_Tile", "CHAMELEON not initialized" );
+        return CHAMELEON_ERR_NOT_INITIALIZED;
+    }
+    chameleon_sequence_create( ctx, &seq );
+
+    lu_ws  = CHAMELEON_zgetrf_WS_Alloc( A ); /* getrf workspace built from A */
+    rhs_ws = CHAMELEON_zgetrf_WS_Alloc( B ); /* getrf workspace built from B */
+    CHAMELEON_zgesv_Tile_Async( A, IPIV, B, lu_ws, rhs_ws, seq, &req );
+
+    CHAMELEON_Desc_Flush( A, seq );
+    CHAMELEON_Ipiv_Flush( IPIV, seq );
+    CHAMELEON_Desc_Flush( B, seq );
+
+    chameleon_sequence_wait( ctx, seq ); /* the workspaces are released only after this wait */
+    CHAMELEON_zgetrf_WS_Free( lu_ws );
+    CHAMELEON_zgetrf_WS_Free( rhs_ws );
+
+    info = seq->status; /* read the status before the sequence is destroyed */
+    chameleon_sequence_destroy( ctx, seq );
+    return info;
+}
+
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t_Tile_Async
+ *
+ * @brief Solves a system of linear equations using the tile LU factorization.
+ *
+ * Non-blocking equivalent of CHAMELEON_zgesv_Tile().
+ * May return before the computation is finished.
+ * Allows for pipelining of operations at runtime.
+ *
+ *******************************************************************************
+ *
+ * @param[in,out] A
+ *          On entry, the M-by-N matrix to be factored.
+ *          On exit, the tile factors L and U from the factorization.
+ *
+ * @param[in,out] IPIV
+ *          On entry, ipiv descriptor associated to A and created with
+ *          CHAMELEON_Ipiv_Create().
+ *          On exit, it contains the pivot indices associated to the PLU
+ *          factorization of A.
+ *
+ * @param[in,out] B
+ *          On entry, the N-by-NRHS matrix of right hand side matrix B.
+ *          On exit, the N-by-NRHS solution matrix X.
+ *
+ * @param[in,out] user_wsA
+ *          The opaque pointer to pre-allocated getrf workspace through
+ *          CHAMELEON_zgetrf_WS_Alloc() for A. If user_wsA is NULL, it is automatically
+ *          allocated, but BE CAREFUL as it switches the call from asynchronous
+ *          to synchronous call.
+ *
+ * @param[in,out] user_wsB
+ *          The opaque pointer to pre-allocated getrf workspace through
+ *          CHAMELEON_zgetrf_WS_Alloc() for B. If user_wsB is NULL, it is automatically
+ *          allocated, but BE CAREFUL as it switches the call from asynchronous
+ *          to synchronous call.
+ *
+ * @param[in] sequence
+ *          Identifies the sequence of function calls that this call belongs to
+ *          (for completion checks and exception handling purposes).
+ *
+ * @param[out] request
+ *          Identifies this function call (for exception handling purposes).
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zgesv
+ * @sa CHAMELEON_zgesv_Tile
+ * @sa CHAMELEON_cgesv_Tile_Async
+ * @sa CHAMELEON_dgesv_Tile_Async
+ * @sa CHAMELEON_sgesv_Tile_Async
+ * @sa CHAMELEON_zcgesv_Tile_Async
+ *
+ */
+int CHAMELEON_zgesv_Tile_Async( CHAM_desc_t        *A,
+                                CHAM_ipiv_t        *IPIV,
+                                CHAM_desc_t        *B,
+                                void               *user_wsA,
+                                void               *user_wsB,
+                                RUNTIME_sequence_t *sequence,
+                                RUNTIME_request_t  *request )
+{
+    CHAM_context_t             *chamctxt;
+    struct chameleon_pzgetrf_s *wsA, *wsB;
+
+    chamctxt = chameleon_context_self();
+    if ( chamctxt == NULL ) {
+        chameleon_fatal_error( "CHAMELEON_zgesv_Tile", "CHAMELEON not initialized" );
+        return CHAMELEON_ERR_NOT_INITIALIZED;
+    }
+    if ( sequence == NULL ) {
+        chameleon_fatal_error( "CHAMELEON_zgesv_Tile", "NULL sequence" );
+        return CHAMELEON_ERR_UNALLOCATED;
+    }
+    if ( request == NULL ) {
+        chameleon_fatal_error( "CHAMELEON_zgesv_Tile", "NULL request" );
+        return CHAMELEON_ERR_UNALLOCATED;
+    }
+    /* Check sequence status */
+    if ( sequence->status == CHAMELEON_SUCCESS ) {
+        request->status = CHAMELEON_SUCCESS;
+    }
+    else {
+        return chameleon_request_fail( sequence, request, CHAMELEON_ERR_SEQUENCE_FLUSHED );
+    }
+
+    /* Check descriptors for correctness */
+    if ( chameleon_desc_check( A ) != CHAMELEON_SUCCESS ) {
+        chameleon_error( "CHAMELEON_zgesv_Tile", "invalid first descriptor" );
+        return chameleon_request_fail( sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE );
+    }
+    if ( chameleon_desc_check( B ) != CHAMELEON_SUCCESS ) {
+        chameleon_error( "CHAMELEON_zgesv_Tile", "invalid third descriptor" );
+        return chameleon_request_fail( sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE );
+    }
+    /* Check input arguments */
+    if ( A->nb != A->mb || B->nb != B->mb ) {
+        chameleon_error( "CHAMELEON_zgesv_Tile", "only square tiles supported" );
+        return chameleon_request_fail( sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE );
+    }
+
+    if ( user_wsA == NULL ) {
+        wsA = CHAMELEON_zgetrf_WS_Alloc( A );
+    }
+    else {
+        wsA = user_wsA;
+    }
+
+    if ( user_wsB == NULL ) {
+        wsB = CHAMELEON_zgetrf_WS_Alloc( B );
+    }
+    else {
+        wsB = user_wsB;
+    }
+
+    chameleon_pzgetrf( wsA, A, IPIV, sequence, request );
+
+    CHAMELEON_zgetrs_Tile_Async( ChamNoTrans, A, IPIV, B, wsB, sequence, request );
+
+    /* A workspace allocated here may only be released once every submitted
+     * task has completed: wait as soon as either workspace is internal. */
+    if ( ( user_wsA == NULL ) || ( user_wsB == NULL ) ) {
+        CHAMELEON_Desc_Flush( A, sequence );
+        CHAMELEON_Desc_Flush( B, sequence );
+        chameleon_sequence_wait( chamctxt, sequence );
+        if ( user_wsA == NULL ) { CHAMELEON_zgetrf_WS_Free( wsA ); }
+        if ( user_wsB == NULL ) { CHAMELEON_zgetrf_WS_Free( wsB ); }
+    }
+    return CHAMELEON_SUCCESS;
+}
+
diff --git a/compute/zgetrs.c b/compute/zgetrs.c
new file mode 100644
index 0000000000000000000000000000000000000000..9a2e5bac6de8623a1425cd81f18e4963b3ad92b2
--- /dev/null
+++ b/compute/zgetrs.c
@@ -0,0 +1,399 @@
+/**
+ *
+ * @file zgetrs.c
+ *
+ * @copyright 2012-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zgetrs wrappers
+ *
+ * @version 1.3.0
+ * @author Matteo Marcos
+ * @date 2025-03-24
+ * @precisions normal z -> s d c
+ *
+ */
+#include "control/common.h"
+
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t
+ *
+ * @brief Solves a system of linear equations A * X = B, with a general N-by-N matrix A
+ * using the tile LU factorization with partial pivoting computed by CHAMELEON_zgetrf.
+ *
+ *******************************************************************************
+ *
+ * @param[in] trans
+ *          Intended to specify the form of the system of equations:
+ *          = ChamNoTrans:   A * X = B     (No transpose)
+ *          = ChamTrans:     A^T * X = B   (Transpose)
+ *          = ChamConjTrans: A^H * X = B   (Conjugate transpose)
+ *          Only ChamNoTrans and ChamTrans are supported.
+ *
+ * @param[in] N
+ *          The order of the matrix A. N >= 0.
+ *
+ * @param[in] NRHS
+ *          The number of right hand sides, i.e., the number of columns of the matrix B.
+ *          NRHS >= 0.
+ *
+ * @param[in] A
+ *          The tile factors L and U from the factorization, computed by CHAMELEON_zgetrf.
+ *
+ * @param[in] LDA
+ *          The leading dimension of the array A. LDA >= max(1,N).
+ *
+ * @param[in] IPIV
+ *          The pivot indices array associated to the PLU factorization of A.
+ *          It is wrapped internally into an ipiv descriptor with
+ *          CHAMELEON_Ipiv_Create(), so the caller provides a plain array and
+ *          no descriptor.
+ *
+ * @param[in,out] B
+ *          On entry, the N-by-NRHS matrix of right hand side matrix B.
+ *          On exit, the solution matrix X.
+ *
+ * @param[in] LDB
+ *          The leading dimension of the array B. LDB >= max(1,N).
+ *
+ *******************************************************************************
+ *
+ * @retval CHAMELEON_SUCCESS successful exit
+ * @retval <0 if -i, the i-th argument had an illegal value
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zgetrs_Tile
+ * @sa CHAMELEON_zgetrs_Tile_Async
+ * @sa CHAMELEON_cgetrs
+ * @sa CHAMELEON_dgetrs
+ * @sa CHAMELEON_sgetrs
+ * @sa CHAMELEON_zgetrf
+ *
+ */
+int CHAMELEON_zgetrs( cham_trans_t trans, int N, int NRHS,
+                      CHAMELEON_Complex64_t *A, int LDA,
+                      int *IPIV,
+                      CHAMELEON_Complex64_t *B, int LDB )
+{
+    int                 NB;
+    int                 status;
+    CHAM_context_t     *chamctxt;
+    CHAM_ipiv_t        *descIPIV;
+    RUNTIME_sequence_t *sequence = NULL;
+    RUNTIME_request_t   request = RUNTIME_REQUEST_INITIALIZER;
+    CHAM_desc_t         descAl, descAt;
+    CHAM_desc_t         descBl, descBt;
+    struct chameleon_pzgetrf_s *ws;
+
+    chamctxt = chameleon_context_self();
+    if ( chamctxt == NULL ) {
+        chameleon_fatal_error("CHAMELEON_zgetrs", "CHAMELEON not initialized");
+        return CHAMELEON_ERR_NOT_INITIALIZED;
+    }
+    /* Check input arguments: the dimension checks return minus the position of the faulty argument */
+    if ( ( trans != ChamTrans ) && ( trans != ChamNoTrans ) ) {
+        chameleon_error("CHAMELEON_zgetrs", "Only ChamTrans and ChamNoTrans are supported");
+        return CHAMELEON_ERR_ILLEGAL_VALUE;
+    }
+    if ( N < 0 ) {
+        chameleon_error("CHAMELEON_zgetrs", "illegal value of N");
+        return -2;
+    }
+    if ( NRHS < 0 ) {
+        chameleon_error("CHAMELEON_zgetrs", "illegal value of NRHS");
+        return -3;
+    }
+    if ( LDA < chameleon_max( 1, N ) ) {
+        chameleon_error("CHAMELEON_zgetrs", "illegal value of LDA");
+        return -5;
+    }
+    if ( LDB < chameleon_max( 1, N ) ) {
+        chameleon_error("CHAMELEON_zgetrs", "illegal value of LDB");
+        return -8; /* LDB is the 8th argument (trans, N, NRHS, A, LDA, IPIV, B, LDB) */
+    }
+    /* Quick return */
+    if ( chameleon_min( N, NRHS ) == 0 )
+        return CHAMELEON_SUCCESS;
+
+    /* Tune NB & IB depending on N & NRHS; Set NBNBSIZE */
+    status = chameleon_tune( CHAMELEON_FUNC_ZGESV, N, N, NRHS );
+    if ( status != CHAMELEON_SUCCESS ) {
+        chameleon_error("CHAMELEON_zgetrs", "chameleon_tune() failed");
+        return status;
+    }
+
+    /* Set NT & NTRHS */
+    NB = CHAMELEON_NB;
+
+    chameleon_sequence_create( chamctxt, &sequence );
+
+    /* Submit the matrix conversion */
+    chameleon_zlap2tile( chamctxt, &descAl, &descAt, ChamDescInput, ChamUpperLower,
+                         A, NB, NB, LDA, N, N, N, sequence, &request );
+    chameleon_zlap2tile( chamctxt, &descBl, &descBt, ChamDescInout, ChamUpperLower,
+                         B, NB, NB, LDB, NRHS, N, NRHS, sequence, &request );
+
+    ws = CHAMELEON_zgetrf_WS_Alloc( &descBt ); /* the workspace is sized from B, the matrix being permuted */
+    CHAMELEON_Ipiv_Create( &descIPIV, &descAt, IPIV );
+    CHAMELEON_Ipiv_Init( &descAt, descIPIV );
+
+    /* Call the tile interface */
+    CHAMELEON_zgetrs_Tile_Async( trans, &descAt, descIPIV, &descBt, ws, sequence, &request );
+
+    /* Submit the matrix conversion back */
+    chameleon_ztile2lap( chamctxt, &descAl, &descAt,
+                         ChamDescInput, ChamUpperLower, sequence, &request );
+    chameleon_ztile2lap( chamctxt, &descBl, &descBt,
+                         ChamDescInout, ChamUpperLower, sequence, &request );
+
+    chameleon_sequence_wait( chamctxt, sequence );
+
+    /* Cleanup the temporary data, only once the sequence has completed */
+    CHAMELEON_Ipiv_Destroy( &descIPIV, &descAt );
+    CHAMELEON_zgetrf_WS_Free( ws );
+    chameleon_ztile2lap_cleanup( chamctxt, &descAl, &descAt );
+    chameleon_ztile2lap_cleanup( chamctxt, &descBl, &descBt );
+
+    status = sequence->status;
+    chameleon_sequence_destroy( chamctxt, sequence );
+    return status;
+}
+
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t_Tile
+ *
+ * @brief Solves a system of linear equations using previously
+ * computed LU factorization with partial pivoting.
+ * Tile equivalent of CHAMELEON_zgetrs().
+ * Operates on matrices stored by tiles.
+ * All matrices are passed through descriptors.
+ * All dimensions are taken from the descriptors.
+ *
+ *******************************************************************************
+ *
+ * @param[in] trans
+ *          Intended to specify the form of the system of equations:
+ *          = ChamNoTrans:   A * X = B     (No transpose)
+ *          = ChamTrans:     A^T * X = B   (Transpose)
+ *          = ChamConjTrans: A^H * X = B   (Conjugate transpose)
+ *          Only ChamNoTrans and ChamTrans are supported.
+ *
+ * @param[in] A
+ *          The tile factors L and U from the factorization, computed by CHAMELEON_zgetrf.
+ *
+ * @param[in] IPIV
+ *          On entry, ipiv descriptor associated to A and created with
+ *          CHAMELEON_Ipiv_Create().
+ *          On exit, it contains the pivot indices associated to the PLU
+ *          factorization of A.
+ *
+ * @param[in,out] B
+ *          On entry, the N-by-NRHS matrix of right hand side matrix B.
+ *          On exit, the solution matrix X.
+ *
+ *******************************************************************************
+ *
+ * @retval CHAMELEON_SUCCESS successful exit
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zgetrs
+ * @sa CHAMELEON_zgetrs_Tile_Async
+ * @sa CHAMELEON_cgetrs_Tile
+ * @sa CHAMELEON_dgetrs_Tile
+ * @sa CHAMELEON_sgetrs_Tile
+ * @sa CHAMELEON_zgetrf_Tile
+ *
+ */
+int CHAMELEON_zgetrs_Tile( cham_trans_t trans,
+                           CHAM_desc_t *A,
+                           CHAM_ipiv_t *IPIV,
+                           CHAM_desc_t *B )
+{
+    CHAM_context_t     *chamctxt;
+    RUNTIME_sequence_t *sequence = NULL;
+    RUNTIME_request_t   request = RUNTIME_REQUEST_INITIALIZER;
+    int                 status;
+    void               *ws;
+
+    chamctxt = chameleon_context_self();
+    if ( ( trans != ChamTrans ) && ( trans != ChamNoTrans ) ) {
+        chameleon_error("CHAMELEON_zgetrs", "Only ChamTrans and ChamNoTrans are supported");
+        return CHAMELEON_ERR_ILLEGAL_VALUE;
+    }
+    if ( chamctxt == NULL ) {
+        chameleon_fatal_error("CHAMELEON_zgetrs_Tile", "CHAMELEON not initialized");
+        return CHAMELEON_ERR_NOT_INITIALIZED;
+    }
+    chameleon_sequence_create( chamctxt, &sequence );
+
+    ws = CHAMELEON_zgetrf_WS_Alloc( B ); /* the workspace is sized from B, the matrix being permuted */
+
+    CHAMELEON_zgetrs_Tile_Async( trans, A, IPIV, B, ws, sequence, &request );
+
+    CHAMELEON_Desc_Flush( A, sequence );
+    CHAMELEON_Desc_Flush( B, sequence );
+
+    /* The submitted tasks still use the workspace: wait for them before releasing it */
+    chameleon_sequence_wait( chamctxt, sequence );
+    CHAMELEON_zgetrf_WS_Free( ws );
+    status = sequence->status;
+    chameleon_sequence_destroy( chamctxt, sequence );
+    return status;
+}
+
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t_Tile_Async
+ *
+ * @brief Solves a system of linear equations using previously
+ * computed LU factorization with partial pivoting.
+ * Non-blocking equivalent of CHAMELEON_zgetrs_Tile().
+ * May return before the computation is finished.
+ * Allows for pipelining of operations at runtime.
+ * + ******************************************************************************* + * + * @param[in] trans + * Intended to specify the form of the system of equations: + * = ChamNoTrans: A * X = B (No transpose) + * = ChamTrans: A^T * X = B (Transpose) + * = ChamConjTrans: A^H * X = B (Conjugate transpose) + * Only ChamNoTrans and ChamTrans are supported. + * + * @param[in] A + * The tile factors L and U from the factorization, computed by CHAMELEON_zgetrf. + * + * @param[in] IPIV + * On entry, ipiv descriptor associated to A and created with + * CHAMELEON_Ipiv_Create(). + * On exit, it contains the pivot indices associated to the PLU + * factorization of A. + * + * @param[in,out] B + * On entry, the N-by-NRHS matrix of right hand side matrix B. + * On exit, the N-by-NRHS solution matrix X. + * + * @param[in,out] user_ws + * The opaque pointer to pre-allocated getrf workspace through + * CHAMELEON_zgetrf_WS_Alloc() for B. If user_ws is NULL, it is automatically + * allocated, but BE CAREFUL as it switches the call from asynchronous + * to synchronous call. + * + * @param[in] sequence + * Identifies the sequence of function calls that this call belongs to + * (for completion checks and exception handling purposes). + * + * @param[out] request + * Identifies this function call (for exception handling purposes). 
+ *
+ *******************************************************************************
+ *
+ * @retval CHAMELEON_SUCCESS successful submission
+ * @retval CHAMELEON_ERR_ILLEGAL_VALUE if trans is neither ChamNoTrans nor ChamTrans
+ * @retval CHAMELEON_ERR_NOT_INITIALIZED if no Chameleon context is available
+ * @retval CHAMELEON_ERR_UNALLOCATED if sequence or request is NULL
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zgetrs
+ * @sa CHAMELEON_zgetrs_Tile
+ * @sa CHAMELEON_cgetrs_Tile_Async
+ * @sa CHAMELEON_dgetrs_Tile_Async
+ * @sa CHAMELEON_sgetrs_Tile_Async
+ * @sa CHAMELEON_zgetrf_Tile_Async
+ *
+ */
+int CHAMELEON_zgetrs_Tile_Async( cham_trans_t        trans,
+                                 CHAM_desc_t        *A,
+                                 CHAM_ipiv_t        *IPIV,
+                                 CHAM_desc_t        *B,
+                                 void               *user_ws,
+                                 RUNTIME_sequence_t *sequence,
+                                 RUNTIME_request_t  *request )
+{
+    CHAM_context_t             *chamctxt;
+    struct chameleon_pzgetrf_s *ws;
+    RUNTIME_option_t            options;
+    int                         k, tempkm;
+
+    chamctxt = chameleon_context_self();
+    if ( ( trans != ChamTrans ) && ( trans != ChamNoTrans ) ) {
+        chameleon_error("CHAMELEON_zgetrs", "Only ChamTrans and ChamNoTrans are supported");
+        return CHAMELEON_ERR_ILLEGAL_VALUE;
+    }
+    if ( chamctxt == NULL ) {
+        chameleon_fatal_error("CHAMELEON_zgetrs_Tile", "CHAMELEON not initialized");
+        return CHAMELEON_ERR_NOT_INITIALIZED;
+    }
+    if ( sequence == NULL ) {
+        chameleon_fatal_error("CHAMELEON_zgetrs_Tile", "NULL sequence");
+        return CHAMELEON_ERR_UNALLOCATED;
+    }
+    if ( request == NULL ) {
+        chameleon_fatal_error("CHAMELEON_zgetrs_Tile", "NULL request");
+        return CHAMELEON_ERR_UNALLOCATED;
+    }
+    /* Check sequence status */
+    if ( sequence->status == CHAMELEON_SUCCESS ) {
+        request->status = CHAMELEON_SUCCESS;
+    }
+    else {
+        return chameleon_request_fail( sequence, request, CHAMELEON_ERR_SEQUENCE_FLUSHED );
+    }
+
+    /* Check descriptors for correctness */
+    if ( chameleon_desc_check( A ) != CHAMELEON_SUCCESS ) {
+        chameleon_error("CHAMELEON_zgetrs_Tile", "invalid first descriptor");
+        return chameleon_request_fail( sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE );
+    }
+    if ( chameleon_desc_check( B ) != CHAMELEON_SUCCESS ) {
+        chameleon_error("CHAMELEON_zgetrs_Tile", "invalid third descriptor");
+        return chameleon_request_fail( sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE );
+    }
+    /* Check input arguments */
+    if ( ( A->nb != A->mb ) || ( B->nb != B->mb ) ) {
+        chameleon_error("CHAMELEON_zgetrs_Tile", "only square tiles supported");
+        return chameleon_request_fail( sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE );
+    }
+
+    /* Use the caller's workspace when given, otherwise allocate one from B */
+    if ( user_ws == NULL ) {
+        ws = CHAMELEON_zgetrf_WS_Alloc( B );
+    }
+    else {
+        ws = user_ws;
+    }
+
+    /*
+     * When the IPIV descriptor carries user data, convert each block of
+     * pivots into a permutation first. Note that this path blocks on the
+     * sequence before the solve is submitted.
+     */
+    if ( IPIV->data != NULL ) {
+        RUNTIME_options_init( &options, chamctxt, sequence, request );
+        for ( k = 0; k < A->mt; k++ ) {
+            tempkm = A->get_blkdim( A, k, DIM_m, A->m );
+            INSERT_TASK_ipiv_to_perm( &options, k * A->mb, tempkm, tempkm, 0, A->m,
+                                      IPIV, k );
+        }
+        chameleon_sequence_wait( chamctxt, sequence );
+    }
+
+    if ( trans == ChamNoTrans ) {
+        /* A * X = B with P * A = L * U: permute B, then solve with L and U */
+        chameleon_pzlaswp( ws, ChamDirForward, B, IPIV, sequence, request );
+
+        chameleon_pztrsm( ChamLeft, ChamLower, ChamNoTrans, ChamUnit, (CHAMELEON_Complex64_t)1.0, A, B, sequence, request );
+
+        chameleon_pztrsm( ChamLeft, ChamUpper, ChamNoTrans, ChamNonUnit, (CHAMELEON_Complex64_t)1.0, A, B, sequence, request );
+    }
+    else {
+        /*
+         * A^T * X = B with P * A = L * U, i.e. U^T * L^T * ( P * X ) = B:
+         * solve with U^T, then with L^T, and undo the row permutation last.
+         * Both triangular solves must therefore be transposed.
+         */
+        chameleon_pztrsm( ChamLeft, ChamUpper, ChamTrans, ChamNonUnit, (CHAMELEON_Complex64_t)1.0, A, B, sequence, request );
+
+        chameleon_pztrsm( ChamLeft, ChamLower, ChamTrans, ChamUnit, (CHAMELEON_Complex64_t)1.0, A, B, sequence, request );
+
+        chameleon_pzlaswp( ws, ChamDirBackward, B, IPIV, sequence, request );
+    }
+
+    /*
+     * An internally allocated workspace cannot outlive this call, so the
+     * submitted tasks must be completed before it is released (this is the
+     * documented switch to a synchronous call when user_ws is NULL).
+     */
+    if ( user_ws == NULL ) {
+        chameleon_sequence_wait( chamctxt, sequence );
+        CHAMELEON_zgetrf_WS_Free( ws );
+    }
+
+    return CHAMELEON_SUCCESS;
+}
diff --git a/compute/zlaswp.c b/compute/zlaswp.c new file mode 100644 index 0000000000000000000000000000000000000000..6d7955e78d8ce52e3ff423df2182c29b7005f9c4 --- /dev/null +++ b/compute/zlaswp.c @@ -0,0 +1,388 @@ +/** + * + * @file zlaswp.c + * + * @copyright 2012-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. 
+ * + *** + * + * @brief Chameleon zlaswp wrappers + * + * @version 1.3.0 + * @author Alycia Lisito + * @author Matteo Marcos + * @date 2025-03-24 + * @precisions normal z -> s d c + * + */ +#include "control/common.h" + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * @brief Computes the permutation P*op(A) or op(A)*P where P is the permutation + * matrix generated from IPIV. + * + ******************************************************************************* + * + * @param[in] side + * Specifies whether the permutation is done on the rows or the columns. + * = ChamLeft: op(A) = A + * = ChamRight: op(A) = A^T + * + * @param[in] dir + * Specifies the order of the permutation. + * = ChamDirForward: Natural order. P*op(A) + * = ChamDirBackward: Reverse order. op(A)*P + * + * @param[in] M + * The number of rows of the matrix A. M >= 0. + * + * @param[in] N + * The number of columns of the matrix A. N >= 0. + * + * @param[in,out] A + * The M-by-N matrix A. + * + * @param[in] LDA + * The leading dimension of the array A. LDA >= max(1,M). + * + * @param[in] K1 + * The first element of IPIV for which an interchange will + * be done. Must follow the Fortran numbering standard. + * + * @param[in] K2 + * The last element of ipiv for which an interchange will + * be done. Must follow the Fortran numbering standard. + * + * @param[in] IPIV + * Vector of pivot indices. 
+ *
+ *******************************************************************************
+ *
+ * @retval CHAMELEON_SUCCESS successful exit
+ * @retval <0 if -i, the i-th argument had an illegal value
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zlaswp_Tile
+ * @sa CHAMELEON_zlaswp_Tile_Async
+ * @sa CHAMELEON_claswp
+ * @sa CHAMELEON_dlaswp
+ * @sa CHAMELEON_slaswp
+ *
+ */
+int CHAMELEON_zlaswp( cham_side_t            side,
+                      cham_dir_t             dir,
+                      int                    M,
+                      int                    N,
+                      CHAMELEON_Complex64_t *A,
+                      int                    LDA,
+                      int                    K1,
+                      int                    K2,
+                      int                   *IPIV )
+{
+    int                 status;
+    int                 NB;
+    CHAM_context_t     *chamctxt;
+    RUNTIME_sequence_t *sequence = NULL;
+    RUNTIME_request_t   request  = RUNTIME_REQUEST_INITIALIZER;
+    CHAM_desc_t         descAl, descAt;
+    CHAM_ipiv_t        *descIPIV;
+
+    chamctxt = chameleon_context_self();
+    if ( chamctxt == NULL ) {
+        chameleon_fatal_error("CHAMELEON_zlaswp", "CHAMELEON not initialized");
+        return CHAMELEON_ERR_NOT_INITIALIZED;
+    }
+    if ( side == ChamRight ) {
+        chameleon_fatal_error("CHAMELEON_zlaswp", "Only ChamLeft is implemented");
+        return CHAMELEON_ERR_NOT_SUPPORTED;
+    }
+    /* Check input arguments: -i reports the i-th argument of this function */
+    if ( M < 0 ) {
+        chameleon_error("CHAMELEON_zlaswp", "illegal value of M");
+        return -3;
+    }
+    if ( N < 0 ) {
+        chameleon_error("CHAMELEON_zlaswp", "illegal value of N");
+        return -4;
+    }
+    if ( LDA < chameleon_max( 1, M ) ) {
+        chameleon_error("CHAMELEON_zlaswp", "illegal value of LDA");
+        return -6;
+    }
+    if ( ( K1 < 1 ) || ( K1 > M ) ) {
+        chameleon_error("CHAMELEON_zlaswp", "illegal value of K1");
+        return -7;
+    }
+    if ( ( K2 < 1 ) || ( K2 > M ) ) {
+        chameleon_error("CHAMELEON_zlaswp", "illegal value of K2");
+        return -8;
+    }
+
+    /* Quick return */
+    if ( chameleon_min( N, M ) == 0 ) {
+        return CHAMELEON_SUCCESS;
+    }
+
+    /* Tune NB depending on M and N */
+    status = chameleon_tune(CHAMELEON_FUNC_ZGEMM, M, N, 0);
+    if ( status != CHAMELEON_SUCCESS ) {
+        chameleon_error("CHAMELEON_zlaswp", "chameleon_tune() failed");
+        return status;
+    }
+
+    /* Set NB */
+    NB = CHAMELEON_NB;
+
+    chameleon_sequence_create( chamctxt, &sequence );
+
+    /*
+     * Submit the matrix conversion. A is both read and overwritten by the
+     * permutation, so it is converted as an in/out descriptor to have the
+     * result brought back to the LAPACK layout.
+     * NOTE(review): relies on the ChamDescInout mode of the conversion
+     * helpers - confirm against control/compute_z.h.
+     */
+    chameleon_zlap2tile( chamctxt, &descAl, &descAt, ChamDescInout, ChamUpperLower,
+                         A, NB, NB, LDA, N, M, N, sequence, &request );
+    CHAMELEON_Ipiv_Create( &descIPIV, &descAt, IPIV );
+
+    CHAMELEON_Ipiv_Init( &descAt, descIPIV );
+
+    /* Call the tile interface */
+    CHAMELEON_zlaswp_Tile_Async( side, dir, &descAt, K1, K2, descIPIV, sequence, &request );
+
+    /* Submit the matrix conversion back */
+    chameleon_ztile2lap( chamctxt, &descAl, &descAt,
+                         ChamDescInout, ChamUpperLower, sequence, &request );
+
+    chameleon_sequence_wait( chamctxt, sequence );
+
+    /* Report what happened in the sequence rather than unconditional success */
+    status = sequence->status;
+
+    /* Cleanup the temporary data */
+    CHAMELEON_Ipiv_Destroy( &descIPIV, &descAt );
+    chameleon_ztile2lap_cleanup( chamctxt, &descAl, &descAt );
+
+    chameleon_sequence_destroy( chamctxt, sequence );
+    return status;
+}
+
+/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile + * + * @brief Tile equivalent of CHAMELEON_zlaswp(). + * + * Operates on matrices stored by tiles. + * All matrices are passed through descriptors. + * All dimensions are taken from the descriptors. + * + ******************************************************************************* + * + * @param[in] side + * Specifies whether the permutation is done on the rows or the columns. + * = ChamLeft: op(A) = A + * = ChamRight: op(A) = A^T + * + * @param[in] dir + * Specifies the order of the permutation. + * = ChamDirForward: Natural order. P*op(A) + * = ChamDirBackward: Reverse order. op(A)*P + * + * @param[in,out] A + * The M-by-N matrix A. + * + * @param[in] K1 + * The first element of IPIV for which an interchange will + * be done. Must follow the Fortran numbering standard. + * + * @param[in] K2 + * The last element of ipiv for which an interchange will + * be done. Must follow the Fortran numbering standard. + * + * @param[in] IPIV + * Vector of pivot indices. 
+ *
+ *******************************************************************************
+ *
+ * @retval CHAMELEON_SUCCESS successful exit
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zlaswp
+ * @sa CHAMELEON_zlaswp_Tile_Async
+ * @sa CHAMELEON_claswp_Tile
+ * @sa CHAMELEON_dlaswp_Tile
+ * @sa CHAMELEON_slaswp_Tile
+ *
+ */
+int CHAMELEON_zlaswp_Tile( cham_side_t  side,
+                           cham_dir_t   dir,
+                           CHAM_desc_t *A,
+                           int          K1,
+                           int          K2,
+                           CHAM_ipiv_t *IPIV )
+{
+    RUNTIME_request_t   req = RUNTIME_REQUEST_INITIALIZER;
+    RUNTIME_sequence_t *seq = NULL;
+    CHAM_context_t     *ctx = chameleon_context_self();
+    int                 rc;
+
+    /* Nothing can be done without an initialized library context */
+    if ( ctx == NULL ) {
+        chameleon_fatal_error("CHAMELEON_zlaswp_Tile", "CHAMELEON not initialized");
+        return CHAMELEON_ERR_NOT_INITIALIZED;
+    }
+
+    /* The right-side variant is not implemented */
+    if ( side == ChamRight ) {
+        chameleon_fatal_error("CHAMELEON_zlaswp", "Only ChamLeft is implemented");
+        return CHAMELEON_ERR_NOT_SUPPORTED;
+    }
+
+    /* K1 and K2 are 1-based and have to lie within [1, A->m] */
+    if ( !( ( K1 >= 1 ) && ( K1 <= A->m ) ) ) {
+        chameleon_error("CHAMELEON_zlaswp", "illegal value of K1");
+        return CHAMELEON_ERR_ILLEGAL_VALUE;
+    }
+    if ( !( ( K2 >= 1 ) && ( K2 <= A->m ) ) ) {
+        chameleon_error("CHAMELEON_zlaswp", "illegal value of K2");
+        return CHAMELEON_ERR_ILLEGAL_VALUE;
+    }
+
+    /* Run the asynchronous variant on a private sequence and wait for it */
+    chameleon_sequence_create( ctx, &seq );
+    CHAMELEON_zlaswp_Tile_Async( side, dir, A, K1, K2, IPIV, seq, &req );
+
+    CHAMELEON_Desc_Flush( A, seq );
+    CHAMELEON_Ipiv_Flush( IPIV, seq );
+
+    chameleon_sequence_wait( ctx, seq );
+
+    /* The sequence status is the result of the whole operation */
+    rc = seq->status;
+    chameleon_sequence_destroy( ctx, seq );
+
+    return rc;
+}
+
+/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile_Async + * + * @brief Non-blocking equivalent of CHAMELEON_zlaswp_Tile(). + * + * May return before the computation is finished. + * Allows for pipelining of operations at runtime. 
+ * + ******************************************************************************* + * + * @param[in] side + * Specifies whether the permutation is done on the rows or the columns. + * = ChamLeft: op(A) = A + * = ChamRight: op(A) = A^T + * + * @param[in] dir + * Specifies the order of the permutation. + * = ChamDirForward: Natural order. P*op(A) + * = ChamDirBackward: Reverse order. op(A)*P + * + * @param[in,out] A + * The M-by-N matrix A. + * + * @param[in] K1 + * The first element of IPIV for which an interchange will + * be done. Must follow the Fortran numbering standard + * + * @param[in] K2 + * The last element of ipiv for which an interchange will + * be done. Must follow the Fortran numbering standard. + * + * @param[in] IPIV + * Vector of pivot indices. + * + * @param[in] sequence + * Identifies the sequence of function calls that this call belongs to + * (for completion checks and exception handling purposes). + * + * @param[out] request + * Identifies this function call (for exception handling purposes). 
+ *
+ *******************************************************************************
+ *
+ * @retval CHAMELEON_SUCCESS successful exit
+ * @retval CHAMELEON_ERR_NOT_INITIALIZED if no Chameleon context is available
+ * @retval CHAMELEON_ERR_NOT_SUPPORTED if side is ChamRight
+ * @retval CHAMELEON_ERR_ILLEGAL_VALUE if K1 or K2 is outside [1, A->m]
+ * @retval CHAMELEON_ERR_UNALLOCATED if sequence or request is NULL
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zlaswp
+ * @sa CHAMELEON_zlaswp_Tile
+ * @sa CHAMELEON_claswp_Tile_Async
+ * @sa CHAMELEON_dlaswp_Tile_Async
+ * @sa CHAMELEON_slaswp_Tile_Async
+ *
+ */
+int CHAMELEON_zlaswp_Tile_Async( cham_side_t         side,
+                                 cham_dir_t          dir,
+                                 CHAM_desc_t        *A,
+                                 int                 K1,
+                                 int                 K2,
+                                 CHAM_ipiv_t        *IPIV,
+                                 RUNTIME_sequence_t *sequence,
+                                 RUNTIME_request_t  *request )
+{
+    CHAM_context_t             *chamctxt;
+    struct chameleon_pzgetrf_s *ws;
+    RUNTIME_option_t            options;
+    int                         k, tempkm;
+
+    chamctxt = chameleon_context_self();
+    if ( chamctxt == NULL ) {
+        chameleon_fatal_error("CHAMELEON_zlaswp_Tile_Async", "CHAMELEON not initialized");
+        return CHAMELEON_ERR_NOT_INITIALIZED;
+    }
+    if ( side == ChamRight ) {
+        chameleon_fatal_error("CHAMELEON_zlaswp", "Only ChamLeft is implemented");
+        return CHAMELEON_ERR_NOT_SUPPORTED;
+    }
+    if ( ( K1 < 1 ) || ( K1 > A->m ) ) {
+        chameleon_error("CHAMELEON_zlaswp", "illegal value of K1");
+        return CHAMELEON_ERR_ILLEGAL_VALUE;
+    }
+    if ( ( K2 < 1 ) || ( K2 > A->m ) ) {
+        chameleon_error("CHAMELEON_zlaswp", "illegal value of K2");
+        return CHAMELEON_ERR_ILLEGAL_VALUE;
+    }
+    if ( sequence == NULL ) {
+        chameleon_fatal_error("CHAMELEON_zlaswp_Tile_Async", "NULL sequence");
+        return CHAMELEON_ERR_UNALLOCATED;
+    }
+    if ( request == NULL ) {
+        chameleon_fatal_error("CHAMELEON_zlaswp_Tile_Async", "NULL request");
+        return CHAMELEON_ERR_UNALLOCATED;
+    }
+    /* Check sequence status */
+    if ( sequence->status == CHAMELEON_SUCCESS ) {
+        request->status = CHAMELEON_SUCCESS;
+    }
+    else {
+        return chameleon_request_fail(sequence, request, CHAMELEON_ERR_SEQUENCE_FLUSHED);
+    }
+
+    /* Check descriptors for correctness */
+    if ( chameleon_desc_check(A) != CHAMELEON_SUCCESS ) {
+        chameleon_error("CHAMELEON_zlaswp_Tile_Async", "invalid first descriptor");
+        return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE);
+    }
+    /* Check input arguments */
+    if ( A->mb != A->nb ) {
+        chameleon_error("CHAMELEON_zlaswp_Tile_Async", "only matching tile sizes supported");
+        return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE);
+    }
+
+    /* Quick return */
+    if ( chameleon_min( A->m, A->n ) == 0 ) {
+        return CHAMELEON_SUCCESS;
+    }
+
+    /*
+     * When the IPIV descriptor carries user data, convert each block of
+     * pivots into a permutation restricted to the 0-based range
+     * [K1-1, K2-1]. This path blocks on the sequence before the swap is
+     * submitted.
+     */
+    if ( IPIV->data != NULL ) {
+        RUNTIME_options_init( &options, chamctxt, sequence, request );
+        for ( k = 0; k < A->mt; k++ ) {
+            tempkm = A->get_blkdim( A, k, DIM_m, A->m );
+            INSERT_TASK_ipiv_to_perm( &options, k * A->mb, tempkm, tempkm, K1 - 1, K2 - 1,
+                                      IPIV, k );
+        }
+        chameleon_sequence_wait( chamctxt, sequence );
+    }
+
+    ws = CHAMELEON_zgetrf_WS_Alloc( A );
+
+    chameleon_pzlaswp( ws, dir, A, IPIV, sequence, request );
+
+    /*
+     * The workspace is owned by this call, so the tasks submitted above have
+     * to be completed before it is released; freeing it right after the
+     * submission would leave running tasks with a dangling workspace.
+     */
+    chameleon_sequence_wait( chamctxt, sequence );
+    CHAMELEON_zgetrf_WS_Free( ws );
+
+    return CHAMELEON_SUCCESS;
+}
+
diff --git a/control/compute_z.h b/control/compute_z.h index 812af3dce918e74926506810d38b2db8fc167e33..72f4504a4544e7098fa2335ea15deb4720dbc635 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -24,7 +24,7 @@ * @author Lionel Eyraud-Dubois * @author Ana Hourcau * @author Pierre Esterie - * @date 2024-12-09 + * @date 2025-03-24 * @precisions normal z -> c d s * */ @@ -172,7 +172,7 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra void chameleon_pzlascal(cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzlaset( cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzlaset2(cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); -void chameleon_pzlaswp(CHAM_desc_t *B, int *IPIV, int inc, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +void chameleon_pzlaswp( struct chameleon_pzgetrf_s *ws, cham_dir_t dir, CHAM_desc_t *A, CHAM_ipiv_t *IPIV, RUNTIME_sequence_t *sequence, 
RUNTIME_request_t *request ); void chameleon_pzlaswpc(CHAM_desc_t *B, int *IPIV, int inc, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzlatms( cham_dist_t idist, unsigned long long int seed, cham_sym_t sym, double *D, int mode, double cond, double dmax, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); void chameleon_pzlauum(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); diff --git a/control/descriptor_ipiv.c b/control/descriptor_ipiv.c index 84067cf5f987c425c7e5f1b989844fe7e1df7c64..d46269d32c670223b595842ff60569b65573e5b5 100644 --- a/control/descriptor_ipiv.c +++ b/control/descriptor_ipiv.c @@ -14,7 +14,7 @@ * @author Matthieu Kuhn * @author Alycia Lisito * @author Florent Pruvost - * @date 2024-08-29 + * @date 2025-03-24 * *** * @@ -148,6 +148,45 @@ int CHAMELEON_Ipiv_Create( CHAM_ipiv_t **ipivptr, const CHAM_desc_t *desc, void return CHAMELEON_SUCCESS; } +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * @brief initialize the IPIV descriptor. + * + ******************************************************************************* + * + * @param[in] descA + * Descriptor of the matrix A. + * + * @param[in,out] descIPIV + * Descriptor of the pivot array. Should be initialized using + * CHAMELEON_Ipiv_Create() with data filled with the vector of pivot. 
+ *
+ *******************************************************************************
+ *
+ * The call is blocking: the initialization task is submitted on a private
+ * sequence which is waited for and destroyed before returning. Nothing is
+ * done when the library is not initialized.
+ *
+ */
+void CHAMELEON_Ipiv_Init( const CHAM_desc_t *descA,
+                          CHAM_ipiv_t       *descIPIV )
+{
+    RUNTIME_option_t    options;
+    RUNTIME_request_t   request  = RUNTIME_REQUEST_INITIALIZER;
+    RUNTIME_sequence_t *sequence = NULL;
+    CHAM_context_t     *chamctxt;
+
+    /* descA is part of the interface but is not needed by the task below */
+    (void)descA;
+
+    chamctxt = chameleon_context_self();
+    if ( chamctxt == NULL ) {
+        chameleon_fatal_error("CHAMELEON_Ipiv_Init", "CHAMELEON not initialized");
+        return;
+    }
+
+    chameleon_sequence_create( chamctxt, &sequence );
+    RUNTIME_options_init( &options, chamctxt, sequence, &request );
+
+    INSERT_TASK_ipiv_init_data( &options, descIPIV );
+
+    chameleon_sequence_wait( chamctxt, sequence );
+    chameleon_sequence_destroy( chamctxt, sequence );
+}
+
 /** ***************************************************************************** * diff --git a/coreblas/compute/core_ipiv_to_perm.c b/coreblas/compute/core_ipiv_to_perm.c index 6c19272b3eaec960eb457fc40e9ba77f06b24075..9b2b53ceedec775158a9e5ce192a7abdd7120896 100644 --- a/coreblas/compute/core_ipiv_to_perm.c +++ b/coreblas/compute/core_ipiv_to_perm.c @@ -11,7 +11,8 @@ * * @version 1.3.0 * @author Mathieu Faverge - * @date 2024-02-18 + * @author Matteo Marcos + * @date 2025-03-24 */ #include "coreblas.h" @@ -44,6 +45,14 @@ * @param[in] k * The number of elements in ipiv. k >= 0. * + * @param[in] K1 + * The first element of IPIV for which an interchange will + * be done. + * + * @param[in] K2 + * The last element of ipiv for which an interchange will + * be done. + * * @param[in] ipiv * The pivot array of size n. This is a (m0+1)-based indices array to follow * the Fortran standard. @@ -55,7 +64,7 @@ * The permutation array of the origin row indices (m0-based) of the [1,n] set of rows. 
* */ -void CORE_ipiv_to_perm( int m0, int m, int k, int *ipiv, int *perm, int *invp ) +void CORE_ipiv_to_perm( int m0, int m, int k, int K1, int K2, int *ipiv, int *perm, int *invp ) { int i, j, ip; int i_1, ip_1; @@ -66,6 +75,9 @@ void CORE_ipiv_to_perm( int m0, int m, int k, int *ipiv, int *perm, int *invp ) } for(i = 0; i < k; i++) { + if ( ( i + m0 < K1 ) || ( i + m0 > K2 ) ) { + continue; + } ip = ipiv[i]-1; assert( ip - m0 >= i ); diff --git a/coreblas/include/coreblas.h b/coreblas/include/coreblas.h index c72530c108bf89bae0a3456b7365d7e43605deec..623dcd541b974e25477560fa03dfbfb2dedfcb8e 100644 --- a/coreblas/include/coreblas.h +++ b/coreblas/include/coreblas.h @@ -18,7 +18,8 @@ * @author Guillaume Sylvand * @author Mathieu Faverge * @author Raphael Boucherie - * @date 2024-03-14 + * @author Matteo Marcos + * @date 2025-03-24 * */ #ifndef _coreblas_h_ @@ -94,7 +95,7 @@ void __coreblas_kernel_trace( const char *func, ... ); #endif -void CORE_ipiv_to_perm( int m0, int m, int k, int *ipiv, int *perm, int *invp ); +void CORE_ipiv_to_perm( int m0, int m, int k, int K1, int K2, int *ipiv, int *perm, int *invp ); END_C_DECLS diff --git a/include/chameleon/chameleon_z.h b/include/chameleon/chameleon_z.h index fcd8177a0a0cc56a6e21080463956c98918d1a46..d9540d5e8472ed15af9ead33296f9ac9cd76d6ab 100644 --- a/include/chameleon/chameleon_z.h +++ b/include/chameleon/chameleon_z.h @@ -24,7 +24,8 @@ * @author Alycia Lisito * @author Matthieu Kuhn * @author Ana Hourcau - * @date 2024-10-17 + * @author Matteo Marcos + * @date 2025-03-24 * @precisions normal z -> c d s * */ @@ -48,7 +49,7 @@ int CHAMELEON_zgemm(cham_trans_t transA, cham_trans_t transB, int M, int N, int int CHAMELEON_zgepdf_qdwh( int M, int N, CHAMELEON_Complex64_t *A, int LDA, CHAMELEON_Complex64_t *H, int LDH, gepdf_info_t *info ); int CHAMELEON_zgeqrf(int M, int N, CHAMELEON_Complex64_t *A, int LDA, CHAM_desc_t *descT); int CHAMELEON_zgeqrs(int M, int N, int NRHS, CHAMELEON_Complex64_t *A, int LDA, CHAM_desc_t 
*descT, CHAMELEON_Complex64_t *B, int LDB); -//int CHAMELEON_zgesv(int N, int NRHS, CHAMELEON_Complex64_t *A, int LDA, int *IPIV, CHAMELEON_Complex64_t *B, int LDB); +int CHAMELEON_zgesv(int N, int NRHS, CHAMELEON_Complex64_t *A, int LDA, int *IPIV, CHAMELEON_Complex64_t *B, int LDB); int CHAMELEON_zgesv_incpiv(int N, int NRHS, CHAMELEON_Complex64_t *A, int LDA, CHAM_desc_t *descL, int *IPIV, CHAMELEON_Complex64_t *B, int LDB); int CHAMELEON_zgesv_nopiv(int N, int NRHS, CHAMELEON_Complex64_t *A, int LDA, CHAMELEON_Complex64_t *B, int LDB); int CHAMELEON_zgesvd(cham_job_t jobu, cham_job_t jobvt, int M, int N, CHAMELEON_Complex64_t *A, int LDA, double *S, CHAM_desc_t *descT, CHAMELEON_Complex64_t *U, int LDU, CHAMELEON_Complex64_t *VT, int LDVT); @@ -57,7 +58,7 @@ int CHAMELEON_zgetrf_incpiv(int M, int N, CHAMELEON_Complex64_t *A, int LDA, CHA int CHAMELEON_zgetrf_nopiv(int M, int N, CHAMELEON_Complex64_t *A, int LDA); int CHAMELEON_zgetrf( int M, int N, CHAMELEON_Complex64_t *A, int LDA, int *IPIV ); //int CHAMELEON_zgetri(int N, CHAMELEON_Complex64_t *A, int LDA, int *IPIV); -//int CHAMELEON_zgetrs(cham_trans_t trans, int N, int NRHS, CHAMELEON_Complex64_t *A, int LDA, int *IPIV, CHAMELEON_Complex64_t *B, int LDB); +int CHAMELEON_zgetrs(cham_trans_t trans, int N, int NRHS, CHAMELEON_Complex64_t *A, int LDA, int *IPIV, CHAMELEON_Complex64_t *B, int LDB); int CHAMELEON_zgetrs_incpiv(cham_trans_t trans, int N, int NRHS, CHAMELEON_Complex64_t *A, int LDA, CHAM_desc_t *descL, int *IPIV, CHAMELEON_Complex64_t *B, int LDB); int CHAMELEON_zgetrs_nopiv(cham_trans_t trans, int N, int NRHS, CHAMELEON_Complex64_t *A, int LDA, CHAMELEON_Complex64_t *B, int LDB); int CHAMELEON_zhemm(cham_side_t side, cham_uplo_t uplo, int M, int N, CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t *A, int LDA, CHAMELEON_Complex64_t *B, int LDB, CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t *C, int LDC); @@ -76,8 +77,7 @@ double CHAMELEON_zlansy(cham_normtype_t norm, cham_uplo_t uplo, int N, 
CHAMELEON double CHAMELEON_zlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, int M, int N, CHAMELEON_Complex64_t *A, int LDA); int CHAMELEON_zlascal(cham_uplo_t uplo, int M, int N, CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t *A, int LDA); int CHAMELEON_zlaset(cham_uplo_t uplo, int M, int N, CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t *A, int LDA); -//int CHAMELEON_zlaswp(int N, CHAMELEON_Complex64_t *A, int LDA, int K1, int K2, int *IPIV, int INCX); -//int CHAMELEON_zlaswpc(int N, CHAMELEON_Complex64_t *A, int LDA, int K1, int K2, int *IPIV, int INCX); +int CHAMELEON_zlaswp( cham_side_t side, cham_dir_t dir, int M, int N, CHAMELEON_Complex64_t *A, int LDA, int K1, int K2, int *IPIV ); int CHAMELEON_zlatms( int M, int N, cham_dist_t idist, unsigned long long int seed, cham_sym_t sym, double *D, int mode, double cond, double dmax, CHAMELEON_Complex64_t *A, int LDA ); int CHAMELEON_zlauum(cham_uplo_t uplo, int N, CHAMELEON_Complex64_t *A, int LDA); int CHAMELEON_zplghe( double bump, cham_uplo_t uplo, int N, CHAMELEON_Complex64_t *A, int LDA, unsigned long long int seed ); @@ -129,7 +129,7 @@ int CHAMELEON_zgepdf_qdwh_Tile( CHAM_desc_t *A, CHAM_desc_t *H, gepdf_info_t *in int CHAMELEON_zgepdf_qr_Tile( int doqr, int optid, const libhqr_tree_t *qrtreeT, const libhqr_tree_t *qrtreeB, CHAM_desc_t *A1, CHAM_desc_t *TS1, CHAM_desc_t *TT1, CHAM_desc_t *Q1, CHAM_desc_t *A2, CHAM_desc_t *TS2, CHAM_desc_t *TT2, CHAM_desc_t *Q2 ); int CHAMELEON_zgeqrf_Tile(CHAM_desc_t *A, CHAM_desc_t *T); int CHAMELEON_zgeqrs_Tile(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *B); -//int CHAMELEON_zgesv_Tile(CHAM_desc_t *A, int *IPIV, CHAM_desc_t *B); +int CHAMELEON_zgesv_Tile(CHAM_desc_t *A, CHAM_ipiv_t *IPIV, CHAM_desc_t *B); int CHAMELEON_zgesv_incpiv_Tile(CHAM_desc_t *A, CHAM_desc_t *L, int *IPIV, CHAM_desc_t *B); int CHAMELEON_zgesv_nopiv_Tile(CHAM_desc_t *A, CHAM_desc_t *B); int CHAMELEON_zgesvd_Tile(cham_job_t jobu, cham_job_t jobvt, 
CHAM_desc_t *A, double *S, CHAM_desc_t *T, CHAMELEON_Complex64_t *U, int LDU, CHAMELEON_Complex64_t *VT, int LDVT); @@ -138,7 +138,7 @@ int CHAMELEON_zgetrf_incpiv_Tile(CHAM_desc_t *A, CHAM_desc_t *L, int *IPIV); int CHAMELEON_zgetrf_nopiv_Tile(CHAM_desc_t *A); int CHAMELEON_zgetrf_Tile( CHAM_desc_t *A, CHAM_ipiv_t *IPIV ); //int CHAMELEON_zgetri_Tile(CHAM_desc_t *A, int *IPIV); -//int CHAMELEON_zgetrs_Tile(cham_trans_t trans, CHAM_desc_t *A, int *IPIV, CHAM_desc_t *B); +int CHAMELEON_zgetrs_Tile(cham_trans_t trans, CHAM_desc_t *A, CHAM_ipiv_t *IPIV, CHAM_desc_t *B); int CHAMELEON_zgetrs_incpiv_Tile(CHAM_desc_t *A, CHAM_desc_t *L, int *IPIV, CHAM_desc_t *B); int CHAMELEON_zgetrs_nopiv_Tile(CHAM_desc_t *A, CHAM_desc_t *B); int CHAMELEON_zhemm_Tile(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C); @@ -157,8 +157,7 @@ double CHAMELEON_zlansy_Tile(cham_normtype_t norm, cham_uplo_t uplo, CHAM_desc_t double CHAMELEON_zlantr_Tile(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A); int CHAMELEON_zlascal_Tile(cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A); int CHAMELEON_zlaset_Tile(cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta, CHAM_desc_t *A); -//int CHAMELEON_zlaswp_Tile(CHAM_desc_t *A, int K1, int K2, int *IPIV, int INCX); -//int CHAMELEON_zlaswpc_Tile(CHAM_desc_t *A, int K1, int K2, int *IPIV, int INCX); +int CHAMELEON_zlaswp_Tile( cham_side_t side, cham_dir_t dir, CHAM_desc_t *A, int K1, int K2, CHAM_ipiv_t *IPIV ); int CHAMELEON_zlatms_Tile( cham_dist_t idist, unsigned long long int seed, cham_sym_t sym, double *D, int mode, double cond, double dmax, CHAM_desc_t *A ); int CHAMELEON_zlauum_Tile(cham_uplo_t uplo, CHAM_desc_t *A); int CHAMELEON_zplghe_Tile(double bump, cham_uplo_t uplo, CHAM_desc_t *A, unsigned long long int seed ); @@ -209,7 +208,7 @@ int CHAMELEON_zgemm_Tile_Async(cham_trans_t transA, 
cham_trans_t transB, CHAMELE int CHAMELEON_zgepdf_qdwh_Tile_Async( CHAM_desc_t *A, CHAM_desc_t *H, gepdf_info_t *info, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); int CHAMELEON_zgeqrf_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *T, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgeqrs_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); -//int CHAMELEON_zgesv_Tile_Async(CHAM_desc_t *A, int *IPIV, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +int CHAMELEON_zgesv_Tile_Async(CHAM_desc_t *A, CHAM_ipiv_t *IPIV, CHAM_desc_t *B, void *user_wsA, void *user_wsB, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgesv_incpiv_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *L, int *IPIV, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgesv_nopiv_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *B, void * ws, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgesvd_Tile_Async(cham_job_t jobu, cham_job_t jobvt, CHAM_desc_t *A, double *S, CHAM_desc_t *T, CHAMELEON_Complex64_t *U, int LDU, CHAMELEON_Complex64_t *VT, int LDVT, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); @@ -218,7 +217,7 @@ int CHAMELEON_zgetrf_incpiv_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *L, int *IPIV int CHAMELEON_zgetrf_nopiv_Tile_Async(CHAM_desc_t *A, void * ws, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgetrf_Tile_Async( CHAM_desc_t *A, CHAM_ipiv_t *IPIV, void *ws, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); //int CHAMELEON_zgetri_Tile_Async(CHAM_desc_t *A, int *IPIV, CHAM_desc_t *W, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); -//int CHAMELEON_zgetrs_Tile_Async(cham_trans_t trans, CHAM_desc_t *A, int *IPIV, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +int CHAMELEON_zgetrs_Tile_Async(cham_trans_t trans, 
CHAM_desc_t *A, CHAM_ipiv_t *IPIV, CHAM_desc_t *B, void *user_ws, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgetrs_incpiv_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *L, int *IPIV, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgetrs_nopiv_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zhemm_Tile_Async(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, void *ws, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); @@ -237,8 +236,7 @@ int CHAMELEON_zlansy_Tile_Async(cham_normtype_t norm, cham_uplo_t uplo, CHAM_des int CHAMELEON_zlantr_Tile_Async(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, double *value, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zlascal_Tile_Async(cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zlaset_Tile_Async(cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); -//int CHAMELEON_zlaswp_Tile_Async(CHAM_desc_t *A, int K1, int K2, int *IPIV, int INCX, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); -//int CHAMELEON_zlaswpc_Tile_Async(CHAM_desc_t *A, int K1, int K2, int *IPIV, int INCX, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +int CHAMELEON_zlaswp_Tile_Async( cham_side_t side, cham_dir_t dir, CHAM_desc_t *A, int K1, int K2, CHAM_ipiv_t *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); int CHAMELEON_zlatms_Tile_Async( cham_dist_t idist, unsigned long long int seed, cham_sym_t sym, double *D, int mode, double cond, double dmax, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); int 
CHAMELEON_zlauum_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zplghe_Tile_Async(double bump, cham_uplo_t uplo, CHAM_desc_t *A, unsigned long long int seed, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); @@ -371,6 +369,7 @@ int CHAMELEON_zLapack_to_Tile( CHAMELEON_Complex64_t *Af77, int LDA, CHAM_desc_t int CHAMELEON_zTile_to_Lapack( CHAM_desc_t *A, CHAMELEON_Complex64_t *Af77, int LDA ) __attribute__((deprecated("Please refer to CHAMELEON_zDesc2Lap() instead"))); int CHAMELEON_zLap2Desc( cham_uplo_t uplo, CHAMELEON_Complex64_t *Af77, int LDA, CHAM_desc_t *A ); int CHAMELEON_zDesc2Lap( cham_uplo_t uplo, CHAM_desc_t *A, CHAMELEON_Complex64_t *Af77, int LDA ); +void CHAMELEON_Ipiv_Init( const CHAM_desc_t *descA, CHAM_ipiv_t *descIPIV ); /** * User Builder function prototypes diff --git a/include/chameleon/tasks.h b/include/chameleon/tasks.h index 1e5e242b274612406036f7f63cd73e82365be8a7..b9cd9fcb4be875946d42537f3c75a1997b9a8826 100644 --- a/include/chameleon/tasks.h +++ b/include/chameleon/tasks.h @@ -17,7 +17,8 @@ * @author Florent Pruvost * @author Matthieu Kuhn * @author Alycia Lisito - * @date 2024-09-06 + * @author Matteo Marcos + * @date 2025-03-24 * */ #ifndef _chameleon_tasks_h_ @@ -167,12 +168,14 @@ void INSERT_TASK_hgemm( const RUNTIME_option_t *options, const CHAM_desc_t *B, int Bm, int Bn, CHAMELEON_Real16_t beta, const CHAM_desc_t *C, int Cm, int Cn ); -void INSERT_TASK_ipiv_init ( const RUNTIME_option_t *options, - CHAM_ipiv_t *ipiv ); +void INSERT_TASK_ipiv_init( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv ); +void INSERT_TASK_ipiv_init_data( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv ); void INSERT_TASK_ipiv_reducek( const RUNTIME_option_t *options, CHAM_ipiv_t *ws, int k, int h, int rank ); void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options, - int m0, int m, int k, + int m0, int m, int k, int K1, int K2, const CHAM_ipiv_t *ipivdesc, int 
ipivk ); #include "chameleon/tasks_z.h" diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index f444409b8279007d43d810f6b603595c414f9819..90b4578d47eadeab248d9d47cb45a4a93b74a1b2 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -25,7 +25,7 @@ * @author Romain Peressoni * @author Matthieu Kuhn * @author Ana Hourcau - * @date 2024-11-12 + * @date 2025-03-24 * @precisions normal z -> c d s * */ @@ -188,12 +188,12 @@ void INSERT_TASK_zlaset( const RUNTIME_option_t *options, void INSERT_TASK_zlaset2( const RUNTIME_option_t *options, cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *tileA, int tileAm, int tileAn ); -void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, +void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *tIPIV, int tIPIVk, const CHAM_desc_t *tileA, int tileAm, int tileAn, const CHAM_desc_t *tileB, int tileBm, int tileBn ); -void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, +void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *tIPIV, int tIPIVk, const CHAM_desc_t *tileA, int tileAm, int tileAn, @@ -588,15 +588,20 @@ void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, * * @ingroup CHAMELEON_Complex64_t * - * INSERT_TASK_zperm_allreduce - Perfoms an allreduce operation on the tile - * U(Um, Un) according to the permutation ipiv. This task is used in the LU - * factorization with partial pivoting. + * @brief Performs an allreduce operation on the tile + * U(Um, Un) according to the permutation ipiv. This task is used in the LU + * factorization with partial pivoting. * ******************************************************************************* * * @param[in] options * The runtime options data structure to pass through all insert_task calls. * + * @param[in] dir + * Specifies the order of the permutation.
+ * = ChamDirForward: Natural order + * = ChamDirBackward: Reverse order + * * @param[in] A * The descriptor of the matrix A. * @@ -630,6 +635,7 @@ void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, ******************************************************************************* */ void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + cham_dir_t dir, const CHAM_desc_t *A, CHAM_desc_t *U, int Um, @@ -645,9 +651,9 @@ void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, * * @ingroup CHAMELEON_Complex64_t * - * INSERT_TASK_zperm_allreduce_send_A - Sends the tile A(Am, An) to the processus - * involved in the permutation. This task is used in the LU factorization with - * partial pivoting. + * @brief Sends the tile A(Am, An) to the processes + * involved in the permutation. This task is used in the LU factorization with + * partial pivoting. * ******************************************************************************* * @@ -687,15 +693,20 @@ void INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, * * @ingroup CHAMELEON_Complex64_t * - * INSERT_TASK_zperm_allreduce_send_perm - Sends the permutation ipivk to the - * processus involved in the permutation. This task is used in the LU - * factorization with partial pivoting. + * @brief Sends the permutation ipivk to the + * processes involved in the permutation. This task is used in the LU + * factorization with partial pivoting. * ******************************************************************************* * * @param[in] options * The runtime options data structure to pass through all insert_task calls. * + * @param[in] dir + * Specifies the order of the permutation. + * = ChamDirForward: Natural order + * = ChamDirBackward: Reverse order + * * @param[in] ipiv * The pivot structure that contains the informations for the LU * factorization with partial pivoting.
@@ -715,6 +726,7 @@ void INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, ******************************************************************************* */ void INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, int myrank, @@ -726,15 +738,20 @@ void INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, * * @ingroup CHAMELEON_Complex64_t * - * INSERT_TASK_zperm_allreduce_send_invp - Sends the inverse permutation ipivk - * to the processus involved in the permutation. This task is used in the LU - * factorization with partial pivoting. + * @brief Sends the inverse permutation ipivk + * to the processes involved in the permutation. This task is used in the LU + * factorization with partial pivoting. * ******************************************************************************* * * @param[in] options * The runtime options data structure to pass through all insert_task calls. * + * @param[in] dir + * Specifies the order of the permutation. + * = ChamDirForward: Natural order + * = ChamDirBackward: Reverse order + * * @param[in] ipiv * The pivot structure that contains the informations for the LU * factorization with partial pivoting.
@@ -754,6 +771,7 @@ void INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, ******************************************************************************* */ void INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, @@ -761,3 +779,4 @@ void INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, int n ); #endif /* _chameleon_tasks_z_h_ */ + diff --git a/runtime/openmp/codelets/codelet_ipiv.c b/runtime/openmp/codelets/codelet_ipiv.c index c21d13280ea0d316f5cc5d1799345ae0e8a4bbb8..ccc7e8f46ea496f30d00d81dd0f418ba07fdd175 100644 --- a/runtime/openmp/codelets/codelet_ipiv.c +++ b/runtime/openmp/codelets/codelet_ipiv.c @@ -13,7 +13,8 @@ * @author Mathieu Faverge * @author Matthieu Kuhn * @author Alycia Lisito - * @date 2024-08-29 + * @author Matteo Marcos + * @date 2025-03-24 * */ #include "chameleon_openmp.h" @@ -28,6 +29,14 @@ void INSERT_TASK_ipiv_init( const RUNTIME_option_t *options, (void)ipiv; } +void INSERT_TASK_ipiv_init_data( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv ) +{ + assert( 0 ); + (void)options; + (void)ipiv; +} + void INSERT_TASK_ipiv_reducek( const RUNTIME_option_t *options, CHAM_ipiv_t *ipiv, int k, int h, int rank ) { @@ -40,7 +49,7 @@ void INSERT_TASK_ipiv_reducek( const RUNTIME_option_t *options, } void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options, - int m0, int m, int k, + int m0, int m, int k, int K1, int K2, const CHAM_ipiv_t *ipivdesc, int ipivk ) { int *ipiv = NULL; // get pointer from ipivdesc @@ -49,9 +58,11 @@ void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options, #pragma omp task firstprivate( m0, m, k ) depend( in:ipiv[0] ) depend( inout:perm[0] ) depend( inout:invp[0] ) { - CORE_ipiv_to_perm( m0, m, k, ipiv, perm, invp ); + CORE_ipiv_to_perm( m0, m, k, 1, m, ipiv, perm, invp ); } (void)options; + (void)K1; + (void)K2; (void)ipivk; } diff --git 
a/runtime/openmp/codelets/codelet_zlaswp.c b/runtime/openmp/codelets/codelet_zlaswp.c index bce58c771ef3052ce4d20d16232082cd9a746f66..93bf20aef11964fa548adb7739b000af575b04ba 100644 --- a/runtime/openmp/codelets/codelet_zlaswp.c +++ b/runtime/openmp/codelets/codelet_zlaswp.c @@ -11,7 +11,7 @@ * * @version 1.3.0 * @author Mathieu Faverge - * @date 2024-02-18 + * @date 2025-03-24 * @precisions normal z -> c d s * */ @@ -20,7 +20,7 @@ #include "coreblas/coreblas_ztile.h" void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, - int m0, int k, + cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *U, int Um, int Un ) @@ -38,10 +38,11 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, } (void)options; + (void)dir; } void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, - int m0, int k, + cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) @@ -59,4 +60,5 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, } (void)options; + (void)dir; } diff --git a/runtime/openmp/codelets/codelet_zperm_allreduce.c b/runtime/openmp/codelets/codelet_zperm_allreduce.c index eac34fdfd1f8a0814c277f7acb8a9b85cb594ec7..8b20a60fd43332dac7373edcb2de40ee552d050a 100644 --- a/runtime/openmp/codelets/codelet_zperm_allreduce.c +++ b/runtime/openmp/codelets/codelet_zperm_allreduce.c @@ -11,7 +11,7 @@ * * @version 1.3.0 * @author Alycia Lisito - * @date 2024-11-12 + * @date 2025-03-24 * @precisions normal z -> c d s * */ @@ -38,6 +38,7 @@ INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, int myrank, @@ -45,6 +46,7 @@ INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, int *proc_involved ) { (void)options; + (void)dir; (void)ipiv; 
(void)ipivk; (void)myrank; @@ -54,6 +56,7 @@ INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, @@ -61,6 +64,7 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, int n ) { (void)options; + (void)dir; (void)ipiv; (void)ipivk; (void)A; @@ -70,6 +74,7 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + cham_dir_t dir, const CHAM_desc_t *A, CHAM_desc_t *U, int Um, @@ -81,6 +86,7 @@ INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, void *ws ) { (void)options; + (void)dir; (void)A; (void)U; (void)Um; diff --git a/runtime/parsec/codelets/codelet_ipiv.c b/runtime/parsec/codelets/codelet_ipiv.c index b6d582e5ac8514525665adebd27b9459a9076005..2145e00b3575d7de659f28422064616815acd22a 100644 --- a/runtime/parsec/codelets/codelet_ipiv.c +++ b/runtime/parsec/codelets/codelet_ipiv.c @@ -13,7 +13,8 @@ * @author Mathieu Faverge * @author Matthieu Kuhn * @author Alycia Lisito - * @date 2024-08-29 + * @author Matteo Marcos + * @date 2025-03-24 * */ #include "chameleon_parsec.h" @@ -28,6 +29,14 @@ void INSERT_TASK_ipiv_init( const RUNTIME_option_t *options, (void)ipiv; } +void INSERT_TASK_ipiv_init_data( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv ) +{ + assert( 0 ); + (void)options; + (void)ipiv; +} + void INSERT_TASK_ipiv_reducek( const RUNTIME_option_t *options, CHAM_ipiv_t *ipiv, int k, int h, int rank ) { @@ -49,14 +58,14 @@ CORE_ipiv_to_perm_parsec( parsec_execution_stream_t *context, parsec_dtd_unpack_args( this_task, &m0, &m, &k, &ipiv, &perm, &invp ); - CORE_ipiv_to_perm( m0, m, k, ipiv, perm, invp ); + CORE_ipiv_to_perm( m0, m, k, 1, m, ipiv, perm, invp ); (void)context; return PARSEC_HOOK_RETURN_DONE; } void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options, - int 
m0, int m, int k, + int m0, int m, int k, int K1, int K2, const CHAM_ipiv_t *ipivdesc, int ipivk ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); @@ -70,4 +79,7 @@ void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options, PASSED_BY_REF, RUNTIME_perm_getaddr( ipivdesc, ipivk ), chameleon_parsec_get_arena_index_perm( ipivdesc ) | OUTPUT, PASSED_BY_REF, RUNTIME_invp_getaddr( ipivdesc, ipivk ), chameleon_parsec_get_arena_index_invp( ipivdesc ) | OUTPUT, PARSEC_DTD_ARG_END ); + + (void)K1; + (void)K2; } diff --git a/runtime/parsec/codelets/codelet_zlaswp.c b/runtime/parsec/codelets/codelet_zlaswp.c index 12aaf7089ff41f4e4090e0fb6f18e518c9813fd3..65849c96d1aae96cc1000dd93e5efbebe481c7d9 100644 --- a/runtime/parsec/codelets/codelet_zlaswp.c +++ b/runtime/parsec/codelets/codelet_zlaswp.c @@ -11,7 +11,7 @@ * * @version 1.3.0 * @author Mathieu Faverge - * @date 2024-02-18 + * @date 2025-03-24 * @precisions normal z -> c d s * */ @@ -33,7 +33,7 @@ CORE_zlaswp_get_parsec( parsec_execution_stream_t *context, } void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, - int m0, int k, + cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *U, int Um, int Un ) @@ -54,6 +54,8 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, sizeof(int), &(tileU->ld), VALUE, PASSED_BY_REF, RUNTIME_perm_getaddr( ipiv, ipivk ), chameleon_parsec_get_arena_index_perm( ipiv ) | INPUT, PARSEC_DTD_ARG_END ); + + (void)dir; } static inline int @@ -70,7 +72,7 @@ CORE_zlaswp_set_parsec( parsec_execution_stream_t *context, } void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, - int m0, int k, + cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) @@ -91,4 +93,6 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, sizeof(int), &(tileB->ld), VALUE, 
PASSED_BY_REF, RUNTIME_invp_getaddr( ipiv, ipivk ), chameleon_parsec_get_arena_index_invp( ipiv ) | INPUT, PARSEC_DTD_ARG_END ); + + (void)dir; } diff --git a/runtime/parsec/codelets/codelet_zperm_allreduce.c b/runtime/parsec/codelets/codelet_zperm_allreduce.c index 9ceb440c8a4e677630a68355daa7defda7f904fa..f68148e24b5e4c4e7d42d7248c8bf4a9948477c6 100644 --- a/runtime/parsec/codelets/codelet_zperm_allreduce.c +++ b/runtime/parsec/codelets/codelet_zperm_allreduce.c @@ -11,7 +11,7 @@ * * @version 1.3.0 * @author Alycia Lisito - * @date 2024-11-12 + * @date 2025-03-24 * @precisions normal z -> c d s * */ @@ -38,6 +38,7 @@ INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, int myrank, @@ -45,6 +46,7 @@ INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, int *proc_involved ) { (void)options; + (void)dir; (void)ipiv; (void)ipivk; (void)myrank; @@ -54,6 +56,7 @@ INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, @@ -61,6 +64,7 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, int n ) { (void)options; + (void)dir; (void)ipiv; (void)ipivk; (void)A; @@ -70,6 +74,7 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + cham_dir_t dir, const CHAM_desc_t *A, CHAM_desc_t *U, int Um, @@ -81,6 +86,7 @@ INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, void *ws ) { (void)options; + (void)dir; (void)A; (void)U; (void)Um; diff --git a/runtime/quark/codelets/codelet_ipiv.c b/runtime/quark/codelets/codelet_ipiv.c index 8075d0f8a43fc8fe2498cafdb39ae034483aa5d2..bf0846d3dfe9d6043162827a4d0a3eab9414caed 100644 --- 
a/runtime/quark/codelets/codelet_ipiv.c +++ b/runtime/quark/codelets/codelet_ipiv.c @@ -13,7 +13,8 @@ * @author Mathieu Faverge * @author Matthieu Kuhn * @author Alycia Lisito - * @date 2024-08-29 + * @author Matteo Marcos + * @date 2025-03-24 * */ #include "chameleon_quark.h" @@ -28,6 +29,14 @@ void INSERT_TASK_ipiv_init( const RUNTIME_option_t *options, (void)ipiv; } +void INSERT_TASK_ipiv_init_data( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv ) +{ + assert( 0 ); + (void)options; + (void)ipiv; +} + void INSERT_TASK_ipiv_reducek( const RUNTIME_option_t *options, CHAM_ipiv_t *ipiv, int k, int h, int rank ) { @@ -47,11 +56,11 @@ CORE_ipiv_to_perm_quark( Quark *quark ) quark_unpack_args_6( quark, m0, m, k, ipiv, perm, invp ); - CORE_ipiv_to_perm( m0, m, k, ipiv, perm, invp ); + CORE_ipiv_to_perm( m0, m, k, 1, m, ipiv, perm, invp ); } void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options, - int m0, int m, int k, + int m0, int m, int k, int K1, int K2, const CHAM_ipiv_t *ipivdesc, int ipivk ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); @@ -65,4 +74,7 @@ void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options, sizeof(int*), RUNTIME_perm_getaddr( ipivdesc, ipivk ), OUTPUT, sizeof(int*), RUNTIME_invp_getaddr( ipivdesc, ipivk ), OUTPUT, 0 ); + + (void)K1; + (void)K2; } diff --git a/runtime/quark/codelets/codelet_zlaswp.c b/runtime/quark/codelets/codelet_zlaswp.c index 176dd16916eb51e1b698ad0d17dbd0d37c1a1d61..8f5a1b57fd52bd2e401273171584ebcca1478e50 100644 --- a/runtime/quark/codelets/codelet_zlaswp.c +++ b/runtime/quark/codelets/codelet_zlaswp.c @@ -11,7 +11,7 @@ * * @version 1.3.0 * @author Mathieu Faverge - * @date 2024-02-18 + * @date 2025-03-24 * @precisions normal z -> c d s * */ @@ -30,7 +30,7 @@ static void CORE_zlaswp_get_quark( Quark *quark ) } void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, - int m0, int k, + cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, int Am, 
int An, const CHAM_desc_t *U, int Um, int Un ) @@ -46,6 +46,8 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, sizeof(CHAM_tile_t*), RTBLKADDR(A, ChamComplexDouble, Am, An), INPUT, sizeof(CHAM_tile_t*), RTBLKADDR(U, ChamComplexDouble, Um, Un), INOUT, 0 ); + + (void)dir; } static void CORE_zlaswp_set_quark( Quark *quark ) @@ -59,7 +61,7 @@ static void CORE_zlaswp_set_quark( Quark *quark ) } void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, - int m0, int k, + cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) @@ -75,4 +77,6 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, sizeof(CHAM_tile_t*), RTBLKADDR(A, ChamComplexDouble, Am, An), INPUT, sizeof(CHAM_tile_t*), RTBLKADDR(B, ChamComplexDouble, Bm, Bn), INOUT, 0 ); + + (void)dir; } diff --git a/runtime/quark/codelets/codelet_zperm_allreduce.c b/runtime/quark/codelets/codelet_zperm_allreduce.c index f297d343b33455ba6340f0b81c45e8d01d29600f..1a2a7089c8addc5715d074a6c04bc5e8732aed1b 100644 --- a/runtime/quark/codelets/codelet_zperm_allreduce.c +++ b/runtime/quark/codelets/codelet_zperm_allreduce.c @@ -11,7 +11,7 @@ * * @version 1.3.0 * @author Alycia Lisito - * @date 2024-11-12 + * @date 2025-03-24 * @precisions normal z -> c d s * */ @@ -38,6 +38,7 @@ INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, int myrank, @@ -45,6 +46,7 @@ INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, int *proc_involved ) { (void)options; + (void)dir; (void)ipiv; (void)ipivk; (void)myrank; @@ -54,6 +56,7 @@ INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, @@ -61,6 +64,7 @@ 
INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, int n ) { (void)options; + (void)dir; (void)ipiv; (void)ipivk; (void)A; @@ -70,6 +74,7 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + cham_dir_t dir, const CHAM_desc_t *A, CHAM_desc_t *U, int Um, @@ -81,6 +86,7 @@ INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, void *ws ) { (void)options; + (void)dir; (void)A; (void)U; (void)Um; diff --git a/runtime/starpu/codelets/codelet_ipiv.c b/runtime/starpu/codelets/codelet_ipiv.c index 4498c63f3cbaba8655c740f98f7bdc4cc5fea974..5a16c6e2dda5d2e411415bf368f214bbbc8ec71b 100644 --- a/runtime/starpu/codelets/codelet_ipiv.c +++ b/runtime/starpu/codelets/codelet_ipiv.c @@ -13,21 +13,28 @@ * @author Mathieu Faverge * @author Matthieu Kuhn * @author Alycia Lisito - * @date 2024-09-17 + * @author Matteo Marcos + * @date 2025-03-24 * */ #include "chameleon_starpu_internal.h" -#include "runtime_codelets.h" -static void cl_ipiv_init_cpu_func(void *descr[], void *cl_arg) +struct cl_laswp_args_s { + int m0; + int n; + int m; + int *data; +}; + +static void cl_ipiv_init_cpu_func( void *descr[], void *cl_arg ) { #if !defined(CHAMELEON_SIMULATION) - int *ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[0]); + int *ipiv = (int *)STARPU_VECTOR_GET_PTR( descr[0] ); int i, m0, n; starpu_codelet_unpack_args( cl_arg, &m0, &n ); - for( i=0; i<n; i++ ) { + for( i = 0; i < n; i++ ) { ipiv[i] = m0 + i + 1; } #endif @@ -46,10 +53,10 @@ void INSERT_TASK_ipiv_init( const RUNTIME_option_t *options, int64_t mb = ipiv->mb; int m; - for (m = 0; m < mt; m++) { + for ( m = 0; m < mt; m++ ) { starpu_data_handle_t ipiv_src = RUNTIME_ipiv_getaddr( ipiv, m ); int m0 = m * mb; - int n = (m == (mt-1)) ? ipiv->m - m0 : mb; + int n = ( m == ( mt - 1 ) ) ? 
ipiv->m - m0 : mb; rt_starpu_insert_task( &cl_ipiv_init, @@ -60,6 +67,62 @@ void INSERT_TASK_ipiv_init( const RUNTIME_option_t *options, } } +static void cl_ipiv_init_data_cpu_func( void *descr[], void *cl_arg ) +{ +#if !defined(CHAMELEON_SIMULATION) + struct cl_laswp_args_s *clargs = (struct cl_laswp_args_s *) cl_arg; + + int *ipiv = (int *)STARPU_VECTOR_GET_PTR( descr[0] ); + int n = clargs->n; + int i; + + for( i = 0; i < n; i++ ) { + ipiv[i] = clargs->data[i]; + } +#endif +} + +struct starpu_codelet cl_ipiv_init_data = { + .where = STARPU_CPU, + .cpu_func = cl_ipiv_init_data_cpu_func, + .nbuffers = 1, +}; + +void INSERT_TASK_ipiv_init_data( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv ) +{ + + int64_t mt = ipiv->mt; + int64_t mb = ipiv->mb; + int m; + + if ( ipiv->data == NULL ) { + return; + } + + for ( m = 0; m < mt; m++ ) { + starpu_data_handle_t ipiv_src = RUNTIME_ipiv_getaddr( ipiv, m ); + struct cl_laswp_args_s *cl_args; + int m0, n; + + m0 = m * mb; + n = ( m == ( mt-1 ) ) ? 
ipiv->m - m0 : mb; + + cl_args = malloc( sizeof(struct cl_laswp_args_s) ); + cl_args->m0 = m0; + cl_args->n = n; + cl_args->m = ipiv->desc->m; + + cl_args->data = ipiv->data + m0; + + rt_starpu_insert_task( + &cl_ipiv_init_data, + STARPU_CL_ARGS, cl_args, sizeof(struct cl_laswp_args_s), + STARPU_W, ipiv_src, + 0); + } +} + void INSERT_TASK_ipiv_reducek( const RUNTIME_option_t *options, CHAM_ipiv_t *ipiv, int k, int h, int rank ) { @@ -67,7 +130,7 @@ void INSERT_TASK_ipiv_reducek( const RUNTIME_option_t *options, #if defined(HAVE_STARPU_MPI_REDUX) && defined(CHAMELEON_USE_MPI) #if !defined(HAVE_STARPU_MPI_REDUX_WRAPUP) - starpu_data_handle_t nextpiv = RUNTIME_pivot_getaddr( ipiv, rank, k, h ); + starpu_data_handle_t nextpiv = RUNTIME_pivot_getaddr( ipiv, rank, k, h ); if ( h < ipiv->n ) { starpu_mpi_redux_data_prio_tree( options->sequence->comm, nextpiv, options->priority, 2 /* Binary tree */ ); @@ -86,16 +149,16 @@ void INSERT_TASK_ipiv_reducek( const RUNTIME_option_t *options, #if !defined(CHAMELEON_SIMULATION) static void cl_ipiv_to_perm_cpu_func( void *descr[], void *cl_arg ) { - int m0, m, k; + int m0, m, k, K1, K2; int *ipiv, *perm, *invp; - starpu_codelet_unpack_args( cl_arg, &m0, &m, &k ); + starpu_codelet_unpack_args( cl_arg, &m0, &m, &k, &K1, &K2 ); ipiv = (int*)STARPU_VECTOR_GET_PTR(descr[0]); perm = (int*)STARPU_VECTOR_GET_PTR(descr[1]); invp = (int*)STARPU_VECTOR_GET_PTR(descr[2]); - CORE_ipiv_to_perm( m0, m, k, ipiv, perm, invp ); + CORE_ipiv_to_perm( m0, m, k, K1, K2, ipiv, perm, invp ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -115,7 +178,7 @@ static struct starpu_codelet cl_ipiv_to_perm = { }; void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options, - int m0, int m, int k, + int m0, int m, int k, int K1, int K2, const CHAM_ipiv_t *ipivdesc, int ipivk ) { struct starpu_codelet *codelet = &cl_ipiv_to_perm; @@ -125,6 +188,8 @@ void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options, STARPU_VALUE, &m0, sizeof(int), STARPU_VALUE, 
&m, sizeof(int), STARPU_VALUE, &k, sizeof(int), + STARPU_VALUE, &K1, sizeof(int), + STARPU_VALUE, &K2, sizeof(int), STARPU_R, RUNTIME_ipiv_getaddr( ipivdesc, ipivk ), STARPU_W, RUNTIME_perm_getaddr( ipivdesc, ipivk ), STARPU_W, RUNTIME_invp_getaddr( ipivdesc, ipivk ), @@ -132,3 +197,4 @@ void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options, STARPU_EXECUTE_ON_WORKER, options->workerid, 0 ); } + diff --git a/runtime/starpu/codelets/codelet_zlaswp.c b/runtime/starpu/codelets/codelet_zlaswp.c index 81c28d92f05d6c23e85e743b8402b79db31815b1..3829763abd896ca9db917a9d0573ac4d9b9b5255 100644 --- a/runtime/starpu/codelets/codelet_zlaswp.c +++ b/runtime/starpu/codelets/codelet_zlaswp.c @@ -13,7 +13,7 @@ * @author Mathieu Faverge * @author Matthieu Kuhn * @author Alycia Lisito - * @date 2024-11-12 + * @date 2025-03-24 * @precisions normal z -> c d s * */ @@ -48,11 +48,12 @@ CODELETS_CPU( zlaswp_get, cl_zlaswp_get_cpu_func ) #if defined(CHAMELEON_STARPU_USE_INSERT) void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, - int m0, int k, + cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *U, int Um, int Un ) { + void *ipiv_handle; struct starpu_codelet *codelet = &cl_zlaswp_get; if ( A->get_rankof( A, Am, An) != A->myrank ) { return; @@ -63,12 +64,18 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, clargs->m0 = m0; clargs->k = k; + if ( dir == ChamDirForward ) { + ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk ); + } + else { + ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk ); + } //void (*callback)(void*) = options->profiling ? 
cl_zlaswp_get_callback : NULL; rt_starpu_insert_task( codelet, STARPU_CL_ARGS, clargs, sizeof(struct cl_zlaswp_args_s), - STARPU_R, RUNTIME_perm_getaddr( ipiv, ipivk ), + STARPU_R, ipiv_handle, STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), STARPU_RW | STARPU_COMMUTE, RTBLKADDR(U, ChamComplexDouble, Um, Un), STARPU_PRIORITY, options->priority, @@ -80,18 +87,26 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, #else /* defined(CHAMELEON_STARPU_USE_INSERT) */ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, - int m0, int k, + cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *U, int Um, int Un ) { - int ret; + int ret; struct starpu_task *task; + void *ipiv_handle; if ( A->get_rankof( A, Am, An) != A->myrank ) { return; } + if ( dir == ChamDirForward ) { + ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk ); + } + else { + ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk ); + } + INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zlaswp_get, zlaswp_get, zlaswp, 3); /* @@ -99,8 +114,7 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, */ starpu_cham_exchange_init_params( options, &params, U->get_rankof( U, Um, Un ) ); starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs, - RUNTIME_perm_getaddr( ipiv, ipivk ), - STARPU_R ); + ipiv_handle, STARPU_R ); starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( A, ChamComplexDouble, Am, An ), STARPU_R ); starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( U, ChamComplexDouble, Um, Un ), STARPU_RW | STARPU_COMMUTE ); @@ -157,12 +171,14 @@ static void cl_zlaswp_set_cpu_func( void *descr[], void *cl_arg ) CODELETS_CPU( zlaswp_set, cl_zlaswp_set_cpu_func ) #if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, - int m0, int k, + cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B,
int Bm, int Bn ) { + void *ipiv_handle; struct starpu_codelet *codelet = &cl_zlaswp_set; if ( B->get_rankof( B, Bm, Bn) != A->myrank ) { return; @@ -173,12 +189,19 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, clargs->m0 = m0; clargs->k = k; + if ( dir == ChamDirForward ) { + ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk ); + } + else { + ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk ); + } + //void (*callback)(void*) = options->profiling ? cl_zlaswp_set_callback : NULL; rt_starpu_insert_task( codelet, STARPU_CL_ARGS, clargs, sizeof(struct cl_zlaswp_args_s), - STARPU_R, RUNTIME_invp_getaddr( ipiv, ipivk ), + STARPU_R, ipiv_handle, STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), STARPU_RW, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), STARPU_PRIORITY, options->priority, @@ -186,20 +209,30 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, STARPU_EXECUTE_ON_WORKER, options->workerid, 0 ); } -#else + +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ + void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, - int m0, int k, + cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) { - int ret; + int ret; struct starpu_task *task; + void *ipiv_handle; if ( B->get_rankof( B, Bm, Bn) != A->myrank ) { return; } + if ( dir == ChamDirForward ) { + ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk ); + } + else { + ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk ); + } + INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zlaswp_set, zlaswp_set, zlaswp, 3); /* @@ -207,8 +240,7 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, */ starpu_cham_exchange_init_params( options, &params, B->get_rankof( B, Bm, Bn ) ); starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs, - RUNTIME_invp_getaddr( ipiv, ipivk ), - STARPU_R ); + ipiv_handle, STARPU_R ); starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( A, ChamComplexDouble, Am, An ),
STARPU_R ); starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( B, ChamComplexDouble, Bm, Bn ), STARPU_RW ); @@ -242,4 +274,5 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, } starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); } -#endif +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ + diff --git a/runtime/starpu/codelets/codelet_zperm_allreduce.c b/runtime/starpu/codelets/codelet_zperm_allreduce.c index e32b7ad9c46a2303eb1c4c6a18d442935fca6d3a..a479056c5f9321b75cd89a99349fd1ef1c3f3976 100644 --- a/runtime/starpu/codelets/codelet_zperm_allreduce.c +++ b/runtime/starpu/codelets/codelet_zperm_allreduce.c @@ -12,7 +12,7 @@ * @version 1.3.0 * @author Alycia Lisito * @author Pierre Esterie - * @date 2024-11-14 + * @date 2025-03-24 * @precisions normal z -> c d s * */ @@ -21,6 +21,7 @@ #include <coreblas/cblas_wrapper.h> #if defined(CHAMELEON_USE_MPI) + struct cl_redux_args_s { int tempmm; int mb; @@ -91,6 +92,7 @@ INSERT_TASK_zperm_allreduce_send( const RUNTIME_option_t *options, static void INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_desc_t *U, CHAM_ipiv_t *ipiv, int ipivk, @@ -105,6 +107,15 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, int p_first ) { struct cl_redux_args_s *clargs; + void *ipiv_handle; + + if ( dir == ChamDirForward ) { + ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk ); + } + else { + ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk ); + } + clargs = malloc( sizeof( struct cl_redux_args_s ) ); clargs->tempmm = tempmm; clargs->mb = U->mb; @@ -121,7 +132,7 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, STARPU_CL_ARGS, clargs, sizeof(struct cl_redux_args_s), STARPU_RW, RTBLKADDR(U, CHAMELEON_Complex64_t, me, n), STARPU_R, RTBLKADDR(U, CHAMELEON_Complex64_t, src, n), - STARPU_R, RUNTIME_perm_getaddr( ipiv, ipivk ), + STARPU_R, ipiv_handle, STARPU_EXECUTE_ON_NODE, me, STARPU_EXECUTE_ON_WORKER, 
options->workerid, STARPU_PRIORITY, options->priority, @@ -151,6 +162,7 @@ INSERT_TASK_zperm_allreduce_send( const RUNTIME_option_t *options, static void INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_desc_t *U, CHAM_ipiv_t *ipiv, int ipivk, @@ -164,8 +176,16 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, int np, int p_first ) { - int ret; + int ret; struct starpu_task *task; + void *ipiv_handle; + + if ( dir == ChamDirForward ) { + ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk ); + } + else { + ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk ); + } INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zperm_allreduce_send, zperm_allreduce, redux, 3 ); @@ -176,7 +196,7 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs, RTBLKADDR( U, ChamComplexDouble, src, n ), STARPU_R ); - starpu_cham_register_descr( &nbdata, descrs, RUNTIME_perm_getaddr( ipiv, ipivk ), STARPU_R ); + starpu_cham_register_descr( &nbdata, descrs, ipiv_handle, STARPU_R ); task = starpu_task_create(); task->cl = cl; @@ -221,6 +241,7 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, static void zperm_allreduce_chameleon_starpu_task( const RUNTIME_option_t *options, + cham_dir_t dir, const CHAM_desc_t *A, CHAM_desc_t *U, int Um, @@ -229,10 +250,10 @@ zperm_allreduce_chameleon_starpu_task( const RUNTIME_option_t *options, int ipivk, int k, int n, - struct chameleon_pzgetrf_s *ws) + struct chameleon_pzgetrf_s *ws ) { int *proc_involved = ws->proc_involved; - int np_involved = chameleon_min( chameleon_desc_datadist_get_iparam(A, 0), A->mt - k); + int np_involved = chameleon_min( chameleon_desc_datadist_get_iparam(A, 0), A->mt - k ); int np_iter = np_involved; int p_recv, p_send, me, p_first; int shift = 1; @@ -253,9 +274,11 @@ zperm_allreduce_chameleon_starpu_task( const RUNTIME_option_t *options, p_recv = proc_involved[ ( me - shift +
np_involved ) % np_involved ]; INSERT_TASK_zperm_allreduce_send( options, U, A->myrank, p_send, n ); - INSERT_TASK_zperm_allreduce_recv( options, U, ipiv, ipivk, A->myrank, p_recv, + INSERT_TASK_zperm_allreduce_recv( options, dir, U, ipiv, ipivk, A->myrank, p_recv, n, k == (A->mt-1) ? A->m - k * A->mb : A->mb, - chameleon_desc_datadist_get_iparam(A, 0), chameleon_desc_datadist_get_iparam(A, 1), shift, np_involved, p_first ); + chameleon_desc_datadist_get_iparam(A, 0), + chameleon_desc_datadist_get_iparam(A, 1), + shift, np_involved, p_first ); shift = shift << 1; np_iter = chameleon_ceil( np_iter, 2 ); @@ -265,6 +288,7 @@ zperm_allreduce_chameleon_starpu_task( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + cham_dir_t dir, const CHAM_desc_t *A, CHAM_desc_t *U, int Um, @@ -280,7 +304,7 @@ INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, switch( alg ) { case ChamStarPUTasks: default: - zperm_allreduce_chameleon_starpu_task( options, A, U, Um, Un, ipiv, ipivk, k, n, tmp ); + zperm_allreduce_chameleon_starpu_task( options, dir, A, U, Um, Un, ipiv, ipivk, k, n, tmp ); } } @@ -307,33 +331,51 @@ INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, int myrank, int np, int *proc_involved ) { - int p; + int p; + void *ipiv_handle; + + if ( dir == ChamDirForward ) { + ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk ); + } + else { + ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk ); + } for ( p = 0; p < np; p++ ) { if ( proc_involved[ p ] == myrank ) { continue; } starpu_mpi_get_data_on_node_detached( options->sequence->comm, - RUNTIME_perm_getaddr( ipiv, ipivk ), + ipiv_handle, proc_involved[ p ], NULL, NULL ); } } void INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, int k, int 
n ) { - int b, rank; + int b, rank; + void *ipiv_handle; + + if ( dir == ChamDirForward ) { + ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk ); + } + else { + ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk ); + } for ( b = k+1; (b < A->mt) && ((b-(k+1)) < chameleon_desc_datadist_get_iparam(A, 0)); b ++ ) { rank = A->get_rankof( A, b, n ); @@ -341,10 +383,11 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, continue; } starpu_mpi_get_data_on_node_detached( options->sequence->comm, - RUNTIME_invp_getaddr( ipiv, ipivk ), + ipiv_handle, rank, NULL, NULL ); } } + #else void INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, @@ -353,7 +396,7 @@ INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, int An, int myrank, int np, - int *proc_involved ) + int *proc_involved ) { (void)options; (void)A; @@ -366,11 +409,12 @@ INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, int myrank, int np, - int *proc_involved ) + int *proc_involved ) { (void)options; (void)ipiv; @@ -382,6 +426,7 @@ INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, @@ -398,6 +443,7 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + cham_dir_t dir, const CHAM_desc_t *A, CHAM_desc_t *U, int Um, @@ -419,4 +465,5 @@ INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, (void)n; (void)ws; } + #endif diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt index 81268734d901cbdf99cf27592207b53046423bf6..2221ee12853a7d47c04541f68dea65e3d39b953c 100644 --- a/testing/CMakeLists.txt +++ b/testing/CMakeLists.txt @@ -26,7 +26,8 @@ # @author Alycia 
Lisito # @author Matthieu Kuhn # @author Abel Calluaud -# @date 2025-01-24 +# @author Matteo Marcos +# @date 2025-03-24 # ### @@ -52,6 +53,8 @@ set(ZSRC_W_STDAPI testing_zlantr.c testing_zgemm.c testing_zgetrf.c + testing_zgesv.c + testing_zgetrs.c testing_zhemm.c testing_zherk.c testing_zher2k.c @@ -81,6 +84,7 @@ set(ZSRC_WO_STDAPI testing_zgenm2.c testing_zgesv_nopiv.c testing_zgesvd.c + testing_zlaswp.c testing_zgetrf_nopiv.c testing_zgetrs_nopiv.c testing_zgeqrf.c diff --git a/testing/CTestLists.cmake b/testing/CTestLists.cmake index 39b7e89e04daf060dcacc87d17b851ee83e3191d..297d2628a1d114bc9ec2472b1caf2ad381fa442c 100644 --- a/testing/CTestLists.cmake +++ b/testing/CTestLists.cmake @@ -110,8 +110,13 @@ if (NOT CHAMELEON_SIMULATION) add_test( test_${cat}_${prec}getrf_ppivblocked_batch ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf.in ) set_tests_properties( test_${cat}_${prec}getrf_ppivblocked_batch PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=3" ) - + add_test( test_${cat}_${prec}laswp ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/laswp.in ) + add_test( test_${cat}_${prec}getrs ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrs.in ) + add_test( test_${cat}_${prec}gesv ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/gesv.in ) if ( ${cat} STREQUAL "mpi" ) + add_test( test_${cat}_${prec}laswp_ppiv_comm_with_task ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P ${NP} -f input/laswp.in ) + add_test( test_${cat}_${prec}getrs_ppiv_comm_with_task ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P ${NP} -f input/getrs.in ) + add_test( test_${cat}_${prec}gesv_ppiv_comm_with_task ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P ${NP} -f input/gesv.in ) add_test( test_${cat}_${prec}getrf_ppiv_comm_with_task ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P ${NP} -f input/getrf.in ) set_tests_properties( test_${cat}_${prec}getrf_ppiv_comm_with_task PROPERTIES ENVIRONMENT 
"CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=0;CHAMELEON_GETRF_ALL_REDUCE=cham_spu_tasks" ) diff --git a/testing/chameleon_ztesting.c b/testing/chameleon_ztesting.c index 52f552a7c69d947b4fb25a336603bb8f4cebbc84..979abaf12ef505f892758a130a7d79da7252172d 100644 --- a/testing/chameleon_ztesting.c +++ b/testing/chameleon_ztesting.c @@ -22,7 +22,8 @@ * @author Lucas Nesi * @author Matthieu Kuhn * @author Lionel Eyraud-Dubois - * @date 2025-01-15 + * @author Matteo Marcos + * @date 2025-03-24 * @precisions normal z -> c d s * */ @@ -111,10 +112,11 @@ parameter_t parameters[] = { { "trans", "Value of the trans parameter ('ConjTrans', 'Trans', 'NoTrans')", -11, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 9, TestTrans, {0}, NULL, pread_trans, sprint_trans }, { "transA", "Value of the transA parameter ('ConjTrans', 'Trans', 'NoTrans')", -12, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 9, TestTrans, {0}, NULL, pread_trans, sprint_trans }, { "transB", "Value of the transB parameter ('ConjTrans', 'Trans', 'NoTrans')", -13, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 9, TestTrans, {0}, NULL, pread_trans, sprint_trans }, - { "uplo", "Value of the uplo parameter ('Upper', 'Lower', 'UpperLower')", -14, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 7, TestUplo, {0}, NULL, pread_uplo, sprint_uplo }, - { "diag", "Value of the diag parameter ('NonUnit', 'Unit')", -15, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 7, TestDiag, {0}, NULL, pread_diag, sprint_diag }, - { "side", "Value of the side parameter ('Left', 'Right')", -16, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 5, TestSide, {0}, NULL, pread_side, sprint_side }, - { "norm", "Value of the norm parameter ('One', 'Frobenius', 'Inf', 'Max')", -17, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 4, TestNormtype, {0}, NULL, pread_norm, sprint_norm }, + { "uplo", "Value of the uplo parameter ('Upper', 'Lower', 'UpperLower')", -14, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 7, TestUplo, {0}, NULL, pread_uplo, 
sprint_uplo }, + { "diag", "Value of the diag parameter ('NonUnit', 'Unit')", -15, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 7, TestDiag, {0}, NULL, pread_diag, sprint_diag }, + { "side", "Value of the side parameter ('Left', 'Right')", -16, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 5, TestSide, {0}, NULL, pread_side, sprint_side }, + { "norm", "Value of the norm parameter ('One', 'Frobenius', 'Inf', 'Max')", -17, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 4, TestNormtype, {0}, NULL, pread_norm, sprint_norm }, + { "dir", "Value of the dir parameter ('Forward', 'Backward')", -18, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 7, TestDir, {0}, NULL, pread_dir, sprint_dir }, { NULL, "Operation specific scalar", 0, PARAM_OPTION, 0, 0, 0, {0}, NULL, NULL, NULL }, { "alpha", "Value of the scalar alpha", 'x', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 13, TestValComplex64, {0}, NULL, pread_complex64, sprint_complex64 }, @@ -131,6 +133,10 @@ parameter_t parameters[] = { { NULL, "SVD parameters", 0, PARAM_OPTION, 0, 0, 0, {0}, NULL, NULL, NULL }, { "jobu", "Value of the jobu parameter ('NoVec', 'Vec', 'Ivec', 'AllVec', 'SVec', 'OVec')", -50, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 4, TestJob, {0}, NULL, pread_job, sprint_job }, { "jobvt", "Value of the jobvt parameter ('NoVec', 'Vec', 'Ivec', 'AllVec', 'SVec', 'OVec')", -51, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 5, TestJob, {0}, NULL, pread_job, sprint_job }, + + { NULL, "LASWP parameters", 0, PARAM_OPTION, 0, 0, 0, {0}, NULL, NULL, NULL }, + { "k1", "Index of the first element to permute", -70, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 3, TestValInt, {0}, NULL, pread_int, sprint_int }, + { "k2", "Index of the last element to permute", -71, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 3, TestValInt, {0}, NULL, pread_int, sprint_int }, #endif { "tsub", "Graph submission time in s", 999, PARAM_OUTPUT, 2, 13, TestValFixdbl, {0}, NULL, pread_fixdbl, sprint_fixdbl }, diff --git 
a/testing/input/gesv.in b/testing/input/gesv.in new file mode 100644 index 0000000000000000000000000000000000000000..95ecf582575d4956ad8732e2b8fe54a923f90bea --- /dev/null +++ b/testing/input/gesv.in @@ -0,0 +1,20 @@ +# You can enumerate each parameter's values as an explicit list separated by commas or by a range start:end[:step] +# Not given parameters will receive default values + +# GESV + +# nb: Tile size +# ib: Inner tile size +# n: Order of the matrix A and number of rows of matrix B +# nrhs: The number of columns of matrix B +# lda: Leading dimension of matrix A +# ldb: Leading dimension of matrix B + +op = gesv +nb = 4, 16, 17 +ib = 4, 12, 50 +n = 15, 21, 35 +nrhs = 1, 13, 22, 33 +lda = 40 +ldb = 41 + diff --git a/testing/input/getrs.in b/testing/input/getrs.in new file mode 100644 index 0000000000000000000000000000000000000000..9714143c4ddb4a953f2a9e756715a4ffe44b8735 --- /dev/null +++ b/testing/input/getrs.in @@ -0,0 +1,18 @@ +# You can enumerate each parameter's values as an explicit list separated by commas or by a range start:end[:step] +# Not given parameters will receive default values + +# GETRS + +# nb: Tile size +# n: Order of the matrix A and number of rows of matrix B +# nrhs: The number of columns of matrix B +# lda: Leading dimension of matrix A +# ldb: Leading dimension of matrix B + +op = getrs +nb = 16, 17 +ib = 16, 17 +n = 15, 21, 35 +nrhs = 1, 13, 22, 33 +lda = 40 +ldb = 41 diff --git a/testing/input/laswp.in b/testing/input/laswp.in new file mode 100644 index 0000000000000000000000000000000000000000..41037f5e5cd0b79357b2d0df8beafc6e30c2b2a8 --- /dev/null +++ b/testing/input/laswp.in @@ -0,0 +1,20 @@ +# You can enumerate each parameter's values as an explicit list separated by commas or by a range start:end[:step] +# Not given parameters will receive default values + +# LASWP + +# nb: Tile size +# n: Order of the matrix A +# lda: Leading dimension of matrix A +# k1: First element of ipiv to apply the permutation. 
+# k2: Last element of ipiv to apply the permutation. +# dir: Specifies the order of the permutation. + +op = laswp +nb = 4, 16, 17 +n = 15, 21, 35 +lda = 40 +k1 = 1, 2, 10 +k2 = 1, 2, 10 +dir = Forward, Backward + diff --git a/testing/run_list.c b/testing/run_list.c index a6900e88e637536a16d1cb65664195f9f62a2d07..a8fefce450a464200d804f20123f401b79bcf364 100644 --- a/testing/run_list.c +++ b/testing/run_list.c @@ -13,7 +13,8 @@ * @author Mathieu Faverge * @author Philippe Swartvagher * @author Alycia Lisito - * @date 2024-02-18 + * @author Matteo Marcos + * @date 2025-03-24 * */ #include "testings.h" @@ -442,6 +443,32 @@ run_arg_get_side( run_arg_list_t *arglist, const char *name, cham_side_t defval return rval.side; } +/** + * @brief Searches for a cham_dir_t value by its name. + * + * @param[inout] arglist + * The list of arguments. + * On exit, if the argument was not in the list, the default value is + * stored in it. + * + * @param[in] name + * The name of the argument to look for. + * + * @param[in] defval + * The default value if no argument is found with this name. This value + * is added to the list if not found. + * + * @retval The value of the argument _name_. + */ +cham_dir_t +run_arg_get_dir( run_arg_list_t *arglist, const char *name, cham_dir_t defval ) +{ + val_t val, rval; + val.dir = defval; + rval = run_arg_get( arglist, name, val ); + return rval.dir; +} + /** * @brief Searches for a cham_job_t value by its name. * diff --git a/testing/testing_zgesv.c b/testing/testing_zgesv.c new file mode 100644 index 0000000000000000000000000000000000000000..ac5ffe62194ecafeb589c215db6a83efd5c1f564 --- /dev/null +++ b/testing/testing_zgesv.c @@ -0,0 +1,256 @@ +/** + * + * @file testing_zgesv.c + * + * @copyright 2025-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. 
+ * + *** + * + * @brief Chameleon zgesv testing + * + * @version 1.3.0 + * @author Lucas Barros de Assis + * @author Mathieu Faverge + * @author Alycia Lisito + * @author Matteo Marcos + * @date 2025-03-24 + * @precisions normal z -> c d s + * + */ +#include "testings.h" +#include "chameleon/chameleon_z.h" +#include "testing_zcheck.h" +#include <chameleon/flops.h> +#include <chameleon/getenv.h> +#include <coreblas/lapacke.h> + +static cham_fixdbl_t +flops_zgesv( int N, int NRHS ) +{ + cham_fixdbl_t flops = flops_zgetrf( N, N ) + flops_zgetrs( N, NRHS ); + return flops; +} + +#if !defined(CHAMELEON_TESTINGS_VENDOR) +int +testing_zgesv_desc( run_arg_list_t *args, int check ) +{ + testdata_t test_data = { .args = args }; + int hres = 0; + + /* Read arguments */ + int async = parameters_getvalue_int( "async" ); + int nb = run_arg_get_nb( args ); + int ib = run_arg_get_ib( args ); + int N = run_arg_get_int( args, "N", 1000 ); + int NRHS = run_arg_get_int( args, "NRHS", 1 ); + int LDA = run_arg_get_int( args, "LDA", N ); + int LDB = run_arg_get_int( args, "LDB", N ); + int seedA = run_arg_get_int( args, "seedA", testing_ialea() ); + int seedB = run_arg_get_int( args, "seedB", testing_ialea() ); + + /* Descriptors */ + CHAM_desc_t *descA, *descX; + CHAM_ipiv_t *descIPIV; + void *wsA = NULL; + void *wsB = NULL; + + CHAMELEON_Set( CHAMELEON_TILE_SIZE, nb ); + CHAMELEON_Set( CHAMELEON_INNER_BLOCK_SIZE, ib ); + + /* Creates the matrices */ + parameters_desc_create( "A", &descA, ChamComplexDouble, nb, nb, LDA, N, N, N ); + parameters_desc_create( "X", &descX, ChamComplexDouble, nb, nb, LDB, NRHS, N, NRHS ); + CHAMELEON_Ipiv_Create( &descIPIV, descA, NULL ); + + /* Fills the matrix with random values */ + CHAMELEON_zplrnt_Tile( descA, seedA ); + CHAMELEON_zplrnt_Tile( descX, seedB ); + + if ( async ) { + wsA = CHAMELEON_zgetrf_WS_Alloc( descA ); + wsB = CHAMELEON_zgetrf_WS_Alloc( descX ); + } + + /* Calculates the solution */ + testing_start( &test_data ); + if ( async ) { + 
hres = CHAMELEON_zgesv_Tile_Async( descA, descIPIV, descX, wsA, wsB, + test_data.sequence, &test_data.request ); + CHAMELEON_Desc_Flush( descA, test_data.sequence ); + CHAMELEON_Desc_Flush( descX, test_data.sequence ); + CHAMELEON_Ipiv_Flush( descIPIV, test_data.sequence ); + } + else { + hres = CHAMELEON_zgesv_Tile( descA, descIPIV, descX ); + } + test_data.hres = hres; + testing_stop( &test_data, flops_zgesv( N, NRHS ) ); + + if ( async ) { + CHAMELEON_zgetrf_WS_Free( wsA ); + CHAMELEON_zgetrf_WS_Free( wsB ); + } + + /* Checks the factorisation and the residual */ + if ( check ) { + CHAM_desc_t *descA0, *descB; + + /* Check the factorization */ + descA0 = CHAMELEON_Desc_Copy( descA, CHAMELEON_MAT_ALLOC_TILE ); + CHAMELEON_zplrnt_Tile( descA0, seedA ); + + CHAMELEON_zlaswp_Tile( ChamLeft, ChamDirForward, descA0, 1, N, descIPIV ); + + hres += check_zxxtrf( args, ChamGeneral, ChamUpperLower, descA0, descA ); + + if ( hres ) { + CHAMELEON_Desc_Destroy( &descA0 ); + CHAMELEON_Ipiv_Destroy( &descIPIV, descA ); + parameters_desc_destroy( &descA ); + parameters_desc_destroy( &descX ); + return hres; + } + + /* Check the solve */ + descB = CHAMELEON_Desc_Copy( descX, CHAMELEON_MAT_ALLOC_TILE ); + CHAMELEON_zplrnt_Tile( descB, seedB ); + + CHAMELEON_zplrnt_Tile( descA0, seedA ); + hres += check_zsolve( args, ChamGeneral, ChamNoTrans, ChamUpperLower, descA0, descX, descB ); + + CHAMELEON_Desc_Destroy( &descA0 ); + CHAMELEON_Desc_Destroy( &descB ); + } + + CHAMELEON_Ipiv_Destroy( &descIPIV, descA ); + parameters_desc_destroy( &descA ); + parameters_desc_destroy( &descX ); + + return hres; +} +#endif + +int +testing_zgesv_std( run_arg_list_t *args, int check ) +{ + testdata_t test_data = { .args = args }; + int hres = 0; + + /* Read arguments */ +#if !defined(CHAMELEON_TESTINGS_VENDOR) + int api = parameters_getvalue_int( "api" ); +#endif + int nb = run_arg_get_nb( args ); + int ib = run_arg_get_ib( args ); + int N = run_arg_get_int( args, "N", 1000 ); + int NRHS = 
run_arg_get_int( args, "NRHS", 1 ); + int LDA = run_arg_get_int( args, "LDA", N ); + int LDB = run_arg_get_int( args, "LDB", N ); + int seedA = run_arg_get_int( args, "seedA", testing_ialea() ); + int seedB = run_arg_get_int( args, "seedB", testing_ialea() ); + + /* Descriptors */ + CHAMELEON_Complex64_t *A, *X; + int *IPIV; + + CHAMELEON_Set( CHAMELEON_TILE_SIZE, nb ); + CHAMELEON_Set( CHAMELEON_INNER_BLOCK_SIZE, ib ); + + /* Creates the matrices */ + A = malloc( sizeof(CHAMELEON_Complex64_t) * LDA*N ); + X = malloc( sizeof(CHAMELEON_Complex64_t) * LDB*NRHS ); + IPIV = malloc( sizeof(int) * N ); + + /* Fills the matrix with random values */ + CHAMELEON_zplrnt( N, N, A, LDA, seedA ); + CHAMELEON_zplrnt( N, NRHS, X, LDB, seedB ); + + /* Calculates the solution */ +#if defined(CHAMELEON_TESTINGS_VENDOR) + testing_start( &test_data ); + hres = LAPACKE_zgesv( LAPACK_COL_MAJOR, N, NRHS, A, LDA, IPIV, X, LDB ); + test_data.hres = hres; + testing_stop( &test_data, flops_zgesv( N, NRHS ) ); +#else + testing_start( &test_data ); + switch ( api ) { + case 1: + hres = CHAMELEON_zgesv( N, NRHS, A, LDA, IPIV, X, LDB ); + break; +#if !defined(CHAMELEON_SIMULATION) && 0 + case 2: + CHAMELEON_lapacke_zgesv( CblasColMajor, chameleon_lapack_const(uplo), N, NRHS, A, LDA, X, LDB ); + break; +#endif + default: + if ( CHAMELEON_Comm_rank() == 0 ) { + fprintf( stderr, + "SKIPPED: This function can only be used with the option --api 1 or --api 2.\n" ); + } + return -1; + } + test_data.hres = hres; + testing_stop( &test_data, flops_zgesv( N, NRHS ) ); + + /* Checks the factorisation and residual */ + if ( check ) { + CHAMELEON_Complex64_t *A0 = malloc( sizeof(CHAMELEON_Complex64_t) * LDA*N ); + CHAMELEON_Complex64_t *B = malloc( sizeof(CHAMELEON_Complex64_t) * LDB*NRHS ); + + /* Check the factorization */ + CHAMELEON_zplrnt( N, N, A0, LDA, seedA ); + CHAMELEON_zlaswp( ChamLeft, ChamDirForward, N, N, A0, LDA, 1, N, IPIV ); + + hres += check_zxxtrf_std( args, ChamGeneral, ChamUpperLower, N, 
N, A0, A, LDA ); + + /* Check the solve */ + CHAMELEON_zplrnt( N, N, A0, LDA, seedA ); + CHAMELEON_zplrnt( N, NRHS, B, LDB, seedB ); + hres += check_zsolve_std( args, ChamGeneral, ChamNoTrans, ChamUpperLower, N, NRHS, A0, LDA, X, B, LDB ); + + free( A0 ); + free( B ); + } +#endif + + free( A ); + free( X ); + + (void)check; + return hres; +} + +testing_t test_zgesv; +#if defined(CHAMELEON_TESTINGS_VENDOR) +const char *zgesv_params[] = { "n", "nrhs", "lda", "ldb", "seedA", "seedB", NULL }; +#else +const char *zgesv_params[] = { "mtxfmt", "nb", "ib", "n", "nrhs", "lda", "ldb", "seedA", "seedB", NULL }; +#endif +const char *zgesv_output[] = { NULL }; +const char *zgesv_outchk[] = { "RETURN", NULL }; + +/** + * @brief Testing registration function + */ +void testing_zgesv_init( void ) __attribute__( ( constructor ) ); +void +testing_zgesv_init( void ) +{ + test_zgesv.name = "zgesv"; + test_zgesv.helper = "General linear system solve (LU with partial pivoting)"; + test_zgesv.params = zgesv_params; + test_zgesv.output = zgesv_output; + test_zgesv.outchk = zgesv_outchk; +#if defined(CHAMELEON_TESTINGS_VENDOR) + test_zgesv.fptr_desc = NULL; +#else + test_zgesv.fptr_desc = testing_zgesv_desc; +#endif + test_zgesv.fptr_std = testing_zgesv_std; + test_zgesv.next = NULL; + + testing_register( &test_zgesv ); +} + diff --git a/testing/testing_zgetrf.c b/testing/testing_zgetrf.c index 1db2d0030f335ea521bc12bf22ae37a830c920cc..5e489b7f7c1e1f8c66e3c9f7ba86f3308c3fb75e 100644 --- a/testing/testing_zgetrf.c +++ b/testing/testing_zgetrf.c @@ -17,7 +17,8 @@ * @author Lionel Eyraud-Dubois * @author Xavier Lacoste * @author Florent Pruvost - * @date 2025-01-29 + * @author Matteo Marcos + * @date 2025-03-24 * @precisions normal z -> c d s * */ @@ -106,47 +107,24 @@ testing_zgetrf_desc( run_arg_list_t *args, int check ) testing_stop( &test_data, flops_zgetrf( M, N ) ); /* Checks the factorization and residual */ -#if !defined(CHAMELEON_SIMULATION) if ( check ) { - CHAM_desc_t *descA0c; 
CHAM_desc_t *descA0 = CHAMELEON_Desc_Copy( descA, CHAMELEON_MAT_ALLOC_TILE ); - /* Create A0c as local to rank 0 on all nodes to gather the matrix */ - CHAMELEON_Desc_Create_User( - &descA0c, (void*)CHAMELEON_MAT_ALLOC_GLOBAL, ChamComplexDouble, - nb, nb, nb*nb, M, N, 0, 0, M, N, 1, 1, - chameleon_getaddr_cm, chameleon_getblkldd_cm, NULL, NULL ); - if ( diag == ChamUnit ) { - CHAMELEON_zplgtr_Tile( 0, ChamUpper, descA0c, seedA ); - CHAMELEON_zplgtr_Tile( minMN, ChamLower, descA0c, seedA+1 ); - } - else { - CHAMELEON_zplrnt_Tile( descA0c, seedA ); - } - - /* Compute the permutation of A0: P * A0 */ - if ( CHAMELEON_Comm_rank() == 0 ) { - int *ipiv; - - ipiv = malloc( sizeof(int) * minMN ); - CHAMELEON_Ipiv_Gather( descIPIV, ipiv, 0 ); - LAPACKE_zlaswp( LAPACK_COL_MAJOR, N, descA0c->mat, M, 1, minMN, ipiv, 1 ); - free( ipiv ); + CHAMELEON_zplgtr_Tile( 0, ChamUpper, descA0, seedA ); + CHAMELEON_zplgtr_Tile( minMN, ChamLower, descA0, seedA+1 ); } else { - CHAMELEON_Ipiv_Gather( descIPIV, NULL, 0 ); + CHAMELEON_zplrnt_Tile( descA0, seedA ); } - CHAMELEON_zlacpy_Tile( ChamUpperLower, descA0c, descA0 ); - CHAMELEON_Desc_Destroy( &descA0c ); + CHAMELEON_zlaswp_Tile( ChamLeft, ChamDirForward, descA0, 1, descA0->m, descIPIV ); hres += check_zxxtrf( args, ChamGeneral, ChamUpperLower, descA0, descA ); CHAMELEON_Desc_Destroy( &descA0 ); } -#endif /* !defined(CHAMELEON_SIMULATION) */ if ( ws != NULL ) { CHAMELEON_zgetrf_WS_Free( ws ); @@ -223,7 +201,7 @@ testing_zgetrf_std( run_arg_list_t *args, int check ) CHAMELEON_zplrnt( M, N, A0, LDA, seedA ); /* Compute the permutation of A0: P * A0 */ - LAPACKE_zlaswp( LAPACK_COL_MAJOR, N, A0, M, 1, minMN, IPIV, 1 ); + CHAMELEON_zlaswp( ChamLeft, ChamDirForward, M, N, A0, LDA, 1, minMN, IPIV ); hres += check_zxxtrf_std( args, ChamGeneral, ChamUpperLower, M, N, A0, A, LDA ); diff --git a/testing/testing_zgetrs.c b/testing/testing_zgetrs.c new file mode 100644 index
0000000000000000000000000000000000000000..4a3713be3db61d974b151d1100edb63dac512495 --- /dev/null +++ b/testing/testing_zgetrs.c @@ -0,0 +1,230 @@ +/** + * + * @file testing_zgetrs.c + * + * @copyright 2019-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgetrs testing + * + * @version 1.3.0 + * @author Matteo Marcos + * @date 2025-03-24 + * @precisions normal z -> c d s + * + */ +#include <chameleon.h> +#include <chameleon_lapack.h> +#include "chameleon/chameleon_z.h" +#include "testings.h" +#include "testing_zcheck.h" +#include <chameleon/flops.h> +#include <chameleon/getenv.h> +#if defined(CHAMELEON_TESTINGS_VENDOR) || !defined(CHAMELEON_SIMULATION) +#include <coreblas.h> +#include <coreblas/lapacke.h> +#endif + +#if !defined(CHAMELEON_TESTINGS_VENDOR) +int +testing_zgetrs_desc( run_arg_list_t *args, int check ) +{ + testdata_t test_data = { .args = args }; + int hres = 0; + + /* Read arguments */ + int async = parameters_getvalue_int( "async" ); + int nb = run_arg_get_nb( args ); + int ib = run_arg_get_ib( args ); + int N = run_arg_get_int( args, "N", 1000 ); + int NRHS = run_arg_get_int( args, "NRHS", 1 ); + int LDA = run_arg_get_int( args, "LDA", N ); + int LDB = run_arg_get_int( args, "LDB", N ); + int seedA = run_arg_get_int( args, "seedA", testing_ialea() ); + int seedB = run_arg_get_int( args, "seedB", testing_ialea() ); + + /* Descriptors */ + CHAM_desc_t *descA, *descX; + CHAM_ipiv_t *descIPIV; + void *ws = NULL; + + CHAMELEON_Set( CHAMELEON_TILE_SIZE, nb ); + CHAMELEON_Set( CHAMELEON_INNER_BLOCK_SIZE, ib ); + + /* Creates the matrices */ + parameters_desc_create( "A", &descA, ChamComplexDouble, nb, nb, LDA, N, N, N ); + parameters_desc_create( "X", &descX, ChamComplexDouble, nb, nb, LDB, NRHS, N, NRHS ); + CHAMELEON_Ipiv_Create( &descIPIV, descA, NULL ); + + CHAMELEON_zplrnt_Tile( descA, seedA ); + CHAMELEON_zplrnt_Tile( descX, seedB ); + + CHAMELEON_zgetrf_Tile( descA,
descIPIV ); + + if ( async ) { + ws = CHAMELEON_zgetrf_WS_Alloc( descX ); + } + + /* Calculates the solution */ + testing_start( &test_data ); + if ( async ) { + hres = CHAMELEON_zgetrs_Tile_Async( ChamNoTrans, descA, descIPIV, descX, ws, test_data.sequence, &test_data.request ); + CHAMELEON_Desc_Flush( descA, test_data.sequence ); + CHAMELEON_Ipiv_Flush( descIPIV, test_data.sequence ); + } + else { + hres = CHAMELEON_zgetrs_Tile( ChamNoTrans, descA, descIPIV, descX ); + } + test_data.hres = hres; + testing_stop( &test_data, flops_zgetrs( N, NRHS ) ); + + /* Checks the factorization and residual */ +#if !defined(CHAMELEON_SIMULATION) + if ( check ) { + CHAM_desc_t *descA0, *descB; + + descA0 = CHAMELEON_Desc_Copy( descA, CHAMELEON_MAT_ALLOC_TILE ); + descB = CHAMELEON_Desc_Copy( descX, CHAMELEON_MAT_ALLOC_TILE ); + + CHAMELEON_zplrnt_Tile( descA0, seedA ); + CHAMELEON_zplrnt_Tile( descB, seedB ); + + hres += check_zsolve( args, ChamGeneral, ChamNoTrans, ChamUpperLower, descA0, descX, descB ); + + CHAMELEON_Desc_Destroy( &descA0 ); + CHAMELEON_Desc_Destroy( &descB ); + } +#endif /* !defined(CHAMELEON_SIMULATION) */ + + if ( ws != NULL ) { + CHAMELEON_zgetrf_WS_Free( ws ); + } + + CHAMELEON_Ipiv_Destroy( &descIPIV, descA ); + parameters_desc_destroy( &descA ); + parameters_desc_destroy( &descX ); + + return hres; +} +#endif + +int +testing_zgetrs_std( run_arg_list_t *args, int check ) +{ + testdata_t test_data = { .args = args }; + int hres = 0; + + /* Read arguments */ +#if !defined(CHAMELEON_TESTINGS_VENDOR) + int api = parameters_getvalue_int( "api" ); +#endif + int nb = run_arg_get_nb( args ); + int N = run_arg_get_int( args, "N", 1000 ); + int NRHS = run_arg_get_int( args, "NRHS", 1 ); + int LDA = run_arg_get_int( args, "LDA", N ); + int LDB = run_arg_get_int( args, "LDB", N ); + int seedA = run_arg_get_int( args, "seedA", testing_ialea() ); + int seedB = run_arg_get_int( args, "seedB", testing_ialea() ); + + /* Descriptors */ + CHAMELEON_Complex64_t *A, *X; + 
int *IPIV; + + CHAMELEON_Set( CHAMELEON_TILE_SIZE, nb ); + + /* Creates the matrices */ + A = malloc( sizeof(CHAMELEON_Complex64_t) * LDA*N ); + X = malloc( sizeof(CHAMELEON_Complex64_t) * LDB*NRHS ); + IPIV = malloc( sizeof(int) * N ); + + /* Fills the matrix with random values */ + CHAMELEON_zplrnt( N, N, A, LDA, seedA ); + CHAMELEON_zplrnt( N, NRHS, X, LDB, seedB ); + + CHAMELEON_zgetrf( N, N, A, LDA, IPIV ); + + /* Calculates the solution */ +#if defined(CHAMELEON_TESTINGS_VENDOR) + testing_start( &test_data ); + hres = LAPACKE_zgetrs( LAPACK_COL_MAJOR, 'N', N, NRHS, A, LDA, IPIV, X, LDB ); + test_data.hres = hres; + testing_stop( &test_data, flops_zgetrs( N, NRHS ) ); +#else + testing_start( &test_data ); + switch ( api ) { + case 1: + hres = CHAMELEON_zgetrs( ChamNoTrans, N, NRHS, A, LDA, IPIV, X, LDB); + break; +#if !defined(CHAMELEON_SIMULATION) & 0 + case 2: + CHAMELEON_lapacke_zgetrs( CblasColMajor, N, NRHS, A, LDA, IPIV, B, LDB ); + break; +#endif + default: + if ( CHAMELEON_Comm_rank() == 0 ) { + fprintf( stderr, + "SKIPPED: This function can only be used with the option --api 1 or --api 2.\n" ); + } + return -1; + } + test_data.hres = hres; + testing_stop( &test_data, flops_zgetrs( N, NRHS ) ); + +#if !defined(CHAMELEON_SIMULATION) + /* Checks the factorisation and residue */ + if ( check ) { + CHAMELEON_Complex64_t *A0 = malloc( sizeof(CHAMELEON_Complex64_t) * LDA*N ); + CHAMELEON_Complex64_t *B = malloc( sizeof(CHAMELEON_Complex64_t) * LDB*NRHS ); + + CHAMELEON_zplrnt( N, N, A0, LDA, seedA ); + CHAMELEON_zplrnt( N, NRHS, B, LDB, seedB ); + + hres += check_zsolve_std( args, ChamGeneral, ChamNoTrans, ChamUpperLower, N, NRHS, A0, LDA, X, B, LDB ); + + free( A0 ); + free( B ); + } +#endif +#endif + + free ( IPIV ); + free( A ); + free( X ); + + (void)check; + return hres; +} + +testing_t test_zgetrs; +#if defined(CHAMELEON_TESTINGS_VENDOR) +const char *zgetrs_params[] = { "m", "n", "lda", "seedA", NULL }; +#else +const char *zgetrs_params[] = { "mtxfmt", 
"nb", "ib", "n", "nrhs", "lda", "ldb", "seedA", "seedB", NULL }; +#endif +const char *zgetrs_output[] = { NULL }; +const char *zgetrs_outchk[] = { "RETURN", NULL }; + +/** + * @brief Testing registration function + */ +void testing_zgetrs_init( void ) __attribute__( ( constructor ) ); +void +testing_zgetrs_init( void ) +{ + test_zgetrs.name = "zgetrs"; + test_zgetrs.helper = "General triangular solve (LU with partial pivoting)"; + test_zgetrs.params = zgetrs_params; + test_zgetrs.output = zgetrs_output; + test_zgetrs.outchk = zgetrs_outchk; +#if defined(CHAMELEON_TESTINGS_VENDOR) + test_zgetrs.fptr_desc = NULL; +#else + test_zgetrs.fptr_desc = testing_zgetrs_desc; +#endif + test_zgetrs.fptr_std = testing_zgetrs_std; + test_zgetrs.next = NULL; + + testing_register( &test_zgetrs ); +} diff --git a/testing/testing_zlaswp.c b/testing/testing_zlaswp.c new file mode 100644 index 0000000000000000000000000000000000000000..56c7a8a84afcacae9b143ecdfeb24369f9713668 --- /dev/null +++ b/testing/testing_zlaswp.c @@ -0,0 +1,141 @@ +/** + * + * @file testing_zlaswp.c + * + * @copyright 2025-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. 
+ * + *** + * + * @brief Chameleon zlaswp testing + * + * @version 1.3.0 + * @author Matteo Marcos + * @date 2025-03-24 + * @precisions normal z -> c d s + * + */ +#include "chameleon/constants.h" +#include "chameleon/struct.h" +#include "testings.h" +#include "chameleon/chameleon_z.h" +#include "testing_zcheck.h" +#include <chameleon/flops.h> +#include <chameleon/getenv.h> +#include <coreblas/lapacke.h> +#include <chameleon/tasks.h> + +static void testing_zlaswp_ipiv_gen( int *IPIV, + int M ) +{ + int i; + + for ( i = 0; i < M; i++ ) { + IPIV[i] = testing_ialea() % ( M - i ) + i + 1; /* 1-based value in the range i+1..M, provided testing_ialea() is non-negative - TODO confirm */ + } +} + +int +testing_zlaswp_desc( run_arg_list_t *args, int check ) +{ + testdata_t test_data = { .args = args }; + int hres = 0; + + /* Read arguments (M defaults to N, K1 to 1 and K2 to M) */ + int async = parameters_getvalue_int( "async" ); + cham_side_t side = run_arg_get_side( args, "side", ChamLeft ); + cham_dir_t dir = run_arg_get_dir( args, "dir", ChamDirForward ); + int nb = run_arg_get_nb( args ); + int N = run_arg_get_int( args, "N", 1000 ); + int M = run_arg_get_int( args, "M", N ); + int LDA = run_arg_get_int( args, "LDA", N ); /* NOTE(review): defaults to N while M is passed as the row count below - confirm M is not the intended default */ + int seedA = run_arg_get_int( args, "seedA", testing_ialea() ); + int K1 = run_arg_get_int( args, "K1", 1 ); + int K2 = run_arg_get_int( args, "K2", M ); + + int *IPIV = malloc( sizeof(int) * M ); + + /* Descriptors */ + CHAM_desc_t *descA; + CHAM_ipiv_t *descIPIV; + + CHAMELEON_Set( CHAMELEON_TILE_SIZE, nb ); + + /* Creates the matrix and fills it from seedA, then builds the pivot descriptor from the generated IPIV */ + parameters_desc_create( "A", &descA, ChamComplexDouble, nb, nb, LDA, N, M, N ); + CHAMELEON_zplrnt_Tile( descA, seedA ); + + testing_zlaswp_ipiv_gen( IPIV, M ); + CHAMELEON_Ipiv_Create( &descIPIV, descA, IPIV ); + CHAMELEON_Ipiv_Init( descA, descIPIV ); + + /* Applies the interchanges described by descIPIV to A (timed region) */ + testing_start( &test_data ); + if ( async ) { + hres = CHAMELEON_zlaswp_Tile_Async( side, dir, descA, K1, K2, descIPIV, test_data.sequence, &test_data.request ); + CHAMELEON_Desc_Flush( descA, test_data.sequence ); + CHAMELEON_Ipiv_Flush( descIPIV,
test_data.sequence ); + } + else { + hres = CHAMELEON_zlaswp_Tile( side, dir, descA, K1, K2, descIPIV ); + } + test_data.hres = hres; + testing_stop( &test_data, 0 ); /* the flops argument is 0 for this test */ + +#if !defined(CHAMELEON_SIMULATION) + if ( check ) { /* Reference: LAPACKE_zlaswp applied on rank 0 to a copy of A regenerated from seedA, then compared with descA */ + CHAM_desc_t *descA0, *descA0c; + int INCX = ( dir == ChamDirForward ) ? 1 : -1; + + descA0 = CHAMELEON_Desc_Copy( descA, CHAMELEON_MAT_ALLOC_TILE ); + + CHAMELEON_Desc_Create_User( + &descA0c, (void*)CHAMELEON_MAT_ALLOC_GLOBAL, ChamComplexDouble, + nb, nb, nb*nb, M, N, 0, 0, M, N, 1, 1, + chameleon_getaddr_cm, chameleon_getblkldd_cm, NULL, NULL ); + + CHAMELEON_zplrnt_Tile( descA0c, seedA ); + + if ( CHAMELEON_Comm_rank() == 0 ) { + LAPACKE_zlaswp( LAPACK_COL_MAJOR, N, descA0c->mat, M, K1, K2, IPIV, INCX ); /* NOTE(review): side is not passed to the reference call - confirm the check is valid for side values other than ChamLeft */ + } + + CHAMELEON_zlacpy_Tile( ChamUpperLower, descA0c, descA0 ); + CHAMELEON_Desc_Destroy( &descA0c ); + + hres += check_zmatrices( args, ChamUpperLower, descA, descA0 ); + + CHAMELEON_Desc_Destroy( &descA0 ); + } +#endif /* !defined(CHAMELEON_SIMULATION) */ + + CHAMELEON_Ipiv_Destroy( &descIPIV, descA ); + parameters_desc_destroy( &descA ); + free( IPIV ); + + return hres; +} + +testing_t test_zlaswp; +const char *zlaswp_params[] = { "mtxfmt", "nb", "n", "m", "lda", "seedA", "k1", "k2", "side", "dir", NULL }; +const char *zlaswp_output[] = { NULL }; +const char *zlaswp_outchk[] = { "RETURN", NULL }; + +/** + * @brief Registers the zlaswp test through testing_register; declared with the constructor attribute. Only the descriptor entry point is assigned here. + */ +void testing_zlaswp_init( void ) __attribute__( ( constructor ) ); +void +testing_zlaswp_init( void ) +{ + test_zlaswp.name = "zlaswp"; + test_zlaswp.helper = "Row interchange on general matrices"; + test_zlaswp.params = zlaswp_params; + test_zlaswp.output = zlaswp_output; + test_zlaswp.outchk = zlaswp_outchk; + test_zlaswp.fptr_desc = testing_zlaswp_desc; + test_zlaswp.next = NULL; + + testing_register( &test_zlaswp ); +} + diff --git a/testing/testing_zposv.c b/testing/testing_zposv.c index c2a739a9d263cc1c95e4a949bf93ac9718be443c..21237c75a5be31d75b45f46ccea89ff8bc3c42c4 100644 ---
a/testing/testing_zposv.c +++ b/testing/testing_zposv.c @@ -14,7 +14,7 @@ * @author Florent Pruvost * @author Mathieu Faverge * @author Alycia Lisito - * @date 2025-01-29 + * @date 2025-03-21 * @precisions normal z -> c d s * */ @@ -80,7 +80,7 @@ testing_zposv_desc( run_arg_list_t *args, int check ) test_data.hres = hres; testing_stop( &test_data, flops_zposv( N, NRHS ) ); - /* Checks the factorisation and residue */ + /* Checks the factorisation and the residual */ if ( check ) { CHAM_desc_t *descA0, *descB; @@ -167,7 +167,7 @@ testing_zposv_std( run_arg_list_t *args, int check ) test_data.hres = hres; testing_stop( &test_data, flops_zposv( N, NRHS ) ); - /* Checks the factorisation and residue */ + /* Checks the factorisation and residual */ if ( check ) { CHAMELEON_Complex64_t *A0 = malloc( sizeof(CHAMELEON_Complex64_t) * LDA*N ); CHAMELEON_Complex64_t *B = malloc( sizeof(CHAMELEON_Complex64_t) * LDB*NRHS ); diff --git a/testing/testings.h b/testing/testings.h index d341be66bbe4fe56964aea72fa2d4eeb4d8ec042..25283a6630e2c673cecda41380bbc32544b6085f 100644 --- a/testing/testings.h +++ b/testing/testings.h @@ -14,7 +14,8 @@ * @author Alycia Lisito * @author Florent Pruvost * @author Lionel Eyraud-Dubois - * @date 2024-03-21 + * @author Matteo Marcos + * @date 2025-03-24 * */ #ifndef _testings_h_ @@ -45,6 +46,7 @@ typedef enum valtype_ { TestUplo, TestDiag, TestSide, + TestDir, TestJob, TestNormtype, TestString, @@ -59,6 +61,7 @@ union val_u { cham_uplo_t uplo; cham_diag_t diag; cham_side_t side; + cham_dir_t dir; cham_job_t job; cham_normtype_t ntype; CHAMELEON_Complex64_t zval; @@ -168,6 +171,7 @@ val_t pread_trans ( const char *str ); val_t pread_uplo ( const char *str ); val_t pread_diag ( const char *str ); val_t pread_side ( const char *str ); +val_t pread_dir ( const char *str ); val_t pread_job ( const char *str ); val_t pread_norm ( const char *str ); val_t pread_string ( const char *str ); @@ -185,6 +189,7 @@ char *sprint_trans ( val_t val, int human, int
nbchar, char *str_in ); char *sprint_uplo ( val_t val, int human, int nbchar, char *str_in ); char *sprint_diag ( val_t val, int human, int nbchar, char *str_in ); char *sprint_side ( val_t val, int human, int nbchar, char *str_in ); +char *sprint_dir ( val_t val, int human, int nbchar, char *str_in ); char *sprint_job ( val_t val, int human, int nbchar, char *str_in ); char *sprint_norm ( val_t val, int human, int nbchar, char *str_in ); char *sprint_string ( val_t val, int human, int nbchar, char *str_in ); @@ -209,6 +214,7 @@ cham_trans_t run_arg_get_trans ( run_arg_list_t *arglist, const char cham_uplo_t run_arg_get_uplo ( run_arg_list_t *arglist, const char *name, cham_uplo_t defval ); cham_diag_t run_arg_get_diag ( run_arg_list_t *arglist, const char *name, cham_diag_t defval ); cham_side_t run_arg_get_side ( run_arg_list_t *arglist, const char *name, cham_side_t defval ); +cham_dir_t run_arg_get_dir ( run_arg_list_t *arglist, const char *name, cham_dir_t defval ); cham_job_t run_arg_get_job ( run_arg_list_t *arglist, const char *name, cham_job_t defval ); cham_normtype_t run_arg_get_ntype ( run_arg_list_t *arglist, const char *name, cham_normtype_t defval ); int run_arg_get_ib ( run_arg_list_t *arglist ); diff --git a/testing/values.c b/testing/values.c index 3f3ed0bbca83a1a82c6b5e660cf3705e2251d7c2..36e42b256569d32f5a3b9740cc784553b6527345 100644 --- a/testing/values.c +++ b/testing/values.c @@ -14,7 +14,8 @@ * @author Alycia Lisito * @author Florent Pruvost * @author Philippe Swartvagher - * @date 2024-08-28 + * @author Matteo Marcos + * @date 2025-03-24 * */ #include "testings.h" @@ -266,6 +267,41 @@ val_t pread_side( const char *str ) return val; } + +/** + * @brief Convert the input string to a cham_dir_t + * @param[in] str + * The input string: ChamDirForward, Forward, ChamDirBackward or Backward (compared with strcasecmp), otherwise parsed with atoi + * @return A val_t whose dir field holds the direction read.
+ */ +val_t pread_dir( const char *str ) +{ + val_t val; + val.dir = ChamDirForward; + + /* Keep in sync with help documentation in testing/{chameleon,vendor}_ztesting.c */ + if ( ( strcasecmp( "ChamDirForward", str ) == 0 ) || + ( strcasecmp( "Forward", str ) == 0 ) ) + { + val.dir = ChamDirForward; + } + else if ( ( strcasecmp( "ChamDirBackward", str ) == 0 ) || + ( strcasecmp( "Backward", str ) == 0 ) ) + { + val.dir = ChamDirBackward; + } + else { + int v = atoi( str ); /* atoi reports no error: text that is not a number is read as 0 */ + if ( ( v == ChamDirForward ) || ( v == ( ChamDirForward - ChamDirBackward ) ) ) { /* NOTE(review): the second comparison uses the difference of the two enum values - confirm this is the intended numeric alias for Forward */ + val.dir = ChamDirForward; + } + else { + val.dir = ChamDirBackward; /* every other integer, including unparsable input unless it matches above, selects Backward */ + } + } + return val; +} + /** * @brief Convert the input string to a cham_job_t * @param[in] str @@ -582,6 +618,25 @@ char *sprint_side( val_t val, int human, int nbchar, char *str_in ) return str_in+rc; } +/** + * @brief Print a cham_dir_t value into a string + * @param[in] val, human, nbchar + * val.dir is printed as Forward or Backward, left-justified in nbchar columns, when human is non-zero, and as a semicolon followed by its integer value otherwise; the text is written at str_in + * @return Pointer just past the characters written at str_in. + */ +char *sprint_dir( val_t val, int human, int nbchar, char *str_in ) +{ + int rc; + if ( human ) { + rc = sprintf( str_in, " %-*s", nbchar, + (val.dir == ChamDirForward) ? "Forward" : "Backward" ); + } + else { + rc = sprintf( str_in, ";%d", val.dir ); + } + return str_in+rc; +} + /** * @brief Convert the input string to a cham_job_t * @param[in] str