diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c index 1060432565ee5d581cd82bc74da7c53df0829383..94dee2b5b617dc8d8bb4f247694afdb897226bfa 100644 --- a/compute/pzgetrf.c +++ b/compute/pzgetrf.c @@ -16,7 +16,7 @@ * @author Mathieu Faverge * @author Emmanuel Agullo * @author Matthieu Kuhn - * @date 2023-08-22 + * @date 2023-08-31 * @precisions normal z -> s d c * */ @@ -154,6 +154,7 @@ chameleon_pzgetrf_panel_facto_percol( struct chameleon_pzgetrf_s *ws, } /* Flush temporary data used for the pivoting */ + INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k ); RUNTIME_ipiv_flushk( options->sequence, ipiv, k ); } @@ -191,20 +192,59 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws, static inline void chameleon_pzgetrf_panel_permute( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, int k, int n, RUNTIME_option_t *options ) { - (void)ws; - (void)A; - (void)k; - (void)n; - (void)options; + switch( ws->alg ) { + case ChamGetrfPPiv: + chameleon_attr_fallthrough; + case ChamGetrfPPivPerColumn: + { + int m; + int tempkm, tempkn, tempnn, minmn; + + tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; + tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; + tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + minmn = chameleon_min( tempkm, tempkn ); + + /* Extract selected rows into U */ + INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn, + A(k, n), U(k, n) ); + + /* + * perm array is made of size tempkm for the first row especially. 
+ * Otherwise, the final copy back to the tile may copy only a partial tile + */ + INSERT_TASK_zlaswp_get( options, k*A->mb, tempkm, + ipiv, k, A(k, n), U(k, n) ); + + for(m=k+1; m<A->mt; m++){ + /* Extract selected rows into U(k, n) */ + INSERT_TASK_zlaswp_get( options, m*A->mb, minmn, + ipiv, k, A(m, n), U(k, n) ); + /* Copy rows from A(k,n) into their final position */ + INSERT_TASK_zlaswp_set( options, m*A->mb, minmn, + ipiv, k, A(k, n), A(m, n) ); + } + + INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn, + U(k, n), A(k, n) ); + + RUNTIME_data_flush( options->sequence, U(k, n) ); + } + break; + default: + ; + } } static inline void chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, int k, int n, RUNTIME_option_t *options ) @@ -217,7 +257,7 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; - chameleon_pzgetrf_panel_permute( ws, A, k, n, options ); + chameleon_pzgetrf_panel_permute( ws, A, ipiv, k, n, options ); INSERT_TASK_ztrsm( options, @@ -270,7 +310,7 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, for (n = k+1; n < A->nt; n++) { options.priority = A->nt-n; - chameleon_pzgetrf_panel_update( ws, A, k, n, &options ); + chameleon_pzgetrf_panel_update( ws, A, IPIV, k, n, &options ); } /* Flush panel k */ @@ -284,11 +324,12 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, /* Backward pivoting */ for (k = 1; k < min_mnt; k++) { for (n = 0; n < k; n++) { - chameleon_pzgetrf_panel_permute( ws, A, k, n, &options ); + chameleon_pzgetrf_panel_permute( ws, A, IPIV, k, n, &options ); } + RUNTIME_perm_flushk( sequence, IPIV, k ); } - /* Initialize IPIV */ + /* Initialize IPIV with default values if needed */ if ( (ws->alg == ChamGetrfNoPivPerColumn) || (ws->alg == ChamGetrfNoPiv ) ) { diff --git a/compute/zgetrf.c b/compute/zgetrf.c index 
73c810be2c1f3294583b6599aff49e726f8f049d..98d5f0e08f7d14470a81c1898c0a2d096f73fdf9 100644 --- a/compute/zgetrf.c +++ b/compute/zgetrf.c @@ -19,7 +19,7 @@ * @author Florent Pruvost * @author Matthieu Kuhn * @author Lionel Eyraud-Dubois - * @date 2023-08-22 + * @date 2023-08-31 * * @precisions normal z -> s d c * @@ -95,6 +95,15 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A ) A->mt, A->nt * A->nb, A->p, A->q, NULL, NULL, A->get_rankof_init, A->get_rankof_init_arg ); } + else if ( ( ws->alg == ChamGetrfPPiv ) || + ( ws->alg == ChamGetrfPPivPerColumn ) ) + { + chameleon_desc_init( &(ws->U), CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, A->mb, A->nb, A->mb*A->nb, + A->m, A->n, 0, 0, + A->m, A->n, A->p, A->q, + NULL, NULL, A->get_rankof_init, A->get_rankof_init_arg ); + } /* Set ib to 1 if per column algorithm */ if ( ( ws->alg == ChamGetrfNoPivPerColumn ) || @@ -130,7 +139,10 @@ CHAMELEON_zgetrf_WS_Free( void *user_ws ) { struct chameleon_pzgetrf_s *ws = (struct chameleon_pzgetrf_s *)user_ws; - if ( ws->alg == ChamGetrfNoPivPerColumn ) { + if ( ( ws->alg == ChamGetrfNoPivPerColumn ) || + ( ws->alg == ChamGetrfPPiv ) || + ( ws->alg == ChamGetrfPPivPerColumn ) ) + { chameleon_desc_destroy( &(ws->U) ); } free( ws ); diff --git a/coreblas/compute/CMakeLists.txt b/coreblas/compute/CMakeLists.txt index 7f89dc29ed96d240f310ac7032ba98d17fe2d90c..bec6c5aaf474aae765f84ac8f60e6d00c950fc62 100644 --- a/coreblas/compute/CMakeLists.txt +++ b/coreblas/compute/CMakeLists.txt @@ -17,14 +17,14 @@ # Univ. of California Berkeley, # Univ. of Colorado Denver. 
# -# @version 1.2.0 +# @version 1.3.0 # @author Cedric Castagnede # @author Emmanuel Agullo # @author Mathieu Faverge # @author Florent Pruvost # @author Guillaume Sylvand # @author Matthieu Kuhn -# @date 2022-02-22 +# @date 2023-08-31 # ### @@ -68,8 +68,9 @@ set(ZSRC core_zlanhe.c core_zlansy.c core_zlantr.c - core_zlaset2.c core_zlaset.c + core_zlaset2.c + core_zlaswp.c core_zlatro.c core_zlauum.c core_zpamm.c @@ -132,9 +133,10 @@ precisions_rules_py(COREBLAS_SRCS_GENERATED "${ZSRC}" PRECISIONS "${CHAMELEON_PRECISION}") set(COREBLAS_SRCS - global.c - ${COREBLAS_SRCS_GENERATED} - ) + global.c + core_ipiv_to_perm.c + ${COREBLAS_SRCS_GENERATED} +) # Force generation of sources # --------------------------- diff --git a/coreblas/compute/core_ipiv_to_perm.c b/coreblas/compute/core_ipiv_to_perm.c new file mode 100644 index 0000000000000000000000000000000000000000..290d1d1f801ebb9e5e29afcb76bdec8cb999f07a --- /dev/null +++ b/coreblas/compute/core_ipiv_to_perm.c @@ -0,0 +1,97 @@ +/** + * + * @file core_ipiv_to_perm.c + * + * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon core_ipiv_to_perm CPU kernel + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-08-31 + */ +#include "coreblas.h" + +/** + ******************************************************************************* + * + * The idea here is to generate a permutation from the sequence of + * pivot. To avoid storing one whole column at each step, we keep + * track of two vectors of nb elements, the first one contains the + * permutation of the first nb elements, and the second one contains + * the inverse permutation of those same elements. + * + * Let i be the element to pivot with ip: 
ipiv[i] = ip; + * We set i_1 such that invp[ i_1 ] = i + * and ip_1 such that invp[ ip_1 ] = ip + * + * At each step we want to: + * - swap perm[i] and perm[ip] + * - set invp[i_1] to ip + * - set invp[ip_1] to i + * + ******************************************************************************* + * + * @param[in] m0 + * The base index for all values in ipiv, perm and invp. m0 >= 0. + * + * @param[in] m + * The number of elements in perm and invp. m >= 0. + * + * @param[in] k + * The number of elements in ipiv. k >= 0. + * + * @param[in] ipiv + * The pivot array of size k. This is a (m0+1)-based indices array to follow + * the Fortran standard. + * + * @param[out] perm + * The permutation array of the destination row indices (m0-based) of the [m0,m0+m-1] set of rows. + * + * @param[out] invp + * The permutation array of the origin row indices (m0-based) of the [m0,m0+m-1] set of rows. + * + */ +void CORE_ipiv_to_perm( int m0, int m, int k, int *ipiv, int *perm, int *invp ) +{ + int i, j, ip; + int i_1, ip_1; + + for(i=0; i < m; i++) { + perm[i] = i + m0; + invp[i] = i + m0; + } + + for(i = 0; i < k; i++) { + ip = ipiv[i]-1; + assert( ip - m0 >= i ); + + if ( ip - m0 > i ) { + + i_1 = perm[i]; + + if (ip-m0 < m) { + ip_1 = perm[ip-m0]; + perm[ip-m0] = i_1; + } else { + ip_1 = ip; + for(j=0; j < m; j++) { + if( invp[j] == ip ) { + ip_1 = j + m0; + break; + } + } + } + + perm[i] = ip_1; + i_1 -= m0; + ip_1 -= m0; + + if (i_1 < m) invp[i_1 ] = ip; + if (ip_1 < m) invp[ip_1] = i + m0; + } + } +} diff --git a/coreblas/compute/core_zgetrf.c b/coreblas/compute/core_zgetrf.c index 3c65462504d3792ca61a48d423b99b9efff0d89d..3089359dafb7dad2c4472cc0a03b527288e337f9 100644 --- a/coreblas/compute/core_zgetrf.c +++ b/coreblas/compute/core_zgetrf.c @@ -19,14 +19,13 @@ * @author Cedric Castagnede * @author Florent Pruvost * @author Matthieu Kuhn - * @date 2023-07-26 + * @date 2023-08-31 * @precisions normal z -> c d s * */ #include "coreblas/lapacke.h" #include "coreblas.h" - int CORE_zgetrf( 
int m, int n, CHAMELEON_Complex64_t *A, int lda, int *IPIV, int *info ) diff --git a/coreblas/compute/core_zgetrf_panel.c b/coreblas/compute/core_zgetrf_panel.c index 68911699b39b62aa2e12007048bab72311a620f6..f3467b1a3f750b0d87263fa2acda337ac3c00820 100644 --- a/coreblas/compute/core_zgetrf_panel.c +++ b/coreblas/compute/core_zgetrf_panel.c @@ -134,13 +134,7 @@ CORE_zgetrf_panel_diag( int m, int n, int h, int m0, cblas_zscal( m-h, CBLAS_SADDR( alpha ), L, 1 ); } - /* - * h is compared only to n, because if we are on the last column of a - * tile, m might be much smaller than n, and still we need to apply - * the geru call. If this is the diagonal tile, we will just look for - * the next maximum for nothing. - */ - if ( h < n ) { + if ( h < chameleon_min( m, n ) ) { /* Applying the update */ cblas_zgeru(CblasColMajor, m-h, n-h, CBLAS_SADDR(mzone), diff --git a/coreblas/compute/core_zlaswp.c b/coreblas/compute/core_zlaswp.c new file mode 100644 index 0000000000000000000000000000000000000000..e28c82b1785c778b704eccab116b2be81bd93859 --- /dev/null +++ b/coreblas/compute/core_zlaswp.c @@ -0,0 +1,223 @@ +/** + * + * @file core_zlaswp.c + * + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon core_zgetrf with partial pivoting CPU kernel + * + * @version 1.3.0 + * @author Mathieu Faverge + * @author Matthieu Kuhn + * @date 2023-08-31 + * @precisions normal z -> c d s + * + */ +#include "coreblas/lapacke.h" +#include "coreblas.h" + +/** + ****************************************************************************** + * + * @ingroup CORE_CHAMELEON_Complex64_t + * + * CORE_zlaswp_get extracts the rows from the tile A that have been selected as + * pivot into the tile B. + * + ******************************************************************************* + * + * @param[in] m0 + * The index of the first row of the tile A into the larger matrix it + * belongs to. 
+ * + * @param[in] m + * The number of rows of the matrix A. + * + * @param[in] n + * The number of columns of the matrices A and B. + * + * @param[in] k + * The number of rows of the matrix B. This is the number of potential + * pivot that can be extracted from A. + * + * @param[in] A + * On entry, the matrix A of dimension lda-by-n where to extract the + * pivot rows if some are selected in the range m0..m0+m. + * + * @param[in] lda + * The leading dimension of the array A. lda >= max(1,m). + * + * @param[inout] B + * On entry, a matrix of size ldb-by-n with 0s or already collected + * rows. + * On exit, B is filled with the selected rows from A, such that for + * each row i, B[i] = A[perm[i]-m0]. + * + * @param[in] ldb + * The leading dimension of the array B. ldb >= max(1,k). + * + * @param[in] perm + * The permutation array of dimension k. + * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * @retval <0 if -i, the i-th argument had an illegal value + * + */ +int +CORE_zlaswp_get( int m0, int m, int n, int k, + const CHAMELEON_Complex64_t *A, int lda, + CHAMELEON_Complex64_t *B, int ldb, + const int *perm ) +{ + int i; + + /* Check input arguments */ + if (m0 < 0) { + coreblas_error(1, "Illegal value of m0"); + return -1; + } + if (m < 0) { + coreblas_error(2, "Illegal value of m"); + return -2; + } + if (n < 0) { + coreblas_error(3, "Illegal value of n"); + return -3; + } + if (k < 0) { + coreblas_error(4, "Illegal value of k"); + return -4; + } + if ((lda < chameleon_max(1,m)) && (m > 0)) { + coreblas_error(6, "Illegal value of lda"); + return -6; + } + if ((ldb < chameleon_max(1,k)) && (k > 0)) { + coreblas_error(8, "Illegal value of ldb"); + return -8; + } + + /* Quick return */ + if ((m == 0) || (n == 0) || (k == 0)) { + return CHAMELEON_SUCCESS; + } + + for( i=0; i<k; i++ ) + { + int idx = perm[i] - m0; + + if ( ( idx >= 0 ) && (idx < m ) ) + { + cblas_zcopy( n, A + 
idx, lda, + B + i, ldb ); + } + } + + return CHAMELEON_SUCCESS; +} + +/** + ****************************************************************************** + * + * @ingroup CORE_CHAMELEON_Complex64_t + * + * CORE_zlaswp_set copies the rows from the tile A into the tile B when they are + * the destination of the pivoted rows. + * + ******************************************************************************* + * + * @param[in] m0 + * The index of the first row of the tile B into the larger matrix it + * belongs to. + * + * @param[in] m + * The number of rows of the matrix B. + * + * @param[in] n + * The number of columns of the matrices A and B. + * + * @param[in] k + * The number of rows of the matrix A. This is the number of potential + * pivot that can be inserted into B. + * + * @param[in] A + * On entry, the matrix A of dimension lda-by-n where to read the + * pivoted rows. + * + * @param[in] lda + * The leading dimension of the array A. lda >= max(1,k). + * + * @param[inout] B + * On entry, a matrix of size ldb-by-n that may require some pivoted rows. + * On exit, B is updated with the pivoted rows it needs to receive, such that for + * each row i, B[invp[i]-m0] = A[i]. + * + * @param[in] ldb + * The leading dimension of the array B. ldb >= max(1,m). + * + * @param[in] invp + * The inverse permutation array of dimension k. 
+ * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * @retval <0 if -i, the i-th argument had an illegal value + * + */ +int +CORE_zlaswp_set( int m0, int m, int n, int k, + const CHAMELEON_Complex64_t *A, int lda, + CHAMELEON_Complex64_t *B, int ldb, + const int *invp ) +{ + int i; + + /* Check input arguments */ + if (m0 < 0) { + coreblas_error(1, "Illegal value of m0"); + return -1; + } + if (m < 0) { + coreblas_error(2, "Illegal value of m"); + return -2; + } + if (n < 0) { + coreblas_error(3, "Illegal value of n"); + return -3; + } + if (k < 0) { + coreblas_error(4, "Illegal value of k"); + return -4; + } + if ((lda < chameleon_max(1,k)) && (k > 0)) { + coreblas_error(6, "Illegal value of lda"); + return -6; + } + if ((ldb < chameleon_max(1,m)) && (m > 0)) { + coreblas_error(8, "Illegal value of ldb"); + return -8; + } + + /* Quick return */ + if ((m == 0) || (n == 0) || (k == 0)) { + return CHAMELEON_SUCCESS; + } + + for( i=0; i<k; i++ ) + { + int idx = invp[i] - m0; + + if ( ( idx >= 0 ) && (idx < m ) ) + { + cblas_zcopy( n, A + i, lda, + B + idx, ldb ); + } + } + + return CHAMELEON_SUCCESS; +} diff --git a/coreblas/compute/core_ztile.c b/coreblas/compute/core_ztile.c index 6de9b2c63cd546ddb8216fa0f5c0bdfd522fdb1d..0383290fa13b9b02bb321c939d73fa72720ad2ad 100644 --- a/coreblas/compute/core_ztile.c +++ b/coreblas/compute/core_ztile.c @@ -9,11 +9,11 @@ * * @brief Chameleon CPU kernel interface from CHAM_tile_t layout to the real one. 
* - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge * @author Florent Pruvost * @author Alycia Lisito - * @date 2022-02-22 + * @date 2023-08-31 * @precisions normal z -> c d s * */ @@ -464,6 +464,24 @@ TCORE_zlaset2( cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, CH CORE_zlaset2( uplo, n1, n2, alpha, CHAM_tile_get_ptr( A ), A->ld ); } +int +TCORE_zlaswp_get( int m0, int m, int n, int k, CHAM_tile_t *A, CHAM_tile_t *B, const int *perm ) +{ + coreblas_kernel_trace( A, B ); + assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return CORE_zlaswp_get( m0, m, n, k, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld, perm ); +} + +int +TCORE_zlaswp_set( int m0, int m, int n, int k, CHAM_tile_t *A, CHAM_tile_t *B, const int *invp ) +{ + coreblas_kernel_trace( A, B ); + assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return CORE_zlaswp_set( m0, m, n, k, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld, invp ); +} + int TCORE_zlatro( cham_uplo_t uplo, cham_trans_t trans, diff --git a/coreblas/compute/core_ztile_empty.c b/coreblas/compute/core_ztile_empty.c index c5055a5084520992a023759b3126eb5171f0314a..036c8326311cb1646690e9becc3c539158e33036 100644 --- a/coreblas/compute/core_ztile_empty.c +++ b/coreblas/compute/core_ztile_empty.c @@ -9,10 +9,10 @@ * * @brief Chameleon CPU kernel interface from CHAM_tile_t layout to the real one. 
* - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge * @author Alycia Lisito - * @date 2022-02-22 + * @date 2023-08-31 * @precisions normal z -> c d s * */ @@ -62,6 +62,7 @@ TCORE_zaxpy( int M, coreblas_kernel_trace( A, B ); assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } int @@ -74,6 +75,7 @@ TCORE_zgeadd( __attribute__((unused)) cham_trans_t trans, __attribute__((unused)) CHAM_tile_t * B ) { coreblas_kernel_trace( A, B ); + return 0; } int @@ -88,6 +90,7 @@ TCORE_zgelqt( __attribute__((unused)) int M, coreblas_kernel_trace( A, T ); assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } void @@ -129,6 +132,7 @@ TCORE_zgeqrt( int M, coreblas_kernel_trace( A, T ); assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } int @@ -137,6 +141,7 @@ TCORE_zgessm( int M, int N, int K, int IB, const int *IPIV, const CHAM_tile_t *L coreblas_kernel_trace( L, A ); assert( L->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } int @@ -145,6 +150,7 @@ TCORE_zgessq( cham_store_t storev, int M, int N, const CHAM_tile_t *A, CHAM_tile coreblas_kernel_trace( A, sclssq ); assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( sclssq->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } int @@ -242,6 +248,7 @@ TCORE_zherfb( cham_uplo_t uplo, assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } #if defined( PRECISION_z ) || defined( PRECISION_c ) @@ -255,6 +262,7 @@ 
TCORE_zhessq( cham_store_t storev, coreblas_kernel_trace( A, sclssq ); assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( sclssq->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } #endif @@ -348,6 +356,24 @@ TCORE_zlaset2( cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, CH assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); } +int +TCORE_zlaswp_get( int m0, int m, int n, int k, CHAM_tile_t *A, CHAM_tile_t *B, const int *perm ) +{ + coreblas_kernel_trace( A, B ); + assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; +} + +int +TCORE_zlaswp_set( int m0, int m, int n, int k, CHAM_tile_t *A, CHAM_tile_t *B, const int *invp ) +{ + coreblas_kernel_trace( A, B ); + assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; +} + int TCORE_zlatro( cham_uplo_t uplo, cham_trans_t trans, @@ -359,6 +385,7 @@ TCORE_zlatro( cham_uplo_t uplo, coreblas_kernel_trace( A, B ); assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } void @@ -435,6 +462,7 @@ TCORE_zssssm( int M1, assert( A2->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( L1->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( L2->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } void @@ -496,6 +524,7 @@ TCORE_zsyssq( cham_store_t storev, coreblas_kernel_trace( A, sclssq ); assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( sclssq->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } #if defined( PRECISION_z ) || defined( PRECISION_c ) @@ -504,6 +533,7 @@ TCORE_zsytf2_nopiv( cham_uplo_t uplo, int n, CHAM_tile_t *A ) { 
coreblas_kernel_trace( A ); assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } #endif @@ -521,6 +551,7 @@ TCORE_ztplqt( int M, assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } int @@ -542,6 +573,7 @@ TCORE_ztpmlqt( cham_side_t side, assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } int @@ -563,6 +595,7 @@ TCORE_ztpmqrt( cham_side_t side, assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } int @@ -579,6 +612,7 @@ TCORE_ztpqrt( int M, assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } int @@ -592,6 +626,7 @@ TCORE_ztradd( cham_uplo_t uplo, CHAM_tile_t * B ) { coreblas_kernel_trace( A, B ); + return 0; } void @@ -648,6 +683,7 @@ TCORE_ztrssq( cham_uplo_t uplo, coreblas_kernel_trace( A, sclssq ); assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( sclssq->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } void @@ -677,6 +713,7 @@ TCORE_ztsmlq_hetra1( cham_side_t side, assert( A2->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( V->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } int @@ -700,6 +737,7 @@ TCORE_ztsmqr_hetra1( cham_side_t side, assert( A2->format & 
(CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( V->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } int @@ -719,6 +757,7 @@ TCORE_ztstrf( int M, assert( U->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( L->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } int @@ -738,6 +777,7 @@ TCORE_zunmlq( cham_side_t side, assert( V->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } int @@ -757,13 +797,16 @@ TCORE_zunmqr( cham_side_t side, assert( V->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } + int TCORE_zgesum( cham_store_t storev, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *sum ) { assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( sum->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } int @@ -787,6 +830,7 @@ TCORE_zcesca( int center, assert( Di->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( Dj->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } int @@ -805,4 +849,5 @@ TCORE_zgram( cham_uplo_t uplo, assert( Dj->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( D->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return 0; } diff --git a/coreblas/include/coreblas.h b/coreblas/include/coreblas.h index 
f1c461f29b5e3c9ff55ce65caab1a129bda57c6c..10cc8cc280dc0145367b850b33670beffaf93c53 100644 --- a/coreblas/include/coreblas.h +++ b/coreblas/include/coreblas.h @@ -11,14 +11,14 @@ * * @brief Chameleon CPU kernels main header * - * @version 1.2.0 + * @version 1.3.0 * @author Jakub Kurzak * @author Hatem Ltaief * @author Florent Pruvost * @author Guillaume Sylvand * @author Mathieu Faverge * @author Raphael Boucherie - * @date 2022-02-22 + * @date 2023-08-31 * */ #ifndef _coreblas_h_ @@ -87,6 +87,8 @@ void __coreblas_kernel_trace( const char *func, ... ); #endif +void CORE_ipiv_to_perm( int m0, int m, int k, int *ipiv, int *perm, int *invp ); + END_C_DECLS #endif /* _coreblas_h_ */ diff --git a/coreblas/include/coreblas/coreblas_z.h b/coreblas/include/coreblas/coreblas_z.h index 74382a8df5509d1ca8f2a9420fd7415a58e9dfc1..e5e68e2989f11f56517714e2692adb4ab95771a9 100644 --- a/coreblas/include/coreblas/coreblas_z.h +++ b/coreblas/include/coreblas/coreblas_z.h @@ -11,7 +11,7 @@ * * @brief Chameleon CPU CHAMELEON_Complex64_t kernels header * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Jakub Kurzak @@ -22,7 +22,7 @@ * @author Cedric Castagnede * @author Florent Pruvost * @author Matthieu Kuhn - * @date 2022-02-22 + * @date 2023-08-31 * @precisions normal z -> c d s * */ @@ -178,6 +178,14 @@ void CORE_zlaset2(cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t *tileA, int ldtilea); void CORE_zlaswp(int N, CHAMELEON_Complex64_t *A, int LDA, int I1, int I2, const int *IPIV, int INC); +int CORE_zlaswp_get( int m0, int m, int n, int k, + const CHAMELEON_Complex64_t *A, int lda, + CHAMELEON_Complex64_t *B, int ldb, + const int *perm ); +int CORE_zlaswp_set( int m0, int m, int n, int k, + const CHAMELEON_Complex64_t *A, int lda, + CHAMELEON_Complex64_t *B, int ldb, + const int *invp ); int CORE_zlaswp_ontile( CHAM_desc_t descA, int i1, int i2, const int 
*ipiv, int inc); int CORE_zlaswpc_ontile(CHAM_desc_t descA, int i1, int i2, const int *ipiv, int inc); int CORE_zlatro(cham_uplo_t uplo, cham_trans_t trans, diff --git a/coreblas/include/coreblas/coreblas_ztile.h b/coreblas/include/coreblas/coreblas_ztile.h index 74cd413168d44653f27c408eee7dea77eca603ad..88d80d053b12e792beb61ea38d3a9171b3d8b2eb 100644 --- a/coreblas/include/coreblas/coreblas_ztile.h +++ b/coreblas/include/coreblas/coreblas_ztile.h @@ -7,11 +7,11 @@ * * @brief Chameleon CPU kernel CHAM_tile_t interface * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge * @author Florent Pruvost * @author Alycia Lisito - * @date 2022-02-22 + * @date 2023-08-31 * @precisions normal z -> c d s * */ @@ -54,6 +54,8 @@ void TCORE_zlantr( cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, int int TCORE_zlascal( cham_uplo_t uplo, int m, int n, CHAMELEON_Complex64_t alpha, CHAM_tile_t *A ); void TCORE_zlaset( cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta, CHAM_tile_t *A ); void TCORE_zlaset2( cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, CHAM_tile_t *A ); +int TCORE_zlaswp_get( int m0, int m, int n, int k, CHAM_tile_t *A, CHAM_tile_t *B, const int *perm ); +int TCORE_zlaswp_set( int m0, int m, int n, int k, CHAM_tile_t *A, CHAM_tile_t *B, const int *invp ); int TCORE_zlatro( cham_uplo_t uplo, cham_trans_t trans, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *B ); void TCORE_zlauum( cham_uplo_t uplo, int N, CHAM_tile_t *A ); #if defined(PRECISION_z) || defined(PRECISION_c) diff --git a/include/chameleon/runtime.h b/include/chameleon/runtime.h index a8aaaef56a42b2dbaa25664d86022c51c2f4cd09..2655ef34669642404a79a566db7f0020323f9c65 100644 --- a/include/chameleon/runtime.h +++ b/include/chameleon/runtime.h @@ -18,7 +18,7 @@ * @author Samuel Thibault * @author Philippe Swartvagher * @author Matthieu Kuhn - * @date 2023-08-22 + * @date 2023-08-31 * */ #ifndef _chameleon_runtime_h_ @@ -710,9 
+710,11 @@ void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv ); void RUNTIME_ipiv_init ( CHAM_ipiv_t *ipiv ); void RUNTIME_ipiv_gather ( CHAM_ipiv_t *desc, int *ipiv, int node ); -void *RUNTIME_ipiv_getaddr ( CHAM_ipiv_t *ipiv, int m ); -void *RUNTIME_nextpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h ); -void *RUNTIME_prevpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h ); +void *RUNTIME_ipiv_getaddr ( const CHAM_ipiv_t *ipiv, int m ); +void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h ); +void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h ); +void *RUNTIME_perm_getaddr ( const CHAM_ipiv_t *ipiv, int m ); +void *RUNTIME_invp_getaddr ( const CHAM_ipiv_t *ipiv, int m ); static inline void * RUNTIME_pivot_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) { @@ -730,6 +732,8 @@ void RUNTIME_ipiv_flush ( const CHAM_ipiv_t *ipiv, const RUNTIME_sequence_t *sequence ); void RUNTIME_ipiv_reducek( const RUNTIME_option_t *options, CHAM_ipiv_t *ws, int k, int h ); +void RUNTIME_perm_flushk( const RUNTIME_sequence_t *sequence, + const CHAM_ipiv_t *ipiv, int m ); /** * @} diff --git a/include/chameleon/struct.h b/include/chameleon/struct.h index d7dd07f48dc6fad1cff9359bf3410defdd657357..e995dcbe18321d0b9bfef2c375318cee8c336b1b 100644 --- a/include/chameleon/struct.h +++ b/include/chameleon/struct.h @@ -19,7 +19,7 @@ * @author Samuel Thibault * @author Matthieu Kuhn * @author Lionel Eyraud-Dubois - * @date 2023-08-22 + * @date 2023-08-31 * */ #ifndef _chameleon_struct_h_ @@ -143,13 +143,17 @@ struct chameleon_desc_s { typedef struct chameleon_piv_s { const CHAM_desc_t *desc; /**> Reference descriptor to compute data mapping based on diagonal tiles, and get floating reference type */ - int *data; /**> Pointer to the data */ - void *ipiv; /**> Opaque array of pointers for the runtimes to handle the ipiv array */ - void *nextpiv; /**> Opaque array of pointers for the runtimes to handle the pivot computation structure */ - void *prevpiv; /**> Opaque array of 
pointers for the runtimes to handle the pivot computation structure */ + int *data; /**> Pointer to the data */ + void *ipiv; /**> Opaque array of pointers for the runtimes to handle the ipiv array */ + void *nextpiv; /**> Opaque array of pointers for the runtimes to handle the pivot computation structure */ + void *prevpiv; /**> Opaque array of pointers for the runtimes to handle the pivot computation structure */ + void *perm; /**> Opaque array of pointers for the runtimes to handle the temporary permutation array */ + void *invp; /**> Opaque array of pointers for the runtimes to handle the temporary inverse permutation array */ int64_t mpitag_ipiv; /**> Initial mpi tag values for the ipiv handles */ int64_t mpitag_nextpiv; /**> Initial mpi tag values for the nextpiv handles */ int64_t mpitag_prevpiv; /**> Initial mpi tag values for the prevpiv handles */ + int64_t mpitag_perm; /**> Initial mpi tag values for the perm handles */ + int64_t mpitag_invp; /**> Initial mpi tag values for the invp handles */ int i; /**> row index to the beginning of the submatrix */ int m; /**> The number of row in the vector ipiv */ int mb; /**> The number of row per block */ diff --git a/include/chameleon/tasks.h b/include/chameleon/tasks.h index bc7a59e6f0b36ae28602218d6579c1f47ddbf142..e4131d38409d99f13ffb05c50d4345cb4976078e 100644 --- a/include/chameleon/tasks.h +++ b/include/chameleon/tasks.h @@ -15,7 +15,8 @@ * @author Mathieu Faverge * @author Cedric Augonnet * @author Florent Pruvost - * @date 2023-07-06 + * @author Matthieu Kuhn + * @date 2023-08-31 * */ #ifndef _chameleon_tasks_h_ @@ -121,6 +122,10 @@ void INSERT_TASK_hgemm( const RUNTIME_option_t *options, const CHAM_desc_t *B, int Bm, int Bn, CHAMELEON_Real16_t beta, const CHAM_desc_t *C, int Cm, int Cn ); +void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options, + int m0, int m, int k, + const CHAM_ipiv_t *ipivdesc, int ipivk ); + #include "chameleon/tasks_z.h" #include "chameleon/tasks_d.h" #include 
"chameleon/tasks_c.h" diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index c5704884e1e11331008519bff1a2b955fd6e4321..c5fdfdf4eb7bbbb1374dac51452782b37858c84b 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -24,7 +24,7 @@ * @author Alycia Lisito * @author Romain Peressoni * @author Matthieu Kuhn - * @date 2023-08-22 + * @date 2023-08-31 * @precisions normal z -> c d s * */ @@ -186,6 +186,16 @@ void INSERT_TASK_zlaset( const RUNTIME_option_t *options, void INSERT_TASK_zlaset2( const RUNTIME_option_t *options, cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *tileA, int tileAm, int tileAn ); +void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, + int m0, int k, + const CHAM_ipiv_t *tIPIV, int tIPIVk, + const CHAM_desc_t *tileA, int tileAm, int tileAn, + const CHAM_desc_t *tileB, int tileBm, int tileBn ); +void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, + int m0, int k, + const CHAM_ipiv_t *tIPIV, int tIPIVk, + const CHAM_desc_t *tileA, int tileAm, int tileAn, + const CHAM_desc_t *tileB, int tileBm, int tileBn ); void INSERT_TASK_zlatro( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int m, int n, int mb, const CHAM_desc_t *A, int Am, int An, diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index e63a4dd5e7203333b4890a2aa09c27f71fda66c4..f011e6d9693672653d8301a4743b314b65ed8d2b 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -24,7 +24,7 @@ # @author Florent Pruvost # @author Philippe Virouleau # @author Matthieu Kuhn -# @date 2023-08-22 +# @date 2023-08-31 # ### @@ -76,8 +76,9 @@ set(CODELETS_ZSRC codelets/codelet_zlanhe.c codelets/codelet_zlansy.c codelets/codelet_zlantr.c - codelets/codelet_zlaset2.c codelets/codelet_zlaset.c + codelets/codelet_zlaset2.c + codelets/codelet_zlaswp.c codelets/codelet_zlatro.c codelets/codelet_zlauum.c codelets/codelet_zplghe.c @@ -124,6 +125,7 @@ set(CODELETS_ZSRC 
set(CODELETS_SRC codelets/codelet_map.c + codelets/codelet_ipiv_to_perm.c ) # Check for the subdirectories diff --git a/runtime/openmp/codelets/codelet_ipiv_to_perm.c b/runtime/openmp/codelets/codelet_ipiv_to_perm.c new file mode 100644 index 0000000000000000000000000000000000000000..c2fb60bccb08f7b1eaf980b2a0810cbf628254e3 --- /dev/null +++ b/runtime/openmp/codelets/codelet_ipiv_to_perm.c @@ -0,0 +1,37 @@ +/** + * + * @file openmp/codelet_ipiv_to_perm.c + * + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon OpenMP codelets to convert pivot to permutations + * + * @version 1.3.0 + * @author Mathieu Faverge + * @author Matthieu Kuhn + * @date 2023-08-31 + * + */ +#include "chameleon_openmp.h" +#include "chameleon/tasks.h" +#include "coreblas.h" + +void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options, + int m0, int m, int k, + const CHAM_ipiv_t *ipivdesc, int ipivk ) +{ + int *ipiv = NULL; // get pointer from ipivdesc + int *perm = NULL; // get pointer from ipivdesc + int *invp = NULL; // get pointer from ipivdesc + +#pragma omp task firstprivate( m0, m, k ) depend( in:ipiv[0] ) depend( inout:perm[0] ) depend( inout:invp[0] ) + { + CORE_ipiv_to_perm( m0, m, k, ipiv, perm, invp ); + } + + (void)options; + (void)ipivk; +} diff --git a/runtime/openmp/codelets/codelet_zlaswp.c b/runtime/openmp/codelets/codelet_zlaswp.c new file mode 100644 index 0000000000000000000000000000000000000000..452b73926bc301f9f4f7a1d4ac10ace129a46490 --- /dev/null +++ b/runtime/openmp/codelets/codelet_zlaswp.c @@ -0,0 +1,62 @@ +/** + * + * @file openmp/codelet_zlaswp.c + * + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. 
+ * + *** + * + * @brief Chameleon OpenMP codelets to apply zlaswp on a panel + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-08-31 + * @precisions normal z -> c d s + * + */ +#include "chameleon_openmp.h" +#include "chameleon/tasks_z.h" +#include "coreblas/coreblas_ztile.h" + +/* Submits the OpenMP task calling TCORE_zlaswp_get on tiles A(Am,An) and U(Um,Un). */ +void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, + int m0, int k, + const CHAM_ipiv_t *ipiv, int ipivk, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *U, int Um, int Un ) +{ + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileU = U->get_blktile( U, Um, Un ); + int *perm = NULL; // get perm from ipiv + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileU->format & CHAMELEON_TILE_FULLRANK ); + +#pragma omp task firstprivate( m0, k, ipiv, tileA, tileU ) depend( in:perm ) depend( in:tileA[0] ) depend( inout:tileU[0] ) + { + /* Pass the dimensions of the source tile, not those of the descriptor A */ + TCORE_zlaswp_get( m0, tileA->m, tileA->n, k, tileA, tileU, perm ); + } + + (void)options; + (void)ipivk; +} + +/* Submits the OpenMP task calling TCORE_zlaswp_set on tiles A(Am,An) and B(Bm,Bn). */ +void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, + int m0, int k, + const CHAM_ipiv_t *ipiv, int ipivk, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + int *invp = NULL; // get invp from ipiv + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + +#pragma omp task firstprivate( m0, k, ipiv, tileA, tileB ) depend( in:invp ) depend( in:tileA[0] ) depend( inout:tileB[0] ) + { + /* Pass the dimensions of the destination tile, as the StarPU and PaRSEC codelets do */ + TCORE_zlaswp_set( m0, tileB->m, tileB->n, k, tileA, tileB, invp ); + } + + (void)options; + (void)ipivk; +} diff --git a/runtime/openmp/control/runtime_descriptor_ipiv.c b/runtime/openmp/control/runtime_descriptor_ipiv.c index 03886ca650340279207c8163bc30eac81f4a1054..f10c4156d83f3e50d4b523f3942b0757475b913f 100644 --- a/runtime/openmp/control/runtime_descriptor_ipiv.c +++ b/runtime/openmp/control/runtime_descriptor_ipiv.c @@ -12,7 +12,7 @@ * @version 1.3.0
* @author Mathieu Faverge * @author Matthieu Kuhn - * @date 2023-08-22 + * @date 2023-08-31 * */ #include "chameleon_openmp.h" @@ -29,7 +29,7 @@ void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv ) (void)ipiv; } -void *RUNTIME_ipiv_getaddr( CHAM_ipiv_t *ipiv, int m ) +void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m ) { assert( 0 ); (void)ipiv; @@ -37,7 +37,7 @@ void *RUNTIME_ipiv_getaddr( CHAM_ipiv_t *ipiv, int m ) return NULL; } -void *RUNTIME_nextpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) +void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h ) { assert( 0 ); (void)ipiv; @@ -46,7 +46,7 @@ void *RUNTIME_nextpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) return NULL; } -void *RUNTIME_prevpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) +void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h ) { assert( 0 ); (void)ipiv; @@ -55,6 +55,22 @@ void *RUNTIME_prevpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) return NULL; } +void *RUNTIME_perm_getaddr( const CHAM_ipiv_t *ipiv, int k ) +{ + assert( 0 ); + (void)ipiv; + (void)k; + return NULL; +} + +void *RUNTIME_invp_getaddr( const CHAM_ipiv_t *ipiv, int k ) +{ + assert( 0 ); + (void)ipiv; + (void)k; + return NULL; +} + void RUNTIME_ipiv_flushk( const RUNTIME_sequence_t *sequence, const CHAM_ipiv_t *ipiv, int m ) { @@ -72,6 +88,15 @@ void RUNTIME_ipiv_flush( const CHAM_ipiv_t *ipiv, (void)sequence; } +void RUNTIME_perm_flushk( const RUNTIME_sequence_t *sequence, + const CHAM_ipiv_t *ipiv, int m ) +{ + assert( 0 ); + (void)sequence; + (void)ipiv; + (void)m; +} + void RUNTIME_ipiv_reducek( const RUNTIME_option_t *options, CHAM_ipiv_t *ipiv, int k, int h ) { diff --git a/runtime/parsec/codelets/codelet_ipiv_to_perm.c b/runtime/parsec/codelets/codelet_ipiv_to_perm.c new file mode 100644 index 0000000000000000000000000000000000000000..9a972d879ede836a34dd66017274cc50e71792ee --- /dev/null +++ b/runtime/parsec/codelets/codelet_ipiv_to_perm.c @@ -0,0 +1,50 @@ +/** + * + * @file 
parsec/codelet_ipiv_to_perm.c + * + * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon Parsec codelets to convert pivot to permutations + * + * @version 1.3.0 + * @author Mathieu Faverge + * @author Matthieu Kuhn + * @date 2023-08-31 + * + */ +#include "chameleon_parsec.h" +#include "chameleon/tasks.h" +#include "coreblas.h" + +/* PaRSEC DTD task body: unpacks (m0, m, k, ipiv, perm, invp) and calls CORE_ipiv_to_perm. */ +static inline int +CORE_ipiv_to_perm_parsec( parsec_execution_stream_t *context, + parsec_task_t *this_task ) +{ + int m0, m, k; + int *ipiv, *perm, *invp; + + parsec_dtd_unpack_args( + this_task, &m0, &m, &k, &ipiv, &perm, &invp ); + + CORE_ipiv_to_perm( m0, m, k, ipiv, perm, invp ); + + /* The hook is declared int: return a status instead of falling off the end */ + (void)context; + return PARSEC_HOOK_RETURN_DONE; +} + +void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options, + int m0, int m, int k, + const CHAM_ipiv_t *ipivdesc, int ipivk ) +{ + parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + + parsec_dtd_taskpool_insert_task( + PARSEC_dtd_taskpool, CORE_ipiv_to_perm_parsec, options->priority, "ipiv_to_perm", + sizeof(int), &m0, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &k, VALUE, + PASSED_BY_REF, RUNTIME_ipiv_getaddr( ipivdesc, ipivk ), chameleon_parsec_get_arena_index_ipiv( ipivdesc ) | INPUT, + PASSED_BY_REF, RUNTIME_perm_getaddr( ipivdesc, ipivk ), chameleon_parsec_get_arena_index_perm( ipivdesc ) | OUTPUT, + PASSED_BY_REF, RUNTIME_invp_getaddr( ipivdesc, ipivk ), chameleon_parsec_get_arena_index_invp( ipivdesc ) | OUTPUT, + PARSEC_DTD_ARG_END ); +} diff --git a/runtime/parsec/codelets/codelet_zlaswp.c b/runtime/parsec/codelets/codelet_zlaswp.c new file mode 100644 index 0000000000000000000000000000000000000000..284c450aaee61dd71603034b0531f0503de2de5a --- /dev/null +++ b/runtime/parsec/codelets/codelet_zlaswp.c @@ -0,0 +1,92 @@ +/** + * + * @file parsec/codelet_zlaswp.c + * + * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved.
+ * + *** + * + * @brief Chameleon PaRSEC codelets to apply zlaswp on a panel + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-08-31 + * @precisions normal z -> c d s + * + */ +#include "chameleon_parsec.h" +#include "chameleon/tasks_z.h" +#include "coreblas/coreblas_z.h" + +/* PaRSEC DTD task body for "laswp_get": unpacks the task arguments and calls CORE_zlaswp_get. */ +static inline int +CORE_zlaswp_get_parsec( parsec_execution_stream_t *context, + parsec_task_t *this_task ) +{ + int m0, m, n, k, lda, ldb, *perm; + CHAMELEON_Complex64_t *A, *B; + + /* Every argument, lda and ldb included, is unpacked through its address */ + parsec_dtd_unpack_args( this_task, &m0, &m, &n, &k, &A, &lda, &B, &ldb, &perm ); + + CORE_zlaswp_get( m0, m, n, k, A, lda, B, ldb, perm ); + + (void)context; + return PARSEC_HOOK_RETURN_DONE; +} + +void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, + int m0, int k, + const CHAM_ipiv_t *ipiv, int ipivk, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *U, int Um, int Un ) +{ + parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileU = U->get_blktile( U, Um, Un ); + + parsec_dtd_taskpool_insert_task( + PARSEC_dtd_taskpool, CORE_zlaswp_get_parsec, options->priority, "laswp_get", + sizeof(int), &m0, VALUE, + sizeof(int), &(tileA->m), VALUE, + sizeof(int), &(tileA->n), VALUE, + sizeof(int), &k, VALUE, + PASSED_BY_REF, RTBLKADDR(A, ChamComplexDouble, Am, An), chameleon_parsec_get_arena_index( A ) | INPUT, + sizeof(int), &(tileA->ld), VALUE, + PASSED_BY_REF, RTBLKADDR(U, ChamComplexDouble, Um, Un), chameleon_parsec_get_arena_index( U ) | INOUT, + sizeof(int), &(tileU->ld), VALUE, + PASSED_BY_REF, RUNTIME_perm_getaddr( ipiv, ipivk ), chameleon_parsec_get_arena_index_perm( ipiv ) | INPUT, + PARSEC_DTD_ARG_END ); +} + +/* PaRSEC DTD task body for "laswp_set": unpacks the task arguments and calls CORE_zlaswp_set. */ +static inline int +CORE_zlaswp_set_parsec( parsec_execution_stream_t *context, + parsec_task_t *this_task ) +{ + int m0, m, n, k, lda, ldb, *invp; + CHAMELEON_Complex64_t *A, *B; + + /* Every argument, lda and ldb included, is unpacked through its address */ + parsec_dtd_unpack_args( this_task, &m0, &m, &n, &k, &A, &lda, &B, &ldb, &invp ); + + CORE_zlaswp_set( m0, m, n, k, A, lda, B, ldb, invp ); + + (void)context; + return PARSEC_HOOK_RETURN_DONE;
+} + +void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, + int m0, int k, + const CHAM_ipiv_t *ipiv, int ipivk, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + + parsec_dtd_taskpool_insert_task( + PARSEC_dtd_taskpool, CORE_zlaswp_set_parsec, options->priority, "laswp_set", + sizeof(int), &m0, VALUE, + sizeof(int), &(tileB->m), VALUE, + sizeof(int), &(tileB->n), VALUE, + sizeof(int), &k, VALUE, + PASSED_BY_REF, RTBLKADDR(A, ChamComplexDouble, Am, An), chameleon_parsec_get_arena_index( A ) | INPUT, + sizeof(int), &(tileA->ld), VALUE, + PASSED_BY_REF, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), chameleon_parsec_get_arena_index( B ) | INOUT, + sizeof(int), &(tileB->ld), VALUE, + PASSED_BY_REF, RUNTIME_invp_getaddr( ipiv, ipivk ), chameleon_parsec_get_arena_index_invp( ipiv ) | INPUT, + PARSEC_DTD_ARG_END ); +} diff --git a/runtime/parsec/control/runtime_descriptor_ipiv.c b/runtime/parsec/control/runtime_descriptor_ipiv.c index 04a0b791139d5c6a247b25630e126d4a3eb467bf..fefb42abf9aaa65f98e2959bf09ca24779c95a7d 100644 --- a/runtime/parsec/control/runtime_descriptor_ipiv.c +++ b/runtime/parsec/control/runtime_descriptor_ipiv.c @@ -12,7 +12,7 @@ * @version 1.3.0 * @author Mathieu Faverge * @author Matthieu Kuhn - * @date 2023-08-22 + * @date 2023-08-31 * */ #include "chameleon_parsec.h" @@ -29,7 +29,7 @@ void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv ) (void)ipiv; } -void *RUNTIME_ipiv_getaddr( CHAM_ipiv_t *ipiv, int m ) +void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m ) { assert( 0 ); (void)ipiv; @@ -37,7 +37,7 @@ void *RUNTIME_ipiv_getaddr( CHAM_ipiv_t *ipiv, int m ) return NULL; } -void *RUNTIME_nextpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) +void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h ) { 
assert( 0 ); (void)ipiv; @@ -46,7 +46,7 @@ void *RUNTIME_nextpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) return NULL; } -void *RUNTIME_prevpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) +void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h ) { assert( 0 ); (void)ipiv; @@ -55,6 +55,22 @@ void *RUNTIME_prevpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) return NULL; } +void *RUNTIME_perm_getaddr( const CHAM_ipiv_t *ipiv, int k ) +{ + assert( 0 ); + (void)ipiv; + (void)k; + return NULL; +} + +void *RUNTIME_invp_getaddr( const CHAM_ipiv_t *ipiv, int k ) +{ + assert( 0 ); + (void)ipiv; + (void)k; + return NULL; +} + void RUNTIME_ipiv_flushk( const RUNTIME_sequence_t *sequence, const CHAM_ipiv_t *ipiv, int m ) { @@ -72,6 +88,15 @@ void RUNTIME_ipiv_flush( const CHAM_ipiv_t *ipiv, (void)sequence; } +void RUNTIME_perm_flushk( const RUNTIME_sequence_t *sequence, + const CHAM_ipiv_t *ipiv, int m ) +{ + assert( 0 ); + (void)sequence; + (void)ipiv; + (void)m; +} + void RUNTIME_ipiv_reducek( const RUNTIME_option_t *options, CHAM_ipiv_t *ipiv, int k, int h ) { diff --git a/runtime/parsec/include/chameleon_parsec.h b/runtime/parsec/include/chameleon_parsec.h index 30518fb809779ec0ba9c5ce45701a1187fb07621..23d19fd3f904ab28638d610e11abd564205fc924 100644 --- a/runtime/parsec/include/chameleon_parsec.h +++ b/runtime/parsec/include/chameleon_parsec.h @@ -11,12 +11,12 @@ * * @brief Chameleon PaRSEC runtime header * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge * @author Reazul Hoque * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-08-31 * */ #ifndef _chameleon_parsec_h_ @@ -38,10 +38,28 @@ struct chameleon_parsec_desc_s { typedef struct chameleon_parsec_desc_s chameleon_parsec_desc_t; static inline int -chameleon_parsec_get_arena_index(const CHAM_desc_t *desc) { +chameleon_parsec_get_arena_index( const CHAM_desc_t *desc ) { return ((chameleon_parsec_desc_t *)desc->schedopt)->arena_index; } +static inline int 
+chameleon_parsec_get_arena_index_ipiv( const CHAM_ipiv_t *ipiv ) { + assert(0); + return -1; +} + +static inline int +chameleon_parsec_get_arena_index_perm( const CHAM_ipiv_t *ipiv ) { + assert(0); + return -1; +} + +static inline int +chameleon_parsec_get_arena_index_invp( const CHAM_ipiv_t *ipiv ) { + assert(0); + return -1; +} + static inline int cham_to_parsec_access( cham_access_t accessA ) { if ( accessA == ChamR ) { return INPUT; diff --git a/runtime/quark/codelets/codelet_ipiv_to_perm.c b/runtime/quark/codelets/codelet_ipiv_to_perm.c new file mode 100644 index 0000000000000000000000000000000000000000..8ccc7ddff26438bfc67e8160c5bb4de8419237f4 --- /dev/null +++ b/runtime/quark/codelets/codelet_ipiv_to_perm.c @@ -0,0 +1,48 @@ +/** + * + * @file quark/codelet_ipiv_to_perm.c + * + * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon Quark codelets to convert pivot to permutations + * + * @version 1.3.0 + * @author Mathieu Faverge + * @author Matthieu Kuhn + * @date 2023-08-31 + * + */ +#include "chameleon_quark.h" +#include "chameleon/tasks.h" +#include "coreblas.h" + +static inline void +CORE_ipiv_to_perm_quark( Quark *quark ) +{ + int m0, m, k; + int *ipiv, *perm, *invp; + + quark_unpack_args_6( quark, m0, m, k, ipiv, perm, invp ); + + CORE_ipiv_to_perm( m0, m, k, ipiv, perm, invp ); +} + +void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options, + int m0, int m, int k, + const CHAM_ipiv_t *ipivdesc, int ipivk ) +{ + quark_option_t *opt = (quark_option_t*)(options->schedopt); + + QUARK_Insert_Task( + opt->quark, CORE_ipiv_to_perm_quark, (Quark_Task_Flags*)opt, + sizeof(int), &m0, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &k, VALUE, + sizeof(int*), RUNTIME_ipiv_getaddr( ipivdesc, ipivk ), INPUT, + sizeof(int*), RUNTIME_perm_getaddr( ipivdesc, ipivk ), OUTPUT, + sizeof(int*), RUNTIME_invp_getaddr( ipivdesc, ipivk ), OUTPUT, + 0 ); +} diff --git 
a/runtime/quark/codelets/codelet_zlaswp.c b/runtime/quark/codelets/codelet_zlaswp.c new file mode 100644 index 0000000000000000000000000000000000000000..117d6761882bcbd7222a807bc98d3a741d4712a0 --- /dev/null +++ b/runtime/quark/codelets/codelet_zlaswp.c @@ -0,0 +1,78 @@ +/** + * + * @file quark/codelet_zlaswp.c + * + * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon Quark codelets to apply zlaswp on a panel + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-08-31 + * @precisions normal z -> c d s + * + */ +#include "chameleon_quark.h" +#include "chameleon/tasks_z.h" +#include "coreblas/coreblas_ztile.h" + +/* Quark task body: unpacks (m0, k, perm, A, B) and calls TCORE_zlaswp_get with the dimensions of tile A. */ +static void CORE_zlaswp_get_quark( Quark *quark ) +{ + int m0, k, *perm; + CHAM_tile_t *A, *B; + + quark_unpack_args_5( quark, m0, k, perm, A, B ); + + TCORE_zlaswp_get( m0, A->m, A->n, k, A, B, perm ); +} + +void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, + int m0, int k, + const CHAM_ipiv_t *ipiv, int ipivk, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *U, int Um, int Un ) +{ + quark_option_t *opt = (quark_option_t*)(options->schedopt); + DAG_CORE_LASWP; + + QUARK_Insert_Task( + opt->quark, CORE_zlaswp_get_quark, (Quark_Task_Flags*)opt, + sizeof(int), &m0, VALUE, + sizeof(int), &k, VALUE, + sizeof(int*), RUNTIME_perm_getaddr( ipiv, ipivk ), INPUT, + sizeof(CHAM_tile_t*), RTBLKADDR(A, ChamComplexDouble, Am, An), INPUT, + sizeof(CHAM_tile_t*), RTBLKADDR(U, ChamComplexDouble, Um, Un), INOUT, + 0 ); +} + +/* Quark task body: unpacks (m0, k, invp, A, B) and calls TCORE_zlaswp_set. */ +static void CORE_zlaswp_set_quark( Quark *quark ) +{ + int m0, k, *invp; + CHAM_tile_t *A, *B; + + quark_unpack_args_5( quark, m0, k, invp, A, B ); + + /* Use the dimensions of the destination tile B, as the StarPU and PaRSEC codelets do */ + TCORE_zlaswp_set( m0, B->m, B->n, k, A, B, invp ); +} + +void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, + int m0, int k, + const CHAM_ipiv_t *ipiv, int ipivk, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + quark_option_t *opt =
(quark_option_t*)(options->schedopt); + DAG_CORE_LASWP; + + QUARK_Insert_Task( + opt->quark, CORE_zlaswp_set_quark, (Quark_Task_Flags*)opt, + sizeof(int), &m0, VALUE, + sizeof(int), &k, VALUE, + sizeof(int*), RUNTIME_invp_getaddr( ipiv, ipivk ), INPUT, + sizeof(CHAM_tile_t*), RTBLKADDR(A, ChamComplexDouble, Am, An), INPUT, + sizeof(CHAM_tile_t*), RTBLKADDR(B, ChamComplexDouble, Bm, Bn), INOUT, + 0 ); +} diff --git a/runtime/quark/control/runtime_descriptor_ipiv.c b/runtime/quark/control/runtime_descriptor_ipiv.c index 34706a55518f95f0e4b229a772534e3f062d05d2..88e8f886e8578f99e066868e6dfb2880fc4035d0 100644 --- a/runtime/quark/control/runtime_descriptor_ipiv.c +++ b/runtime/quark/control/runtime_descriptor_ipiv.c @@ -12,7 +12,7 @@ * @version 1.3.0 * @author Mathieu Faverge * @author Matthieu Kuhn - * @date 2023-08-22 + * @date 2023-08-31 * */ #include "chameleon_quark.h" @@ -29,7 +29,7 @@ void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv ) (void)ipiv; } -void *RUNTIME_ipiv_getaddr( CHAM_ipiv_t *ipiv, int m ) +void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m ) { assert( 0 ); (void)ipiv; @@ -37,7 +37,7 @@ void *RUNTIME_ipiv_getaddr( CHAM_ipiv_t *ipiv, int m ) return NULL; } -void *RUNTIME_nextpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) +void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h ) { assert( 0 ); (void)ipiv; @@ -46,7 +46,7 @@ void *RUNTIME_nextpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) return NULL; } -void *RUNTIME_prevpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) +void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h ) { assert( 0 ); (void)ipiv; @@ -55,6 +55,22 @@ void *RUNTIME_prevpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) return NULL; } +void *RUNTIME_perm_getaddr( const CHAM_ipiv_t *ipiv, int k ) +{ + assert( 0 ); + (void)ipiv; + (void)k; + return NULL; +} + +void *RUNTIME_invp_getaddr( const CHAM_ipiv_t *ipiv, int k ) +{ + assert( 0 ); + (void)ipiv; + (void)k; + return NULL; +} + void RUNTIME_ipiv_flushk( 
const RUNTIME_sequence_t *sequence, const CHAM_ipiv_t *ipiv, int m ) { @@ -72,6 +88,15 @@ void RUNTIME_ipiv_flush( const CHAM_ipiv_t *ipiv, (void)sequence; } +void RUNTIME_perm_flushk( const RUNTIME_sequence_t *sequence, + const CHAM_ipiv_t *ipiv, int m ) +{ + assert( 0 ); + (void)sequence; + (void)ipiv; + (void)m; +} + void RUNTIME_ipiv_reducek( const RUNTIME_option_t *options, CHAM_ipiv_t *ipiv, int k, int h ) { diff --git a/runtime/quark/include/chameleon_quark.h b/runtime/quark/include/chameleon_quark.h index 8e415b7c564c486fa9ffd9f4de9585adc3a9410e..bb454e22972857fae4dbd0840696edfd3a978e59 100644 --- a/runtime/quark/include/chameleon_quark.h +++ b/runtime/quark/include/chameleon_quark.h @@ -49,7 +49,7 @@ static inline int cham_to_quark_access( cham_access_t accessA ) { /* * Access to block pointer and leading dimension */ -#define RTBLKADDR( desc, type, m, n ) ( (type*)RUNTIME_data_getaddr( desc, m, n ) ) +#define RTBLKADDR( desc, type, m, n ) ( RUNTIME_data_getaddr( desc, m, n ) ) #define RUNTIME_BEGIN_ACCESS_DECLARATION diff --git a/runtime/starpu/codelets/codelet_ipiv_to_perm.c b/runtime/starpu/codelets/codelet_ipiv_to_perm.c new file mode 100644 index 0000000000000000000000000000000000000000..31183c11505a0f19fa3505691684c37810c0f10e --- /dev/null +++ b/runtime/starpu/codelets/codelet_ipiv_to_perm.c @@ -0,0 +1,69 @@ +/** + * + * @file starpu/codelet_ipiv_to_perm.c + * + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. 
+ * + *** + * + * @brief Chameleon StarPU codelets to convert pivot to permutations + * + * @version 1.3.0 + * @author Mathieu Faverge + * @author Matthieu Kuhn + * @date 2023-08-31 + * + */ +#include "chameleon_starpu.h" +#include "runtime_codelets.h" + +#if !defined(CHAMELEON_SIMULATION) +static void cl_ipiv_to_perm_cpu_func( void *descr[], void *cl_arg ) +{ + int m0, m, k; + int *ipiv, *perm, *invp; + + starpu_codelet_unpack_args( cl_arg, &m0, &m, &k ); + + ipiv = (int*)STARPU_VECTOR_GET_PTR(descr[0]); + perm = (int*)STARPU_VECTOR_GET_PTR(descr[1]); + invp = (int*)STARPU_VECTOR_GET_PTR(descr[2]); + + CORE_ipiv_to_perm( m0, m, k, ipiv, perm, invp ); +} +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* +* Codelet definition +*/ +static struct starpu_codelet cl_ipiv_to_perm = { + .where = STARPU_CPU, +#if defined(CHAMELEON_SIMULATION) + .cpu_funcs[0] = (starpu_cpu_func_t)1, +#else + .cpu_funcs[0] = cl_ipiv_to_perm_cpu_func, +#endif + .nbuffers = 3, + .model = NULL, + .name = "ipiv_to_perm" +}; + +void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options, + int m0, int m, int k, + const CHAM_ipiv_t *ipivdesc, int ipivk ) +{ + struct starpu_codelet *codelet = &cl_ipiv_to_perm; + + rt_starpu_insert_task( + codelet, + STARPU_VALUE, &m0, sizeof(int), + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &k, sizeof(int), + STARPU_R, RUNTIME_ipiv_getaddr( ipivdesc, ipivk ), + STARPU_W, RUNTIME_perm_getaddr( ipivdesc, ipivk ), + STARPU_W, RUNTIME_invp_getaddr( ipivdesc, ipivk ), + STARPU_PRIORITY, options->priority, + STARPU_EXECUTE_ON_WORKER, options->workerid, + 0 ); +} diff --git a/runtime/starpu/codelets/codelet_zlaswp.c b/runtime/starpu/codelets/codelet_zlaswp.c new file mode 100644 index 0000000000000000000000000000000000000000..2d8fc31d422fa39da13db5f2b2240cd7096d64e3 --- /dev/null +++ b/runtime/starpu/codelets/codelet_zlaswp.c @@ -0,0 +1,108 @@ +/** + * + * @file starpu/codelet_zlaswp.c + * + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * 
Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon StarPU codelets to apply zlaswp on a panel + * + * @version 1.3.0 + * @author Mathieu Faverge + * @author Matthieu Kuhn + * @date 2023-08-31 + * @precisions normal z -> c d s + * + */ +#include "chameleon_starpu.h" +#include "runtime_codelet_z.h" + +#if !defined(CHAMELEON_SIMULATION) +static void cl_zlaswp_get_cpu_func( void *descr[], void *cl_arg ) +{ + int m0, k, *perm; + CHAM_tile_t *A, *B; + + starpu_codelet_unpack_args( cl_arg, &m0, &k ); + + perm = (int *)STARPU_VECTOR_GET_PTR( descr[0] ); + A = (CHAM_tile_t *) cti_interface_get( descr[1] ); + B = (CHAM_tile_t *) cti_interface_get( descr[2] ); + + TCORE_zlaswp_get( m0, A->m, A->n, k, A, B, perm ); +} +#endif + +/* + * Codelet definition + */ +CODELETS_CPU( zlaswp_get, cl_zlaswp_get_cpu_func ) + +void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, + int m0, int k, + const CHAM_ipiv_t *ipiv, int ipivk, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *U, int Um, int Un ) +{ + struct starpu_codelet *codelet = &cl_zlaswp_get; + + //void (*callback)(void*) = options->profiling ? 
cl_zlaswp_get_callback : NULL; + + rt_starpu_insert_task( + codelet, + STARPU_VALUE, &m0, sizeof(int), + STARPU_VALUE, &k, sizeof(int), + STARPU_R, RUNTIME_perm_getaddr( ipiv, ipivk ), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_RW | STARPU_COMMUTE, RTBLKADDR(U, ChamComplexDouble, Um, Un), + STARPU_PRIORITY, options->priority, + //STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, options->workerid, + 0 ); +} + +#if !defined(CHAMELEON_SIMULATION) +static void cl_zlaswp_set_cpu_func( void *descr[], void *cl_arg ) +{ + int m0, k, *invp; + CHAM_tile_t *A, *B; + + starpu_codelet_unpack_args( cl_arg, &m0, &k ); + + invp = (int *)STARPU_VECTOR_GET_PTR( descr[0] ); + A = (CHAM_tile_t *) cti_interface_get( descr[1] ); + B = (CHAM_tile_t *) cti_interface_get( descr[2] ); + + TCORE_zlaswp_set( m0, B->m, B->n, k, A, B, invp ); +} +#endif + +/* + * Codelet definition + */ +CODELETS_CPU( zlaswp_set, cl_zlaswp_set_cpu_func ) + +void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, + int m0, int k, + const CHAM_ipiv_t *ipiv, int ipivk, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + struct starpu_codelet *codelet = &cl_zlaswp_set; + + //void (*callback)(void*) = options->profiling ? 
cl_zlaswp_set_callback : NULL; + + rt_starpu_insert_task( + codelet, + STARPU_VALUE, &m0, sizeof(int), + STARPU_VALUE, &k, sizeof(int), + STARPU_R, RUNTIME_invp_getaddr( ipiv, ipivk ), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_RW, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), + STARPU_PRIORITY, options->priority, + //STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, options->workerid, + 0 ); +} diff --git a/runtime/starpu/control/runtime_descriptor_ipiv.c b/runtime/starpu/control/runtime_descriptor_ipiv.c index 4131f7d6c79858624ed0b324f6785aebfb195d7e..efd5afb3637fb65cb8b0dd49acdf14a5c5bf83a1 100644 --- a/runtime/starpu/control/runtime_descriptor_ipiv.c +++ b/runtime/starpu/control/runtime_descriptor_ipiv.c @@ -12,7 +12,7 @@ * @version 1.3.0 * @author Mathieu Faverge * @author Matthieu Kuhn - * @date 2023-08-22 + * @date 2023-08-31 * */ #include "chameleon_starpu.h" @@ -23,10 +23,16 @@ void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv ) { assert( ipiv ); - - ipiv->ipiv = (void*)calloc( ipiv->mt, sizeof(starpu_data_handle_t) ); - ipiv->nextpiv = (void*)calloc( ipiv->mt, sizeof(starpu_data_handle_t) ); - ipiv->prevpiv = (void*)calloc( ipiv->mt, sizeof(starpu_data_handle_t) ); + starpu_data_handle_t *handles = calloc( 5 * ipiv->mt, sizeof(starpu_data_handle_t) ); + ipiv->ipiv = handles; + handles += ipiv->mt; + ipiv->nextpiv = handles; + handles += ipiv->mt; + ipiv->prevpiv = handles; + handles += ipiv->mt; + ipiv->perm = handles; + handles += ipiv->mt; + ipiv->invp = handles; #if defined(CHAMELEON_USE_MPI) /* * Book the number of tags required to describe pivot structure @@ -34,13 +40,15 @@ void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv ) */ { chameleon_starpu_tag_init(); - ipiv->mpitag_ipiv = chameleon_starpu_tag_book( (int64_t)(ipiv->mt) * 3 ); + ipiv->mpitag_ipiv = chameleon_starpu_tag_book( (int64_t)(ipiv->mt) * 5 ); if ( ipiv->mpitag_ipiv == -1 ) { chameleon_fatal_error("RUNTIME_ipiv_create", "Can't pursue computation since no more tags are available 
for ipiv structure"); return; } ipiv->mpitag_nextpiv = ipiv->mpitag_ipiv + ipiv->mt; ipiv->mpitag_prevpiv = ipiv->mpitag_nextpiv + ipiv->mt; + ipiv->mpitag_perm = ipiv->mpitag_prevpiv + ipiv->mt; + ipiv->mpitag_invp = ipiv->mpitag_perm + ipiv->mt; } #endif } @@ -51,37 +59,26 @@ void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv ) void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv ) { int i; - starpu_data_handle_t *ipiv_handle = (starpu_data_handle_t*)(ipiv->ipiv); - starpu_data_handle_t *nextpiv_handle = (starpu_data_handle_t*)(ipiv->nextpiv); - starpu_data_handle_t *prevpiv_handle = (starpu_data_handle_t*)(ipiv->prevpiv); - - for(i=0; i<ipiv->mt; i++) { - if ( *ipiv_handle != NULL ) { - starpu_data_unregister( *ipiv_handle ); - *ipiv_handle = NULL; - } - ipiv_handle++; - - if ( *nextpiv_handle != NULL ) { - starpu_data_unregister( *nextpiv_handle ); - *nextpiv_handle = NULL; - } - nextpiv_handle++; + starpu_data_handle_t *handle = (starpu_data_handle_t*)(ipiv->ipiv); - if ( *prevpiv_handle != NULL ) { - starpu_data_unregister( *prevpiv_handle ); - *prevpiv_handle = NULL; + for(i=0; i<(5 * ipiv->mt); i++) { + if ( *handle != NULL ) { + starpu_data_unregister( *handle ); + *handle = NULL; } - prevpiv_handle++; + handle++; } free( ipiv->ipiv ); - free( ipiv->nextpiv ); - free( ipiv->prevpiv ); + ipiv->ipiv = NULL; + ipiv->nextpiv = NULL; + ipiv->prevpiv = NULL; + ipiv->perm = NULL; + ipiv->invp = NULL; chameleon_starpu_tag_release( ipiv->mpitag_ipiv ); } -void *RUNTIME_ipiv_getaddr( CHAM_ipiv_t *ipiv, int m ) +void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m ) { starpu_data_handle_t *handle = (starpu_data_handle_t*)(ipiv->ipiv); int64_t mm = m + (ipiv->i / ipiv->mb); @@ -110,7 +107,7 @@ void *RUNTIME_ipiv_getaddr( CHAM_ipiv_t *ipiv, int m ) return *handle; } -void *RUNTIME_nextpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) +void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h ) { starpu_data_handle_t *nextpiv = (starpu_data_handle_t*)(ipiv->nextpiv); 
int64_t mm = m + (ipiv->i / ipiv->mb); @@ -124,7 +121,7 @@ void *RUNTIME_nextpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) const CHAM_desc_t *A = ipiv->desc; int owner = A->get_rankof( A, m, m ); - int ncols = (mm == (ipiv->mt-1)) ? ipiv->m - mm * ipiv->mb : ipiv->mb; + int ncols = (mm == (A->nt-1)) ? A->n - mm * A->nb : A->nb; int64_t tag = ipiv->mpitag_nextpiv + mm; cppi_register( nextpiv, A->dtyp, ncols, tag, owner ); @@ -133,7 +130,7 @@ void *RUNTIME_nextpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) return *nextpiv; } -void *RUNTIME_prevpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) +void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h ) { starpu_data_handle_t *prevpiv = (starpu_data_handle_t*)(ipiv->prevpiv); int64_t mm = m + (ipiv->i / ipiv->mb); @@ -147,7 +144,7 @@ void *RUNTIME_prevpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) const CHAM_desc_t *A = ipiv->desc; int owner = A->get_rankof( A, m, m ); - int ncols = (mm == (ipiv->mt-1)) ? ipiv->m - mm * ipiv->mb : ipiv->mb; + int ncols = (mm == (A->nt-1)) ? 
A->n - mm * A->nb : A->nb; int64_t tag = ipiv->mpitag_prevpiv + mm; cppi_register( prevpiv, A->dtyp, ncols, tag, owner ); @@ -156,6 +153,64 @@ void *RUNTIME_prevpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) return *prevpiv; } +void *RUNTIME_perm_getaddr( const CHAM_ipiv_t *ipiv, int m ) +{ + starpu_data_handle_t *handle = (starpu_data_handle_t*)(ipiv->perm); + int64_t mm = m + (ipiv->i / ipiv->mb); + + handle += mm; + assert( handle ); + + if ( *handle != NULL ) { + return *handle; + } + + const CHAM_desc_t *A = ipiv->desc; + int owner = A->get_rankof( A, m, m ); + int ncols = ipiv->mb; + + starpu_vector_data_register( handle, -1, (uintptr_t)NULL, ncols, sizeof(int) ); + +#if defined(CHAMELEON_USE_MPI) + { + int64_t tag = ipiv->mpitag_perm + mm; + starpu_mpi_data_register( *handle, tag, owner ); + } +#endif /* defined(CHAMELEON_USE_MPI) */ + + assert( *handle ); + return *handle; +} + +void *RUNTIME_invp_getaddr( const CHAM_ipiv_t *ipiv, int m ) +{ + starpu_data_handle_t *handle = (starpu_data_handle_t*)(ipiv->invp); + int64_t mm = m + (ipiv->i / ipiv->mb); + + handle += mm; + assert( handle ); + + if ( *handle != NULL ) { + return *handle; + } + + const CHAM_desc_t *A = ipiv->desc; + int owner = A->get_rankof( A, m, m ); + int ncols = ipiv->mb; + + starpu_vector_data_register( handle, -1, (uintptr_t)NULL, ncols, sizeof(int) ); + +#if defined(CHAMELEON_USE_MPI) + { + int64_t tag = ipiv->mpitag_invp + mm; + starpu_mpi_data_register( *handle, tag, owner ); + } +#endif /* defined(CHAMELEON_USE_MPI) */ + + assert( *handle ); + return *handle; +} + void RUNTIME_ipiv_flushk( const RUNTIME_sequence_t *sequence, const CHAM_ipiv_t *ipiv, int m ) { @@ -205,6 +260,44 @@ void RUNTIME_ipiv_flush( const CHAM_ipiv_t *ipiv, } } +void RUNTIME_perm_flushk( const RUNTIME_sequence_t *sequence, + const CHAM_ipiv_t *ipiv, int m ) +{ + starpu_data_handle_t *handle; + const CHAM_desc_t *A = ipiv->desc; + int64_t mm = m + ( ipiv->i / ipiv->mb ); + + handle = 
(starpu_data_handle_t*)(ipiv->perm); + handle += mm; + + if ( *handle != NULL ) { +#if defined(CHAMELEON_USE_MPI) + starpu_mpi_cache_flush( MPI_COMM_WORLD, *handle ); + if ( starpu_mpi_data_get_rank( *handle ) == A->myrank ) +#endif + { + chameleon_starpu_data_wont_use( *handle ); + } + } + + handle = (starpu_data_handle_t*)(ipiv->invp); + handle += mm; + + if ( *handle != NULL ) { +#if defined(CHAMELEON_USE_MPI) + starpu_mpi_cache_flush( MPI_COMM_WORLD, *handle ); + if ( starpu_mpi_data_get_rank( *handle ) == A->myrank ) +#endif + { + chameleon_starpu_data_wont_use( *handle ); + } + } + + (void)sequence; + (void)ipiv; + (void)m; +} + void RUNTIME_ipiv_reducek( const RUNTIME_option_t *options, CHAM_ipiv_t *ipiv, int k, int h ) { @@ -276,14 +369,38 @@ void RUNTIME_ipiv_gather( CHAM_ipiv_t *desc, int *ipiv, int node ) int64_t mb = desc->mb; int64_t tag = chameleon_starpu_tag_book( (int64_t)(desc->mt) ); int rank = CHAMELEON_Comm_rank(); + int owner = rank; int m; for (m = 0; m < mt; m++, ipiv += mb) { starpu_data_handle_t ipiv_src = RUNTIME_ipiv_getaddr( desc, m ); #if defined(CHAMELEON_USE_MPI) - if ( (rank == node) || - (rank == starpu_mpi_data_get_rank(ipiv_src)) ) + owner = starpu_mpi_data_get_rank( ipiv_src ); + if ( node != owner ) { + starpu_mpi_tag_t tag = starpu_mpi_data_get_tag( ipiv_src ); + + if ( rank == node ) + { + /* Need to receive the data */ + int already_received = starpu_mpi_cached_receive_set( ipiv_src ); + if (already_received == 0) + { + MPI_Status status; + starpu_mpi_recv( ipiv_src, owner, tag, MPI_COMM_WORLD, &status ); + } + } + else if ( rank == owner ) + { + /* Need to send the data */ + int already_sent = starpu_mpi_cached_send_set( ipiv_src, node ); + if (already_sent == 0) + { + starpu_mpi_send( ipiv_src, node, tag, MPI_COMM_WORLD ); + } + } + } + if ( rank == node ) #endif { starpu_data_handle_t ipiv_dst; diff --git a/runtime/starpu/include/cppi_interface.h b/runtime/starpu/include/cppi_interface.h index 
537bc9cd807c9e27f0cf550d6611e2bc974255d3..7a77784656291b3f2b91ccd265f950a7d8889d8d 100644 --- a/runtime/starpu/include/cppi_interface.h +++ b/runtime/starpu/include/cppi_interface.h @@ -82,8 +82,11 @@ cppi_display_dbg( cppi_interface_t *cppi_interface, FILE *f, const char *title ) } #else static inline void -cppi_display_dbg( cppi_interface_t *, FILE *, const char * ) +cppi_display_dbg( cppi_interface_t *cppi_interface, FILE *f, const char *title ) { + (void)cppi_interface; + (void)f; + (void)title; return; } #endif diff --git a/runtime/starpu/include/runtime_codelets.h b/runtime/starpu/include/runtime_codelets.h index c27d6b913bb231c4815dca09e67b7201e12697c7..72c7edc8cb46cc035e693bd6c653715d91990495 100644 --- a/runtime/starpu/include/runtime_codelets.h +++ b/runtime/starpu/include/runtime_codelets.h @@ -27,6 +27,8 @@ #include "runtime_codelet_profile.h" #if !defined(CHAMELEON_SIMULATION) +#include "coreblas.h" + #if defined(CHAMELEON_USE_CUDA) #include "gpucublas.h" #endif diff --git a/runtime/starpu/include/runtime_mpi.h b/runtime/starpu/include/runtime_mpi.h index 6d307bc6ae597ec075caf05c7dcbd382a16c4043..cd9841e10d8994c3652bd1a968df3be6841a7937 100644 --- a/runtime/starpu/include/runtime_mpi.h +++ b/runtime/starpu/include/runtime_mpi.h @@ -23,7 +23,7 @@ #if defined(CHAMELEON_USE_MPI) #if !defined(HAVE_STARPU_MPI_DATA_REGISTER) -static inline starpu_mpi_data_register( starpu_data_handle_t handle, int64_t tag, int owner ) +static inline void starpu_mpi_data_register( starpu_data_handle_t handle, int64_t tag, int owner ) { starpu_data_set_rank( handle, owner ); starpu_data_set_tag( handle, tag ); @@ -32,8 +32,11 @@ static inline starpu_mpi_data_register( starpu_data_handle_t handle, int64_t tag #else -static inline starpu_mpi_data_register( starpu_data_handle_t, int64_t, int ) +static inline void starpu_mpi_data_register( starpu_data_handle_t handle, int64_t tag, int owner ) { + (void)handle; + (void)tag; + (void)owner; } #endif