diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt index a97a9cd37b150a32cedfeed7e244334e9f659891..6dfa83891f9f2ded3dfb3252e5365d375508f3c6 100644 --- a/compute/CMakeLists.txt +++ b/compute/CMakeLists.txt @@ -27,7 +27,7 @@ # @author Alycia Lisito # @author Loris Lucido # @author Matthieu Kuhn -# @date 2023-08-22 +# @date 2024-04-03 # ### @@ -261,6 +261,16 @@ set(ZSRC zgenm2.c pzgenm2.c zprint.c + ################## + # Batch + ################## + zplghe_batch.c + # + zgemm_batch.c + zherk_batch.c + ztrsm_batch.c + # + zpotrf_batch.c ) precisions_rules_py(CHAMELEON_SRCS_GENERATED "${ZSRC}" diff --git a/compute/zgeadd.c b/compute/zgeadd.c index 7011aef88fbf25ccaad62bb09e6236b3883cae3f..441753c99be954ab961b2d0f38de57d96b12c728 100644 --- a/compute/zgeadd.c +++ b/compute/zgeadd.c @@ -11,12 +11,12 @@ * * @brief Chameleon zgeadd wrappers * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @author Florent Pruvost - * @date 2022-02-22 + * @date 2024-04-03 * @precisions normal z -> s d c * */ @@ -30,7 +30,7 @@ * CHAMELEON_zgeadd - Performs a matrix addition similarly to the pzgeadd() * function from the PBLAS library: * - * \f[ C = \alpha op( A ) + \beta B \f], + * \f[ B = \alpha op( A ) + \beta B \f], * * where op( X ) is one of * diff --git a/compute/zgemm_batch.c b/compute/zgemm_batch.c new file mode 100644 index 0000000000000000000000000000000000000000..ab1ed4062062172c2d63b517485246763ac1e821 --- /dev/null +++ b/compute/zgemm_batch.c @@ -0,0 +1,228 @@ +/** + * + * @file zgemm_batch.c + * + * @copyright 2019-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. 
+ * + *** + * + * @brief Chameleon batch zgemm wrappers + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2024-04-03 + * @precisions normal z -> s d c + * + */ +#include "control/common.h" + +#if !defined(CHAMELEON_SIMULATION) +#include "coreblas/coreblas_ztile.h" +#if defined(CHAMELEON_USE_CUDA) +#include "gpucublas/gpucublas_z.h" +#endif +#endif + +struct zgemm_batch_args_s { + cham_trans_t transA; + cham_trans_t transB; + CHAMELEON_Complex64_t alpha; + CHAMELEON_Complex64_t beta; +}; +typedef struct zgemm_batch_args_s zgemm_batch_args_t; + +#if !defined(CHAMELEON_SIMULATION) +static inline int +zgemm_batch_cpu( void *op_args, + cham_uplo_t uplo, int m, int n, int ndata, + const CHAM_desc_t *descA, CHAM_tile_t *tileA, ... ) +{ + zgemm_batch_args_t *args = (zgemm_batch_args_t*)op_args; + const CHAM_desc_t *descB; + CHAM_tile_t *tileB; + const CHAM_desc_t *descC; + CHAM_tile_t *tileC; + va_list ap; + int tempmm, tempnn, tempkk; + + if ( ndata != 3 ) { + fprintf( stderr, "zgemm_batch_cpu: requires three pieces of data and %d have been given\n", ndata ); + if ( ndata < 3 ) { + return -1; + } + } + + /* Get the B and C descs and tiles */ + va_start(ap, tileA); + descB = va_arg(ap, const CHAM_desc_t *); + tileB = va_arg(ap, CHAM_tile_t *); + descC = va_arg(ap, const CHAM_desc_t *); + tileC = va_arg(ap, CHAM_tile_t *); + va_end(ap); + + tempmm = m == descC->mt-1 ? descC->m - m * descC->mb : descC->mb; + tempnn = n == descC->nt-1 ? descC->n - n * descC->nb : descC->nb; + if ( args->transA == ChamNoTrans ) { + tempkk = n == descA->nt-1 ? descA->n - n * descA->nb : descA->nb; + } + else { + tempkk = m == descA->mt-1 ? 
descA->m - m * descA->mb : descA->mb; + } + + TCORE_zgemm( + args->transA, args->transB, tempmm, tempnn, tempkk, + args->alpha, tileA, tileB, args->beta, tileC ); + + (void)descB; + (void)uplo; + + return 0; +} +#else +#define zgemm_batch_cpu NULL +#endif + +#if !defined(CHAMELEON_SIMULATION) && defined(CHAMELEON_USE_CUDA) +static inline int +zgemm_batch_cuda( cublasHandle_t handle, void *op_args, + cham_uplo_t uplo, int m, int n, int ndata, + const CHAM_desc_t *descA, CHAM_tile_t *tileA, ... ) +{ + zgemm_batch_args_t *args = (zgemm_batch_args_t*)op_args; + const CHAM_desc_t *descB; + CHAM_tile_t *tileB; + const CHAM_desc_t *descC; + CHAM_tile_t *tileC; + va_list ap; + int tempmm, tempnn, tempkk; + + if ( ndata != 3 ) { + fprintf( stderr, "zgemm_batch_cuda: requires three pieces of data and %d have been given\n", ndata ); + if ( ndata < 3 ) { + return -1; + } + } + + /* Get the B and C descs and tiles */ + va_start(ap, tileA); + descB = va_arg(ap, const CHAM_desc_t *); + tileB = va_arg(ap, CHAM_tile_t *); + descC = va_arg(ap, const CHAM_desc_t *); + tileC = va_arg(ap, CHAM_tile_t *); + va_end(ap); + + tempmm = m == descC->mt-1 ? descC->m - m * descC->mb : descC->mb; + tempnn = n == descC->nt-1 ? descC->n - n * descC->nb : descC->nb; + if ( args->transA == ChamNoTrans ) { + tempkk = n == descA->nt-1 ? descA->n - n * descA->nb : descA->nb; + } + else { + tempkk = m == descA->mt-1 ? 
descA->m - m * descA->mb : descA->mb; + } + + CUDA_zgemm( args->transA, args->transB, tempmm, tempnn, tempkk, + (cuDoubleComplex*)&(args->alpha), + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + (cuDoubleComplex*)&(args->beta), + tileC->mat, tileC->ld, + handle ); + + (void)descB; + (void)uplo; + + return 0; +} +#else +#define zgemm_batch_cuda NULL +#endif + +static cham_map_operator_t zgemm_batch_map = { + .name = "zgemm", + .cpufunc = zgemm_batch_cpu, + .cudafunc = zgemm_batch_cuda, + .hipfunc = NULL, +}; + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile + * + * CHAMELEON_zgemm_batch_Tile - Performs multiple matrix multiplication in parallel. + * + ******************************************************************************* + * + * @param[in] transA + * Specifies whether the tiles from A are transposed, not transposed or conjugate transposed: + * = ChamNoTrans: tiles from A are not transposed; + * = ChamTrans: tiles from A are transposed; + * = ChamConjTrans: tiles from A are conjugate transposed. + * + * @param[in] transB + * Specifies whether the tiles from B are transposed, not transposed or conjugate transposed: + * = ChamNoTrans: tiles from B are not transposed; + * = ChamTrans: tiles from B are transposed; + * = ChamConjTrans: tiles from B are conjugate transposed. 
+ * + * @param[in] alpha + * alpha specifies the scalar alpha + * + * @param[in] A + * A is a collection of mt-by-nt tiles of size A->mb by A->nb + * + * @param[in] B + * B is a collection of mt-by-nt tiles of size B->mb by B->nb + * + * @param[in] beta + * beta specifies the scalar beta + * + * @param[in,out] C + * C is a collection of mt-by-nt tiles of size C->mb by C->nb + * On exit, each tile C[i,j] is overwritten by the matrix: + * \f[ \alpha op( A[i,j] ) \times op( B[i,j] ) + \beta C[i,j] \f] + * + ******************************************************************************* + * + * @return CHAMELEON_SUCCESS on successful exit + * @return CHAMELEON_ERR_... on error + * + */ +int CHAMELEON_zgemm_batch_Tile( cham_trans_t transA, cham_trans_t transB, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C ) +{ + CHAM_context_t *chamctxt; + RUNTIME_sequence_t *sequence = NULL; + RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + cham_map_data_t data[3]; + zgemm_batch_args_t params = { transA, transB, alpha, beta }; + int status; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zgemm_batch_Tile", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + chameleon_sequence_create( chamctxt, &sequence ); + + data[0].access = ChamR; + data[0].desc = A; + data[1].access = ChamR; + data[1].desc = B; + data[2].access = ( beta == 0. ) ? 
ChamW : ChamRW; + data[2].desc = C; + + chameleon_pmap( ChamUpperLower, 3, data, &zgemm_batch_map, &params, sequence, &request ); + + CHAMELEON_Desc_Flush( A, sequence ); + CHAMELEON_Desc_Flush( B, sequence ); + CHAMELEON_Desc_Flush( C, sequence ); + + chameleon_sequence_wait( chamctxt, sequence ); + status = sequence->status; + chameleon_sequence_destroy( chamctxt, sequence ); + + return status; +} diff --git a/compute/zherk_batch.c b/compute/zherk_batch.c new file mode 100644 index 0000000000000000000000000000000000000000..bff967e9f0afe0b4d2242ef8184a0574f3f96501 --- /dev/null +++ b/compute/zherk_batch.c @@ -0,0 +1,208 @@ +/** + * + * @file zherk_batch.c + * + * @copyright 2019-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon batch zherk wrappers + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2024-04-03 + * @precisions normal z -> s d c + * + */ +#include "control/common.h" + +#if !defined(CHAMELEON_SIMULATION) +#include "coreblas/coreblas_ztile.h" +#if defined(CHAMELEON_USE_CUDA) +#include "gpucublas/gpucublas_z.h" +#endif +#endif + +struct zherk_batch_args_s { + cham_uplo_t uplo; + cham_trans_t trans; + double alpha; + double beta; +}; +typedef struct zherk_batch_args_s zherk_batch_args_t; + +#if !defined(CHAMELEON_SIMULATION) +static inline int +zherk_batch_cpu( void *op_args, + cham_uplo_t uplo, int m, int n, int ndata, + const CHAM_desc_t *descA, CHAM_tile_t *tileA, ... ) +{ + zherk_batch_args_t *args = (zherk_batch_args_t*)op_args; + const CHAM_desc_t *descC; + CHAM_tile_t *tileC; + va_list ap; + int tempnn, tempkk; + + if ( ndata != 2 ) { + fprintf( stderr, "zherk_batch_cpu: requires two pieces of data and %d have been given\n", ndata ); + if ( ndata < 2 ) { + return -1; + } + } + + /* Get the second desc */ + va_start(ap, tileA); + descC = va_arg(ap, const CHAM_desc_t *); + tileC = va_arg(ap, CHAM_tile_t *); + va_end(ap); + + tempnn = n == descC->nt-1 ? 
descC->n - n * descC->nb : descC->nb; + if ( args->trans == ChamNoTrans ) { + tempkk = n == descA->nt-1 ? descA->n - n * descA->nb : descA->nb; + } + else { + tempkk = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb; + } + + TCORE_zherk( + args->uplo, args->trans, tempnn, tempkk, + args->alpha, tileA, args->beta, tileC ); + + (void)uplo; + + return 0; +} +#else +#define zherk_batch_cpu NULL +#endif + +#if !defined(CHAMELEON_SIMULATION) && defined(CHAMELEON_USE_CUDA) +static inline int +zherk_batch_cuda( cublasHandle_t handle, void *op_args, + cham_uplo_t uplo, int m, int n, int ndata, + const CHAM_desc_t *descA, CHAM_tile_t *tileA, ... ) +{ + zherk_batch_args_t *args = (zherk_batch_args_t*)op_args; + const CHAM_desc_t *descC; + CHAM_tile_t *tileC; + va_list ap; + int tempnn, tempkk; + + if ( ndata != 2 ) { + fprintf( stderr, "zherk_batch_cuda: requires two pieces of data and %d have been given\n", ndata ); + if ( ndata < 2 ) { + return -1; + } + } + + /* Get the second desc */ + va_start(ap, tileA); + descC = va_arg(ap, const CHAM_desc_t *); + tileC = va_arg(ap, CHAM_tile_t *); + va_end(ap); + + tempnn = n == descC->nt-1 ? descC->n - n * descC->nb : descC->nb; + if ( args->trans == ChamNoTrans ) { + tempkk = n == descA->nt-1 ? descA->n - n * descA->nb : descA->nb; + } + else { + tempkk = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb; + } + + CUDA_zherk( args->uplo, args->trans, tempnn, tempkk, + &(args->alpha), tileA->mat, tileA->ld, + &(args->beta), tileC->mat, tileC->ld, + handle ); + + (void)uplo; + + return 0; +} +#else +#define zherk_batch_cuda NULL +#endif + +static cham_map_operator_t zherk_batch_map = { + .name = "zherk", + .cpufunc = zherk_batch_cpu, + .cudafunc = zherk_batch_cuda, + .hipfunc = NULL, +}; + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile + * + * CHAMELEON_zherk_batch_Tile - Performs multiple rank-k update in parallel. 
+ * + ******************************************************************************* + * + * @param[in] uplo + * = ChamUpper: Upper triangle of C tiles are stored; + * = ChamLower: Lower triangle of C tiles are stored. + * + * @param[in] trans + * Specifies whether the tiles of A are transposed or conjugate transposed: + * = ChamNoTrans: tiles of A are not transposed; + * = ChamConjTrans: tiles of A are conjugate transposed. + * + * @param[in] alpha + * alpha specifies the scalar alpha + * + * @param[in] A + * A is a collection of mt-by-nt tiles of size A->mb by A->nb + * + * @param[in] beta + * beta specifies the scalar beta + * + * @param[in,out] C + * C is a collection of mt-by-nt tiles of size C->mb by C->nb + * On exit, each tile C[i,j] is overwritten by + * \f[ C = \alpha [ op( A ) \times conjg( op( A )' )] + \beta C \f], + * + * where op( X ) is one of + * + * op( X ) = X or op( X ) = conjg( X' ) + * + ******************************************************************************* + * + * @return CHAMELEON_SUCCESS on successful exit + * @return CHAMELEON_ERR_... on error + * + */ +int CHAMELEON_zherk_batch_Tile( cham_uplo_t uplo, cham_trans_t trans, + double alpha, CHAM_desc_t *A, + double beta, CHAM_desc_t *C ) +{ + CHAM_context_t *chamctxt; + RUNTIME_sequence_t *sequence = NULL; + RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + cham_map_data_t data[2]; + zherk_batch_args_t params = { uplo, trans, alpha, beta }; + int status; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zherk_batch_Tile", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + chameleon_sequence_create( chamctxt, &sequence ); + + data[0].access = ChamR; + data[0].desc = A; + data[1].access = ( beta == 0. ) ? 
ChamW : ChamRW; + data[1].desc = C; + + chameleon_pmap( ChamUpperLower, 2, data, &zherk_batch_map, &params, sequence, &request ); + + CHAMELEON_Desc_Flush( A, sequence ); + CHAMELEON_Desc_Flush( C, sequence ); + + chameleon_sequence_wait( chamctxt, sequence ); + status = sequence->status; + chameleon_sequence_destroy( chamctxt, sequence ); + + return status; +} diff --git a/compute/zplghe_batch.c b/compute/zplghe_batch.c new file mode 100644 index 0000000000000000000000000000000000000000..92a0a071ee77dbe136e91241a0226fe0e4d21f2f --- /dev/null +++ b/compute/zplghe_batch.c @@ -0,0 +1,133 @@ +/** + * + * @file zplghe_batch.c + * + * @copyright 2019-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon batch zplghe wrappers + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2024-04-03 + * @precisions normal z -> s d c + * + */ +#include "control/common.h" + +#if !defined(CHAMELEON_SIMULATION) +#include "coreblas/coreblas_ztile.h" +#if defined(CHAMELEON_USE_CUDA) +#include "gpucublas/gpucublas_z.h" +#endif +#endif + +struct zplghe_batch_args_s { + double bump; + unsigned long long int seed; +}; +typedef struct zplghe_batch_args_s zplghe_batch_args_t; + +#if !defined(CHAMELEON_SIMULATION) +static inline int +zplghe_batch_cpu( void *op_args, + cham_uplo_t uplo, int m, int n, int ndata, + const CHAM_desc_t *descA, CHAM_tile_t *tileA, ... ) +{ + zplghe_batch_args_t *args = (zplghe_batch_args_t*)op_args; + int tempnn, m0; + + if ( ndata != 1 ) { + fprintf( stderr, "zplghe_batch_cpu: requires one piece of data and %d have been given\n", ndata ); + } + + tempnn = n == descA->nt-1 ? 
descA->n - n * descA->nb : descA->nb; + + /** + * Let's do the math to give the right bigM: + * jump for the first value is defined as j = m0 + n0 * bigM + * We need to set (m*n) matrices of size A->mb*A->nb, and we want j, m0, n0 to be defined as: + * j = m0 = n0 = (n * A->mt + m) * (A->mb * A->nb) + * Thus: + * bigM = 0; + */ + m0 = ( n * descA->mt + m ) * (descA->mb * descA->nb ); + TCORE_zplghe( args->bump, tempnn, tempnn, tileA, + 0, m0, m0, args->seed ); + + (void)uplo; + + return 0; +} +#else +#define zplghe_batch_cpu NULL +#endif + +static cham_map_operator_t zplghe_batch_map = { + .name = "zplghe", + .cpufunc = zplghe_batch_cpu, + .cudafunc = NULL, + .hipfunc = NULL, +}; + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile + * + * CHAMELEON_zplghe_batch_Tile - Performs multiple hermitian matrix generation + * in parallel. + * + ******************************************************************************* + * + * @param[in] bump + * The value to add to the diagonal of each tile to be sure + * they are positive definite matrices. + * + * @param[in] seed + * The seed used in the random generation. + * + * @param[in,out] A + * A is a collection of mt-by-nt tiles of size A->mb by A->nb + * On exit, each tile is initialized as an hermitian matrix. + * + ******************************************************************************* + * + * @return CHAMELEON_SUCCESS on successful exit + * @return CHAMELEON_ERR_... 
on error + * + */ +int CHAMELEON_zplghe_batch_Tile( double bump, CHAM_desc_t *A, + unsigned long long int seed ) +{ + CHAM_context_t *chamctxt; + RUNTIME_sequence_t *sequence = NULL; + RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + cham_map_data_t data[1]; + zplghe_batch_args_t params = { bump, seed }; + int status; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zplghe_batch_Tile", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + chameleon_sequence_create( chamctxt, &sequence ); + + data[0].access = ChamW; + data[0].desc = A; + + chameleon_pmap( ChamUpperLower, 1, data, + &zplghe_batch_map, &params, + sequence, &request ); + + CHAMELEON_Desc_Flush( A, sequence ); + + chameleon_sequence_wait( chamctxt, sequence ); + status = sequence->status; + chameleon_sequence_destroy( chamctxt, sequence ); + + return status; +} diff --git a/compute/zpotrf_batch.c b/compute/zpotrf_batch.c new file mode 100644 index 0000000000000000000000000000000000000000..983a9d92fa3ba72645b47d0dbe78dd1346ad59f2 --- /dev/null +++ b/compute/zpotrf_batch.c @@ -0,0 +1,117 @@ +/** + * + * @file zpotrf_batch.c + * + * @copyright 2019-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon batch zpotrf wrappers + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2024-04-03 + * @precisions normal z -> s d c + * + */ +#include "control/common.h" + +#if !defined(CHAMELEON_SIMULATION) +#include "coreblas/coreblas_ztile.h" +#if defined(CHAMELEON_USE_CUDA) +#include "gpucublas/gpucublas_z.h" +#endif +#endif + +#if !defined(CHAMELEON_SIMULATION) +static inline int +zpotrf_batch_cpu( void *op_args, + cham_uplo_t uplo, int m, int n, int ndata, + const CHAM_desc_t *descA, CHAM_tile_t *tileA, ... 
) +{ + cham_uplo_t luplo = (intptr_t)op_args; + int info = 0; + int tempnn; + + if ( ndata != 1 ) { + fprintf( stderr, "zpotrf_batch_cpu: requires one piece of data and %d have been given\n", ndata ); + } + + tempnn = n == descA->nt-1 ? descA->n - n * descA->nb : descA->nb; + + TCORE_zpotrf( + luplo, tempnn, tileA, &info ); + + if ( info != 0 ) { + fprintf( stderr, "zpotrf_batch_cpu: Failed to correctly factorize the tile (info = %d)\n", info ); + } + + (void)uplo; + + return 0; +} +#else +#define zpotrf_batch_cpu NULL +#endif + +static cham_map_operator_t zpotrf_batch_map = { + .name = "zpotrf", + .cpufunc = zpotrf_batch_cpu, + .cudafunc = NULL, + .hipfunc = NULL, +}; + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile + * + * CHAMELEON_zpotrf_batch_Tile - Performs multiple Cholesky factorization in parallel. + * + ******************************************************************************* + * + * @param[in] uplo + * = ChamUpper: Upper triangle of A tiles are stored; + * = ChamLower: Lower triangle of A tiles are stored. + * + * @param[in,out] A + * A is a collection of mt-by-nt tiles of size A->mb by A->nb + * On exit, each tile is factorized with Cholesky factorization. + * + ******************************************************************************* + * + * @return CHAMELEON_SUCCESS on successful exit + * @return CHAMELEON_ERR_... 
on error + * + */ +int CHAMELEON_zpotrf_batch_Tile( cham_uplo_t uplo, CHAM_desc_t *A ) +{ + CHAM_context_t *chamctxt; + RUNTIME_sequence_t *sequence = NULL; + RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + cham_map_data_t data[1]; + int status; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zpotrf_batch_Tile", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + chameleon_sequence_create( chamctxt, &sequence ); + + data[0].access = ChamRW; + data[0].desc = A; + + chameleon_pmap( ChamUpperLower, 1, data, + &zpotrf_batch_map, (void*)((intptr_t)uplo), + sequence, &request ); + + CHAMELEON_Desc_Flush( A, sequence ); + + chameleon_sequence_wait( chamctxt, sequence ); + status = sequence->status; + chameleon_sequence_destroy( chamctxt, sequence ); + + return status; +} diff --git a/compute/ztrsm_batch.c b/compute/ztrsm_batch.c new file mode 100644 index 0000000000000000000000000000000000000000..9d426737b3bc3678892e452173ae05cfe5a26298 --- /dev/null +++ b/compute/ztrsm_batch.c @@ -0,0 +1,212 @@ +/** + * + * @file ztrsm_batch.c + * + * @copyright 2019-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon batch ztrsm wrappers + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2024-04-03 + * @precisions normal z -> s d c + * + */ +#include "control/common.h" + +#if !defined(CHAMELEON_SIMULATION) +#include "coreblas/coreblas_ztile.h" +#if defined(CHAMELEON_USE_CUDA) +#include "gpucublas/gpucublas_z.h" +#endif +#endif + +struct ztrsm_batch_args_s { + cham_side_t side; + cham_uplo_t uplo; + cham_trans_t transA; + cham_diag_t diag; + CHAMELEON_Complex64_t alpha; +}; +typedef struct ztrsm_batch_args_s ztrsm_batch_args_t; + +#if !defined(CHAMELEON_SIMULATION) +static inline int +ztrsm_batch_cpu( void *op_args, + cham_uplo_t uplo, int m, int n, int ndata, + const CHAM_desc_t *descA, CHAM_tile_t *tileA, ... 
) +{ + ztrsm_batch_args_t *args = (ztrsm_batch_args_t*)op_args; + const CHAM_desc_t *descB; + CHAM_tile_t *tileB; + va_list ap; + int tempmm, tempnn; + + if ( ndata != 2 ) { + fprintf( stderr, "ztrsm_batch_cpu: requires two pieces of data and %d have been given\n", ndata ); + if ( ndata < 2 ) { + return -1; + } + } + + /* Get the second desc */ + va_start(ap, tileA); + descB = va_arg(ap, const CHAM_desc_t *); + tileB = va_arg(ap, CHAM_tile_t *); + va_end(ap); + + tempmm = m == descB->mt-1 ? descB->m - m * descB->mb : descB->mb; + tempnn = n == descB->nt-1 ? descB->n - n * descB->nb : descB->nb; + + TCORE_ztrsm( + args->side, args->uplo, args->transA, args->diag, + tempmm, tempnn, args->alpha, tileA, tileB ); + + (void)descA; + (void)descB; + (void)uplo; + + return 0; +} +#else +#define ztrsm_batch_cpu NULL +#endif + +#if !defined(CHAMELEON_SIMULATION) && defined(CHAMELEON_USE_CUDA) +static inline int +ztrsm_batch_cuda( cublasHandle_t handle, void *op_args, + cham_uplo_t uplo, int m, int n, int ndata, + const CHAM_desc_t *descA, CHAM_tile_t *tileA, ... ) +{ + ztrsm_batch_args_t *args = (ztrsm_batch_args_t*)op_args; + const CHAM_desc_t *descB; + CHAM_tile_t *tileB; + va_list ap; + int tempmm, tempnn; + + if ( ndata != 2 ) { + fprintf( stderr, "ztrsm_batch_cuda: requires two pieces of data and %d have been given\n", ndata ); + if ( ndata < 2 ) { + return -1; + } + } + + /* Get the second desc */ + va_start(ap, tileA); + descB = va_arg(ap, const CHAM_desc_t *); + tileB = va_arg(ap, CHAM_tile_t *); + va_end(ap); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + + tempmm = m == descB->mt-1 ? descB->m - m * descB->mb : descB->mb; + tempnn = n == descB->nt-1 ? 
descB->n - n * descB->nb : descB->nb; + + CUDA_ztrsm( + args->side, args->uplo, args->transA, args->diag, + tempmm, tempnn, (cuDoubleComplex*)&(args->alpha), + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + handle ); + + (void)descA; + (void)descB; + (void)uplo; + + return 0; +} +#else +#define ztrsm_batch_cuda NULL +#endif + +static cham_map_operator_t ztrsm_batch_map = { + .name = "ztrsm", + .cpufunc = ztrsm_batch_cpu, + .cudafunc = ztrsm_batch_cuda, + .hipfunc = NULL, +}; + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile + * + * CHAMELEON_ztrsm_batch_Tile - Performs multiple triangular solves in parallel. + * + ******************************************************************************* + * + * @param[in] side + * Specifies whether tiles of A appear on the left or on the right of tiles of X: + * = ChamLeft: A[i,j] * X[i,j] = B[i,j] + * = ChamRight: X[i,j] * A[i,j] = B[i,j] + * + * @param[in] uplo + * Specifies whether the matrix A is upper triangular or lower triangular: + * = ChamUpper: Upper triangle of tiles of A are stored; + * = ChamLower: Lower triangle of tiles of A are stored. + * + * @param[in] trans + * Specifies whether the matrix A is transposed, not transposed or conjugate transposed: + * = ChamNoTrans: tiles of A are not transposed; + * = ChamTrans: tiles of A are transposed; + * = ChamConjTrans: tiles of A are conjugate transposed. + * + * @param[in] diag + * Specifies whether or not A is unit triangular: + * = ChamNonUnit: tiles of A are non unit; + * = ChamUnit: tiles of A are unit. 
+ * + * @param[in] alpha + * alpha specifies the scalar alpha + * + * @param[in] A + * A is a collection of mt-by-nt tiles of size A->mb by A->nb + * + * @param[in,out] B + * B is a collection of mt-by-nt tiles of size B->mb by B->nb + * On exit, each tile B[i,j] is overwritten by X[i,j] + * + ******************************************************************************* + * + * @return CHAMELEON_SUCCESS on successful exit + * @return CHAMELEON_ERR_... on error + * + */ +int CHAMELEON_ztrsm_batch_Tile( cham_side_t side, cham_uplo_t uplo, + cham_trans_t trans, cham_diag_t diag, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B ) +{ + CHAM_context_t *chamctxt; + RUNTIME_sequence_t *sequence = NULL; + RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + cham_map_data_t data[2]; + ztrsm_batch_args_t params = { side, uplo, trans, diag, alpha }; + int status; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_ztrsm_batch_Tile", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + chameleon_sequence_create( chamctxt, &sequence ); + + data[0].access = ChamR; + data[0].desc = A; + data[1].access = ChamRW; + data[1].desc = B; + + chameleon_pmap( ChamUpperLower, 2, data, &ztrsm_batch_map, &params, sequence, &request ); + + CHAMELEON_Desc_Flush( A, sequence ); + CHAMELEON_Desc_Flush( B, sequence ); + + chameleon_sequence_wait( chamctxt, sequence ); + status = sequence->status; + chameleon_sequence_destroy( chamctxt, sequence ); + + return status; +} diff --git a/include/chameleon/chameleon_z.h b/include/chameleon/chameleon_z.h index 166feeef619018307a7f0873d0dc3ccf1b4983b4..5d667cca39e1fe42eb61d29257ac45e38e2f3075 100644 --- a/include/chameleon/chameleon_z.h +++ b/include/chameleon/chameleon_z.h @@ -23,7 +23,7 @@ * @author Florent Pruvost * @author Alycia Lisito * @author Matthieu Kuhn - * @date 2024-03-14 + * @date 2024-04-03 * @precisions normal z -> c d s * */ @@ -394,6 +394,15 @@ int 
CHAMELEON_zgram_Tile_Async( cham_uplo_t uplo, CHAM_desc_t *A, void *user_ws, int CHAMELEON_zprint( FILE *file, const char *header, cham_uplo_t uplo, int M, int N, CHAMELEON_Complex64_t *A, int LDA ); int CHAMELEON_zprint_Tile( FILE *file, const char *header, cham_uplo_t uplo, CHAM_desc_t *descA ); +/** + * Batch function prototypes - Tile interface + */ +int CHAMELEON_zgemm_batch_Tile( cham_trans_t transA, cham_trans_t transB, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C ); +int CHAMELEON_zherk_batch_Tile( cham_uplo_t uplo, cham_trans_t trans, double alpha, CHAM_desc_t *A, double beta, CHAM_desc_t *C ); +int CHAMELEON_zplghe_batch_Tile( double bump, CHAM_desc_t *A, unsigned long long int seed ); +int CHAMELEON_zpotrf_batch_Tile( cham_uplo_t uplo, CHAM_desc_t *A ); +int CHAMELEON_ztrsm_batch_Tile( cham_side_t side, cham_uplo_t uplo, cham_trans_t trans, cham_diag_t diag, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B ); + END_C_DECLS #endif /* _chameleon_z_h_ */ diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt index c9fc1b73189bbfe15f3f301911a533c360ad55ec..aed35c62f6dec5ca0b753784bb7432ff3feda450 100644 --- a/testing/CMakeLists.txt +++ b/testing/CMakeLists.txt @@ -17,7 +17,7 @@ # Univ. of California Berkeley, # Univ. of Colorado Denver. 
#
-# @version 1.2.0
+# @version 1.3.0
 # @author Cedric Castagnede
 # @author Emmanuel Agullo
 # @author Mathieu Faverge
@@ -25,7 +25,7 @@
 # @author Florent Pruvost
 # @author Alycia Lisito
 # @author Matthieu Kuhn
-# @date 2022-02-22
+# @date 2024-04-03
 #
 ###
 
@@ -105,7 +105,14 @@ set(ZSRC_WO_STDAPI
   testing_zplrnk.c
   testing_zcesca.c
   testing_zgram.c
-)
+  #
+  # Batch kernels
+  #
+  testing_zgemm_batch.c
+  testing_zherk_batch.c
+  testing_zpotrf_batch.c
+  testing_ztrsm_batch.c
+  )
 
 foreach(_precision ${CHAMELEON_PRECISION} )
 
diff --git a/testing/testing_zgemm_batch.c b/testing/testing_zgemm_batch.c
new file mode 100644
index 0000000000000000000000000000000000000000..ed335643adebc62f34e09494438b0dd2c71fd7aa
--- /dev/null
+++ b/testing/testing_zgemm_batch.c
@@ -0,0 +1,138 @@
+/**
+ *
+ * @file testing_zgemm_batch.c
+ *
+ * @copyright 2019-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zgemm_batch testing
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @date 2024-04-03
+ * @precisions normal z -> c d s
+ *
+ */
+#include <chameleon.h>
+#include <chameleon_lapack.h>
+#include "testings.h"
+#include "testing_zcheck.h"
+#include <chameleon/flops.h>
+#if defined(CHAMELEON_TESTINGS_VENDOR) || !defined(CHAMELEON_SIMULATION)
+#include <coreblas.h>
+#endif
+
+static cham_fixdbl_t
+flops_zgemm_batch( int nb, int M, int N, int K )
+{
+    return flops_zgemm( M, N, K ) * nb; /* nb: number of GEMMs in the batch (the caller passes nb*ib) */
+}
+
+/* Runs and times one CHAMELEON_zgemm_batch_Tile call on randomly filled A, B, C; returns its status. */
+int
+testing_zgemm_batch_desc( run_arg_list_t *args, int check )
+{
+    testdata_t test_data = { .args = args };
+    int hres = 0;
+
+    /* Read arguments (alpha and beta start from random values and may be overridden below) */
+    int async = parameters_getvalue_int( "async" );
+    int nb = run_arg_get_int( args, "nb", 10 );
+    int ib = run_arg_get_int( args, "ib", 10 );
+    cham_trans_t transA = run_arg_get_trans( args, "transA", ChamNoTrans );
+    cham_trans_t transB = run_arg_get_trans( args, "transB", ChamNoTrans );
+    int N = run_arg_get_int( args, "N", 320 );
+    int M = run_arg_get_int( args, "M", N );
+    int K = run_arg_get_int( args, "K", N );
+    CHAMELEON_Complex64_t alpha = testing_zalea();
+    CHAMELEON_Complex64_t beta = testing_zalea();
+    int seedA = run_arg_get_int( args, "seedA", random() );
+    int seedB = run_arg_get_int( args, "seedB", random() );
+    int seedC = run_arg_get_int( args, "seedC", random() );
+
+    /* Descriptors and the per-problem shapes of A and B */
+    int Am, An, Bm, Bn;
+    CHAM_desc_t *descA, *descB, *descC;
+
+    alpha = run_arg_get_complex64( args, "alpha", alpha );
+    beta = run_arg_get_complex64( args, "beta", beta );
+
+    CHAMELEON_Set( CHAMELEON_TILE_SIZE, N ); /* NOTE(review): set from N even when M or K differ - confirm intent */
+
+    /* Calculate the dimensions according to the transposition */
+    if ( transA == ChamNoTrans ) {
+        Am = M;
+        An = K;
+    }
+    else {
+        Am = K;
+        An = M;
+    }
+    if ( transB == ChamNoTrans ) {
+        Bm = K;
+        Bn = N;
+    }
+    else {
+        Bm = N;
+        Bn = K;
+    }
+
+    /* Create the matrices: the per-problem shape is scaled by nb in the first dimension and ib in the second */
+    parameters_desc_create( "A", &descA, ChamComplexDouble, Am, An, nb * Am, ib * An, nb * Am, ib * An );
+    parameters_desc_create( "B", &descB, ChamComplexDouble, Bm, Bn, nb * Bm, ib * Bn, nb * Bm, ib * Bn );
+    parameters_desc_create( "C", &descC, ChamComplexDouble, M, N, nb * M, ib * N, nb * M, ib * N );
+
+    /* Fill the matrices with random values */
+    CHAMELEON_zplrnt_Tile( descA, seedA );
+    CHAMELEON_zplrnt_Tile( descB, seedB );
+    CHAMELEON_zplrnt_Tile( descC, seedC );
+
+    /* Start measurement */
+    testing_start( &test_data );
+    if ( async ) { /* async only triggers a notice; the synchronous call below always runs */
+        fprintf( stderr, "Async unavailable yet\n" );
+    }
+    hres = CHAMELEON_zgemm_batch_Tile( transA, transB, alpha, descA, descB, beta, descC );
+
+    /* Stop measurement; the flop count covers the nb*ib products */
+    test_data.hres = hres;
+    testing_stop( &test_data, flops_zgemm_batch( nb*ib, M, N, K ) );
+
+    /* No numerical check is implemented: only report it */
+    if ( check ) {
+        fprintf( stderr, "Check is not available for zgemm_batch\n" );
+    }
+
+    parameters_desc_destroy( &descA );
+    parameters_desc_destroy( &descB );
+    parameters_desc_destroy( &descC );
+
+    return hres;
+}
+
+testing_t test_zgemm_batch;
+const char *zgemm_batch_params[] = { "nb", "ib", "transA", "transB", "m", "n", "k",
+                                     "alpha", "beta", "seedA", "seedB", "seedC", NULL };
+const char *zgemm_batch_output[] = { NULL };
+const char *zgemm_batch_outchk[] = { "RETURN", NULL };
+
+/**
+ * @brief Registers the zgemm_batch test; invoked automatically through the constructor attribute
+ */
+void testing_zgemm_batch_init( void ) __attribute__( ( constructor ) );
+void
+testing_zgemm_batch_init( void )
+{
+    test_zgemm_batch.name = "zgemm_batch";
+    test_zgemm_batch.helper = "Perform nb*ib general matrix-matrix multiply of size MxNxK";
+    test_zgemm_batch.params = zgemm_batch_params;
+    test_zgemm_batch.output = zgemm_batch_output;
+    test_zgemm_batch.outchk = zgemm_batch_outchk;
+    test_zgemm_batch.fptr_desc = testing_zgemm_batch_desc;
+    test_zgemm_batch.fptr_std = NULL; /* no standard-API variant is provided */
+    test_zgemm_batch.next = NULL;
+
+    testing_register( &test_zgemm_batch );
+}
diff --git a/testing/testing_zherk_batch.c b/testing/testing_zherk_batch.c
new file mode 100644
index 0000000000000000000000000000000000000000..47a6bc86a8a317649116046f4f47dbe938a17bfe
--- /dev/null
+++ b/testing/testing_zherk_batch.c
@@ -0,0 +1,127 @@
+/**
+ *
+ * @file testing_zherk_batch.c
+ *
+ * @copyright 2019-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zherk_batch testing
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @date 2024-04-03
+ * @precisions normal z -> c d s
+ *
+ */
+#include <chameleon.h>
+#include <chameleon_lapack.h>
+#include "testings.h"
+#include "testing_zcheck.h"
+#include <chameleon/flops.h>
+#if defined(CHAMELEON_TESTINGS_VENDOR) || !defined(CHAMELEON_SIMULATION)
+#include <coreblas.h>
+#endif
+
+static cham_fixdbl_t
+flops_zherk_batch( int nb, int K, int N )
+{
+    return flops_zherk( K, N ) * nb; /* nb: number of updates in the batch (the caller passes nb*ib) */
+}
+
+/* Runs and times one CHAMELEON_zherk_batch_Tile call (A random, C from zplghe_batch); returns its status. */
+int
+testing_zherk_batch_desc( run_arg_list_t *args, int check )
+{
+    testdata_t test_data = { .args = args };
+    int hres = 0;
+
+    /* Read arguments (alpha and beta start from random values and may be overridden below) */
+    int async = parameters_getvalue_int( "async" );
+    int nb = run_arg_get_int( args, "nb", 10 );
+    int ib = run_arg_get_int( args, "ib", 10 );
+    cham_trans_t trans = run_arg_get_trans( args, "trans", ChamNoTrans );
+    cham_uplo_t uplo = run_arg_get_uplo( args, "uplo", ChamUpper );
+    int N = run_arg_get_int( args, "N", 320 );
+    int K = run_arg_get_int( args, "K", N );
+    double alpha = testing_dalea();
+    double beta = testing_dalea();
+    double bump = 0.;
+    int seedA = run_arg_get_int( args, "seedA", random() );
+    int seedC = run_arg_get_int( args, "seedC", random() );
+
+    /* Descriptors and the per-problem shape of A */
+    int Am, An;
+    CHAM_desc_t *descA, *descC;
+
+    alpha = run_arg_get_double( args, "alpha", alpha );
+    beta = run_arg_get_double( args, "beta", beta );
+    bump = run_arg_get_double( args, "bump", 0. );
+
+    CHAMELEON_Set( CHAMELEON_TILE_SIZE, N );
+
+    /* Calculate the dimensions according to the transposition */
+    if ( trans == ChamNoTrans ) {
+        Am = N;
+        An = K;
+    }
+    else {
+        Am = K;
+        An = N;
+    }
+
+    /* Create the matrices: the per-problem shape is scaled by nb in the first dimension and ib in the second */
+    parameters_desc_create( "A", &descA, ChamComplexDouble, Am, An, nb * Am, ib * An, nb * Am, ib * An );
+    parameters_desc_create( "C", &descC, ChamComplexDouble, N, N, nb * N, ib * N, nb * N, ib * N );
+
+    /* Fill the matrices with random values */
+    CHAMELEON_zplrnt_Tile( descA, seedA );
+    CHAMELEON_zplghe_batch_Tile( bump, descC, seedC );
+
+    /* Start measurement */
+    testing_start( &test_data );
+    if ( async ) { /* async only triggers a notice; the synchronous call below always runs */
+        fprintf( stderr, "Async unavailable yet\n" );
+    }
+    hres = CHAMELEON_zherk_batch_Tile( uplo, trans, alpha, descA, beta, descC );
+
+    /* Stop measurement; the flop count covers the nb*ib updates */
+    test_data.hres = hres;
+    testing_stop( &test_data, flops_zherk_batch( nb*ib, K, N ) );
+
+    /* No numerical check is implemented: only report it */
+    if ( check ) {
+        fprintf( stderr, "Check is not available for zherk_batch\n" );
+    }
+
+    parameters_desc_destroy( &descA );
+    parameters_desc_destroy( &descC );
+
+    return hres;
+}
+
+testing_t test_zherk_batch;
+const char *zherk_batch_params[] = { "nb", "ib", "trans", "uplo", "n", "k",
+                                     "alpha", "beta", "seedA", "seedC", "bump", NULL };
+const char *zherk_batch_output[] = { NULL };
+const char *zherk_batch_outchk[] = { "RETURN", NULL };
+
+/**
+ * @brief Registers the zherk_batch test; invoked automatically through the constructor attribute
+ */
+void testing_zherk_batch_init( void ) __attribute__( ( constructor ) );
+void
+testing_zherk_batch_init( void )
+{
+    test_zherk_batch.name = "zherk_batch";
+    test_zherk_batch.helper = "Perform nb*ib rank-k updates zherk( uplo, trans, N, K, ... )";
+    test_zherk_batch.params = zherk_batch_params;
+    test_zherk_batch.output = zherk_batch_output;
+    test_zherk_batch.outchk = zherk_batch_outchk;
+    test_zherk_batch.fptr_desc = testing_zherk_batch_desc;
+    test_zherk_batch.fptr_std = NULL; /* no standard-API variant is provided */
+    test_zherk_batch.next = NULL;
+
+    testing_register( &test_zherk_batch );
+}
diff --git a/testing/testing_zpotrf_batch.c b/testing/testing_zpotrf_batch.c
new file mode 100644
index 0000000000000000000000000000000000000000..eb4b6d1156c0bea4c0bdc18ebc839eb8d98f0be3
--- /dev/null
+++ b/testing/testing_zpotrf_batch.c
@@ -0,0 +1,104 @@
+/**
+ *
+ * @file testing_zpotrf_batch.c
+ *
+ * @copyright 2019-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zpotrf_batch testing
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @date 2024-04-03
+ * @precisions normal z -> c d s
+ *
+ */
+#include <chameleon.h>
+#include <chameleon_lapack.h>
+#include "testings.h"
+#include "testing_zcheck.h"
+#include <chameleon/flops.h>
+#if defined(CHAMELEON_TESTINGS_VENDOR) || !defined(CHAMELEON_SIMULATION)
+#include <coreblas.h>
+#endif
+
+static cham_fixdbl_t
+flops_zpotrf_batch( int nb, int N )
+{
+    return flops_zpotrf( N ) * nb; /* nb: number of factorizations in the batch (the caller passes nb*ib) */
+}
+
+/* Runs and times one CHAMELEON_zpotrf_batch_Tile call on a matrix from zplghe_batch (bump = N); returns its status. */
+int
+testing_zpotrf_batch_desc( run_arg_list_t *args, int check )
+{
+    testdata_t test_data = { .args = args };
+    int hres = 0;
+
+    /* Read arguments */
+    int async = parameters_getvalue_int( "async" );
+    int nb = run_arg_get_int( args, "nb", 10 );
+    int ib = run_arg_get_int( args, "ib", 10 );
+    int P = parameters_getvalue_int( "P" );
+    cham_uplo_t uplo = run_arg_get_uplo( args, "uplo", ChamUpper );
+    int N = run_arg_get_int( args, "N", 320 );
+    int seedA = run_arg_get_int( args, "seedA", random() );
+    int Q = parameters_compute_q( P ); /* NOTE(review): Q is never read below; kept in case the call validates P - confirm */
+
+    /* Descriptors */
+    CHAM_desc_t *descA;
+
+    CHAMELEON_Set( CHAMELEON_TILE_SIZE, N );
+
+    /* Create the matrices: the N x N problem shape is scaled by nb in the first dimension and ib in the second */
+    parameters_desc_create( "A", &descA, ChamComplexDouble, N, N, nb * N, ib * N, nb * N, ib * N );
+
+    /* Fill the matrices with random values */
+    CHAMELEON_zplghe_batch_Tile( (double)N, descA, seedA );
+
+    /* Start measurement */
+    testing_start( &test_data );
+    if ( async ) { /* async only triggers a notice; the synchronous call below always runs */
+        fprintf( stderr, "Async unavailable yet\n" );
+    }
+    hres = CHAMELEON_zpotrf_batch_Tile( uplo, descA );
+
+    /* Stop measurement; the flop count covers the nb*ib factorizations */
+    test_data.hres = hres;
+    testing_stop( &test_data, flops_zpotrf_batch( nb*ib, N ) );
+
+    /* No numerical check is implemented: only report it */
+    if ( check ) {
+        fprintf( stderr, "Check is not available for zpotrf_batch\n" );
+    }
+
+    parameters_desc_destroy( &descA );
+
+    return hres;
+}
+
+testing_t test_zpotrf_batch;
+const char *zpotrf_batch_params[] = { "nb", "ib", "uplo", "n", "seedA", NULL };
+const char *zpotrf_batch_output[] = { NULL };
+const char *zpotrf_batch_outchk[] = { "RETURN", NULL };
+
+/**
+ * @brief Registers the zpotrf_batch test; invoked automatically through the constructor attribute
+ */
+void testing_zpotrf_batch_init( void ) __attribute__( ( constructor ) );
+void
+testing_zpotrf_batch_init( void )
+{
+    test_zpotrf_batch.name = "zpotrf_batch";
+    test_zpotrf_batch.helper = "Perform nb*ib Cholesky factorization potrf( uplo, N, ... )";
+    test_zpotrf_batch.params = zpotrf_batch_params;
+    test_zpotrf_batch.output = zpotrf_batch_output;
+    test_zpotrf_batch.outchk = zpotrf_batch_outchk;
+    test_zpotrf_batch.fptr_desc = testing_zpotrf_batch_desc;
+    test_zpotrf_batch.fptr_std = NULL; /* no standard-API variant is provided */
+    test_zpotrf_batch.next = NULL;
+
+    testing_register( &test_zpotrf_batch );
+}
diff --git a/testing/testing_ztrsm_batch.c b/testing/testing_ztrsm_batch.c
new file mode 100644
index 0000000000000000000000000000000000000000..44702794fb75f56d247c507165e8aeac6be11081
--- /dev/null
+++ b/testing/testing_ztrsm_batch.c
@@ -0,0 +1,125 @@
+/**
+ *
+ * @file testing_ztrsm_batch.c
+ *
+ * @copyright 2019-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon ztrsm_batch testing
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @date 2024-04-03
+ * @precisions normal z -> c d s
+ *
+ */
+#include <chameleon.h>
+#include <chameleon_lapack.h>
+#include "testings.h"
+#include "testing_zcheck.h"
+#include <chameleon/flops.h>
+#if defined(CHAMELEON_TESTINGS_VENDOR) || !defined(CHAMELEON_SIMULATION)
+#include <coreblas.h>
+#endif
+
+static cham_fixdbl_t
+flops_ztrsm_batch( int nb, cham_side_t side, int M, int N )
+{
+    return flops_ztrsm( side, M, N ) * nb; /* nb: number of solves in the batch (the caller passes nb*ib) */
+}
+
+/* Runs and times one CHAMELEON_ztrsm_batch_Tile call on randomly filled A and B; returns its status. */
+int
+testing_ztrsm_batch_desc( run_arg_list_t *args, int check )
+{
+    testdata_t test_data = { .args = args };
+    int hres = 0;
+
+    /* Read arguments (alpha starts from a random value and may be overridden below) */
+    int async = parameters_getvalue_int( "async" );
+    int nb = run_arg_get_int( args, "nb", 10 );
+    int ib = run_arg_get_int( args, "ib", 10 );
+    cham_trans_t trans = run_arg_get_trans( args, "trans", ChamNoTrans );
+    cham_side_t side = run_arg_get_side( args, "side", ChamLeft );
+    cham_uplo_t uplo = run_arg_get_uplo( args, "uplo", ChamUpper );
+    cham_diag_t diag = run_arg_get_diag( args, "diag", ChamNonUnit );
+    int N = run_arg_get_int( args, "N", 320 );
+    int M = run_arg_get_int( args, "M", N );
+    CHAMELEON_Complex64_t alpha = testing_zalea();
+    int seedA = run_arg_get_int( args, "seedA", random() );
+    int seedB = run_arg_get_int( args, "seedB", random() );
+
+    /* Descriptors and the per-problem shape of A */
+    int Am, An;
+    CHAM_desc_t *descA, *descB;
+
+    alpha = run_arg_get_complex64( args, "alpha", alpha );
+
+    CHAMELEON_Set( CHAMELEON_TILE_SIZE, N ); /* NOTE(review): set from N even when M differs - confirm intent */
+
+    /* Calculate the dimensions of the square matrix A according to the side */
+    if ( side == ChamLeft ) {
+        Am = M;
+        An = M;
+    }
+    else {
+        Am = N;
+        An = N;
+    }
+
+    /* Create the matrices: the per-problem shape is scaled by nb in the first dimension and ib in the second */
+    parameters_desc_create( "A", &descA, ChamComplexDouble, Am, An, nb * Am, ib * An, nb * Am, ib * An );
+    parameters_desc_create( "B", &descB, ChamComplexDouble, M, N, nb * M, ib * N, nb * M, ib * N );
+
+    /* Fill the matrices with random values */
+    CHAMELEON_zplrnt_Tile( descA, seedA );
+    CHAMELEON_zplrnt_Tile( descB, seedB );
+
+    /* Start measurement */
+    testing_start( &test_data );
+    if ( async ) { /* async only triggers a notice; the synchronous call below always runs */
+        fprintf( stderr, "Async unavailable yet\n" );
+    }
+    hres = CHAMELEON_ztrsm_batch_Tile( side, uplo, trans, diag, alpha, descA, descB );
+
+    /* Stop measurement; the flop count covers the nb*ib solves */
+    test_data.hres = hres;
+    testing_stop( &test_data, flops_ztrsm_batch( nb*ib, side, M, N ) );
+
+    /* No numerical check is implemented: only report it */
+    if ( check ) {
+        fprintf( stderr, "Check is not available for ztrsm_batch\n" );
+    }
+
+    parameters_desc_destroy( &descA );
+    parameters_desc_destroy( &descB );
+
+    return hres;
+}
+
+testing_t test_ztrsm_batch;
+const char *ztrsm_batch_params[] = { "nb", "ib", "side", "uplo", "trans", "diag", "m", "n",
+                                     "alpha", "seedA", "seedB", NULL };
+const char *ztrsm_batch_output[] = { NULL };
+const char *ztrsm_batch_outchk[] = { "RETURN", NULL };
+
+/**
+ * @brief Registers the ztrsm_batch test; invoked automatically through the constructor attribute
+ */
+void testing_ztrsm_batch_init( void ) __attribute__( ( constructor ) );
+void
+testing_ztrsm_batch_init( void )
+{
+    test_ztrsm_batch.name = "ztrsm_batch";
+    test_ztrsm_batch.helper = "Perform nb*ib triangular solve trsm( side, uplo, trans, diag, M, N, ... )";
+    test_ztrsm_batch.params = ztrsm_batch_params;
+    test_ztrsm_batch.output = ztrsm_batch_output;
+    test_ztrsm_batch.outchk = ztrsm_batch_outchk;
+    test_ztrsm_batch.fptr_desc = testing_ztrsm_batch_desc;
+    test_ztrsm_batch.fptr_std = NULL; /* no standard-API variant is provided */
+    test_ztrsm_batch.next = NULL;
+
+    testing_register( &test_ztrsm_batch );
+}