Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 8d2fdd45 authored by Mathieu Faverge's avatar Mathieu Faverge
Browse files

Merge branch 'feature/batch_blas' into 'master'

Add batch blas kernels (POTRF, HERK, GEMM, TRSM and PLGHE) through Tile interface with their examples

See merge request !456
parents 8a4db094 ae661422
No related branches found
No related tags found
1 merge request!456Add batch blas kernels (POTRF, HERK, GEMM, TRSM and PLGHE) through Tile interface with their examples
......@@ -27,7 +27,7 @@
# @author Alycia Lisito
# @author Loris Lucido
# @author Matthieu Kuhn
# @date 2023-08-22
# @date 2024-04-03
#
###
......@@ -261,6 +261,16 @@ set(ZSRC
zgenm2.c
pzgenm2.c
zprint.c
##################
# Batch
##################
zplghe_batch.c
#
zgemm_batch.c
zherk_batch.c
ztrsm_batch.c
#
zpotrf_batch.c
)
precisions_rules_py(CHAMELEON_SRCS_GENERATED "${ZSRC}"
......
......@@ -11,12 +11,12 @@
*
* @brief Chameleon zgeadd wrappers
*
* @version 1.2.0
* @version 1.3.0
* @comment This file has been automatically generated
* from Plasma 2.5.0 for CHAMELEON 0.9.2
* @author Mathieu Faverge
* @author Florent Pruvost
* @date 2022-02-22
* @date 2024-04-03
* @precisions normal z -> s d c
*
*/
......@@ -30,7 +30,7 @@
* CHAMELEON_zgeadd - Performs a matrix addition similarly to the pzgeadd()
* function from the PBLAS library:
*
* \f[ C = \alpha op( A ) + \beta B \f],
* \f[ B = \alpha op( A ) + \beta B \f],
*
* where op( X ) is one of
*
......
/**
*
* @file zgemm_batch.c
*
* @copyright 2019-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon batch zgemm wrappers
*
* @version 1.3.0
* @author Mathieu Faverge
* @date 2024-04-03
* @precisions normal z -> s d c
*
*/
#include "control/common.h"
#if !defined(CHAMELEON_SIMULATION)
#include "coreblas/coreblas_ztile.h"
#if defined(CHAMELEON_USE_CUDA)
#include "gpucublas/gpucublas_z.h"
#endif
#endif
/**
 * @brief Parameters shared by every tile-level GEMM of a batch.
 *
 * One instance is built by CHAMELEON_zgemm_batch_Tile() and reaches the
 * CPU/CUDA kernels through their opaque op_args pointer.
 */
typedef struct zgemm_batch_args_s {
    cham_trans_t          transA; /**< Operation applied to the tiles of A */
    cham_trans_t          transB; /**< Operation applied to the tiles of B */
    CHAMELEON_Complex64_t alpha;  /**< Scalar alpha forwarded to the GEMM  */
    CHAMELEON_Complex64_t beta;   /**< Scalar beta forwarded to the GEMM   */
} zgemm_batch_args_t;
#if !defined(CHAMELEON_SIMULATION)
/**
 * @brief CPU kernel of the batched GEMM: processes the tile (m, n) of A, B and C.
 *
 * Registered as the cpufunc of zgemm_batch_map. The descriptor/tile of A come
 * through the named parameters; those of B and C are read from the variadic
 * list, in that order.
 *
 * @param[in] op_args  Pointer to the zgemm_batch_args_t of the batch.
 * @param[in] uplo     Unused by this kernel.
 * @param[in] m        Row index of the tile in the descriptors.
 * @param[in] n        Column index of the tile in the descriptors.
 * @param[in] ndata    Number of (descriptor, tile) pairs given; 3 are expected.
 * @param[in] descA    Descriptor of A.
 * @param[in] tileA    Tile (m, n) of A.
 * @param[in] ...      descB, tileB, descC, tileC.
 *
 * @return 0 on success, -1 when fewer than three pieces of data are given.
 */
static inline int
zgemm_batch_cpu( void *op_args,
                 cham_uplo_t uplo, int m, int n, int ndata,
                 const CHAM_desc_t *descA, CHAM_tile_t *tileA, ... )
{
    zgemm_batch_args_t *args = (zgemm_batch_args_t*)op_args;
    const CHAM_desc_t  *descB;
    CHAM_tile_t        *tileB;
    const CHAM_desc_t  *descC;
    CHAM_tile_t        *tileC;
    va_list             ap;
    int                 tempmm, tempnn, tempkk;

    /* A, B and C are needed: warn on any mismatch, give up only if data is missing */
    if ( ndata != 3 ) {
        fprintf( stderr, "zgemm_batch_cpu: requires three pieces of data and %d have been given\n", ndata );
        if ( ndata < 3 ) {
            return -1;
        }
    }

    /* Get the second (B) and third (C) descriptor/tile pairs */
    va_start(ap, tileA);
    descB = va_arg(ap, const CHAM_desc_t *);
    tileB = va_arg(ap, CHAM_tile_t *);
    descC = va_arg(ap, const CHAM_desc_t *);
    tileC = va_arg(ap, CHAM_tile_t *);
    va_end(ap);

    /* The last row/column of tiles may be smaller than mb/nb */
    tempmm = m == descC->mt-1 ? descC->m - m * descC->mb : descC->mb;
    tempnn = n == descC->nt-1 ? descC->n - n * descC->nb : descC->nb;

    /* The inner dimension is read on the columns of A, or on its rows when A is (conjugate) transposed */
    if ( args->transA == ChamNoTrans ) {
        tempkk = n == descA->nt-1 ? descA->n - n * descA->nb : descA->nb;
    }
    else {
        tempkk = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb;
    }

    TCORE_zgemm(
        args->transA, args->transB, tempmm, tempnn, tempkk,
        args->alpha, tileA, tileB, args->beta, tileC );

    (void)descB;
    (void)uplo;

    return 0;
}
#else
#define zgemm_batch_cpu NULL
#endif
#if !defined(CHAMELEON_SIMULATION) && defined(CHAMELEON_USE_CUDA)
/**
 * @brief CUDA kernel of the batched GEMM: processes the tile (m, n) of A, B and C.
 *
 * Registered as the cudafunc of zgemm_batch_map. The descriptor/tile of A come
 * through the named parameters; those of B and C are read from the variadic
 * list, in that order.
 *
 * @param[in] handle   cuBLAS handle forwarded to CUDA_zgemm().
 * @param[in] op_args  Pointer to the zgemm_batch_args_t of the batch.
 * @param[in] uplo     Unused by this kernel.
 * @param[in] m        Row index of the tile in the descriptors.
 * @param[in] n        Column index of the tile in the descriptors.
 * @param[in] ndata    Number of (descriptor, tile) pairs given; 3 are expected.
 * @param[in] descA    Descriptor of A.
 * @param[in] tileA    Tile (m, n) of A.
 * @param[in] ...      descB, tileB, descC, tileC.
 *
 * @return 0 on success, -1 when fewer than three pieces of data are given.
 */
static inline int
zgemm_batch_cuda( cublasHandle_t handle, void *op_args,
                  cham_uplo_t uplo, int m, int n, int ndata,
                  const CHAM_desc_t *descA, CHAM_tile_t *tileA, ... )
{
    zgemm_batch_args_t *args = (zgemm_batch_args_t*)op_args;
    const CHAM_desc_t  *descB;
    CHAM_tile_t        *tileB;
    const CHAM_desc_t  *descC;
    CHAM_tile_t        *tileC;
    va_list             ap;
    int                 tempmm, tempnn, tempkk;

    /* A, B and C are needed: warn on any mismatch, give up only if data is missing */
    if ( ndata != 3 ) {
        fprintf( stderr, "zgemm_batch_cuda: requires three pieces of data and %d have been given\n", ndata );
        if ( ndata < 3 ) {
            return -1;
        }
    }

    /* Get the second (B) and third (C) descriptor/tile pairs */
    va_start(ap, tileA);
    descB = va_arg(ap, const CHAM_desc_t *);
    tileB = va_arg(ap, CHAM_tile_t *);
    descC = va_arg(ap, const CHAM_desc_t *);
    tileC = va_arg(ap, CHAM_tile_t *);
    va_end(ap);

    /* The last row/column of tiles may be smaller than mb/nb */
    tempmm = m == descC->mt-1 ? descC->m - m * descC->mb : descC->mb;
    tempnn = n == descC->nt-1 ? descC->n - n * descC->nb : descC->nb;

    /* The inner dimension is read on the columns of A, or on its rows when A is (conjugate) transposed */
    if ( args->transA == ChamNoTrans ) {
        tempkk = n == descA->nt-1 ? descA->n - n * descA->nb : descA->nb;
    }
    else {
        tempkk = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb;
    }

    CUDA_zgemm( args->transA, args->transB, tempmm, tempnn, tempkk,
                (cuDoubleComplex*)&(args->alpha),
                tileA->mat, tileA->ld,
                tileB->mat, tileB->ld,
                (cuDoubleComplex*)&(args->beta),
                tileC->mat, tileC->ld,
                handle );

    (void)descB;
    (void)uplo;

    return 0;
}
#else
#define zgemm_batch_cuda NULL
#endif
/*
 * Map operator handed to chameleon_pmap() by CHAMELEON_zgemm_batch_Tile().
 * The cpu/cuda entries are either the kernels of this file or NULL macros,
 * depending on the build configuration; no HIP kernel is provided.
 */
static cham_map_operator_t zgemm_batch_map = {
    .name     = "zgemm",
    .cpufunc  = zgemm_batch_cpu,
    .cudafunc = zgemm_batch_cuda,
    .hipfunc  = NULL,
};
/**
 ********************************************************************************
 *
 * @ingroup CHAMELEON_Complex64_t_Tile
 *
 *  CHAMELEON_zgemm_batch_Tile - Performs multiple matrix multiplications in
 *  parallel, one per tile of the descriptors.
 *
 *******************************************************************************
 *
 * @param[in] transA
 *          Specifies whether the tiles from A are transposed, not transposed or conjugate transposed:
 *          = ChamNoTrans:   tiles from A are not transposed;
 *          = ChamTrans:     tiles from A are transposed;
 *          = ChamConjTrans: tiles from A are conjugate transposed.
 *
 * @param[in] transB
 *          Specifies whether the tiles from B are transposed, not transposed or conjugate transposed:
 *          = ChamNoTrans:   tiles from B are not transposed;
 *          = ChamTrans:     tiles from B are transposed;
 *          = ChamConjTrans: tiles from B are conjugate transposed.
 *
 * @param[in] alpha
 *          alpha specifies the scalar alpha
 *
 * @param[in] A
 *          A is a collection of mt-by-nt tiles of size A->mb by A->nb
 *
 * @param[in] B
 *          B is a collection of mt-by-nt tiles of size B->mb by B->nb
 *
 * @param[in] beta
 *          beta specifies the scalar beta
 *
 * @param[in,out] C
 *          C is a collection of mt-by-nt tiles of size C->mb by C->nb
 *          On exit, each tile C[i,j] is overwritten by the matrix:
 *          \f[ \alpha \times op( A[i,j] ) \times op( B[i,j] ) + \beta \times C[i,j] \f]
 *
 *******************************************************************************
 *
 * @return CHAMELEON_ERR_NOT_INITIALIZED if no CHAMELEON context is available
 * @return the status of the internal sequence once all tasks have completed
 *
 */
int CHAMELEON_zgemm_batch_Tile( cham_trans_t transA, cham_trans_t transB,
                                CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
                                CHAMELEON_Complex64_t beta, CHAM_desc_t *C )
{
    CHAM_context_t     *chamctxt;
    RUNTIME_sequence_t *sequence = NULL;
    RUNTIME_request_t   request  = RUNTIME_REQUEST_INITIALIZER;
    cham_map_data_t     data[3];
    zgemm_batch_args_t  params = { transA, transB, alpha, beta };
    int                 status;

    chamctxt = chameleon_context_self();
    if (chamctxt == NULL) {
        /* Report under the name of this entry point, not the non-batched one */
        chameleon_fatal_error("CHAMELEON_zgemm_batch_Tile", "CHAMELEON not initialized");
        return CHAMELEON_ERR_NOT_INITIALIZED;
    }
    chameleon_sequence_create( chamctxt, &sequence );

    /* A and B are only read; C does not need to be read when beta is zero */
    data[0].access = ChamR;
    data[0].desc   = A;
    data[1].access = ChamR;
    data[1].desc   = B;
    data[2].access = ( beta == 0. ) ? ChamW : ChamRW;
    data[2].desc   = C;

    chameleon_pmap( ChamUpperLower, 3, data, &zgemm_batch_map, &params, sequence, &request );

    CHAMELEON_Desc_Flush( A, sequence );
    CHAMELEON_Desc_Flush( B, sequence );
    CHAMELEON_Desc_Flush( C, sequence );

    /* params lives on the stack: wait for completion before returning */
    chameleon_sequence_wait( chamctxt, sequence );
    status = sequence->status;
    chameleon_sequence_destroy( chamctxt, sequence );
    return status;
}
/**
*
* @file zherk_batch.c
*
* @copyright 2019-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon batch zherk wrappers
*
* @version 1.3.0
* @author Mathieu Faverge
* @date 2024-04-03
* @precisions normal z -> s d c
*
*/
#include "control/common.h"
#if !defined(CHAMELEON_SIMULATION)
#include "coreblas/coreblas_ztile.h"
#if defined(CHAMELEON_USE_CUDA)
#include "gpucublas/gpucublas_z.h"
#endif
#endif
/**
 * @brief Parameters shared by every tile-level HERK of a batch.
 *
 * One instance is built by CHAMELEON_zherk_batch_Tile() and reaches the
 * CPU/CUDA kernels through their opaque op_args pointer.
 */
typedef struct zherk_batch_args_s {
    cham_uplo_t  uplo;  /**< Part of the C tiles that is referenced */
    cham_trans_t trans; /**< Operation applied to the tiles of A    */
    double       alpha; /**< Scalar alpha forwarded to the HERK     */
    double       beta;  /**< Scalar beta forwarded to the HERK      */
} zherk_batch_args_t;
#if !defined(CHAMELEON_SIMULATION)
/**
 * @brief CPU kernel of the batched HERK: updates the tile (m, n) of C from the
 * tile (m, n) of A.
 *
 * Registered as the cpufunc of zherk_batch_map. The descriptor/tile of C are
 * read from the variadic list.
 *
 * @param[in] op_args  Pointer to the zherk_batch_args_t of the batch.
 * @param[in] uplo     Unused; the uplo stored in op_args is the one applied.
 * @param[in] m        Row index of the tile in the descriptors.
 * @param[in] n        Column index of the tile in the descriptors.
 * @param[in] ndata    Number of (descriptor, tile) pairs given; 2 are expected.
 * @param[in] descA    Descriptor of A.
 * @param[in] tileA    Tile (m, n) of A.
 * @param[in] ...      descC, tileC.
 *
 * @return 0 on success, -1 when fewer than two pieces of data are given.
 */
static inline int
zherk_batch_cpu( void *op_args,
                 cham_uplo_t uplo, int m, int n, int ndata,
                 const CHAM_desc_t *descA, CHAM_tile_t *tileA, ... )
{
    zherk_batch_args_t *herk = (zherk_batch_args_t*)op_args;
    const CHAM_desc_t  *descC;
    CHAM_tile_t        *tileC;
    va_list             varargs;
    int                 order, inner;

    (void)uplo;

    if ( ndata != 2 ) {
        fprintf( stderr, "zherk_batch_cpu: requires two pieces of data and %d have been given\n", ndata );
        if ( ndata < 2 ) {
            return -1;
        }
    }

    /* Fetch the descriptor/tile pair of C */
    va_start( varargs, tileA );
    descC = va_arg( varargs, const CHAM_desc_t * );
    tileC = va_arg( varargs, CHAM_tile_t * );
    va_end( varargs );

    /* Order of the C tile: the last column of tiles may be narrower */
    order = descC->nb;
    if ( n == descC->nt - 1 ) {
        order = descC->n - n * descC->nb;
    }

    /* Inner dimension: columns of A when not transposed, rows otherwise */
    if ( herk->trans == ChamNoTrans ) {
        inner = descA->nb;
        if ( n == descA->nt - 1 ) {
            inner = descA->n - n * descA->nb;
        }
    }
    else {
        inner = descA->mb;
        if ( m == descA->mt - 1 ) {
            inner = descA->m - m * descA->mb;
        }
    }

    TCORE_zherk( herk->uplo, herk->trans, order, inner,
                 herk->alpha, tileA, herk->beta, tileC );

    return 0;
}
#else
#define zherk_batch_cpu NULL
#endif
#if !defined(CHAMELEON_SIMULATION) && defined(CHAMELEON_USE_CUDA)
/**
 * @brief CUDA kernel of the batched HERK: updates the tile (m, n) of C from the
 * tile (m, n) of A.
 *
 * Registered as the cudafunc of zherk_batch_map. The descriptor/tile of C are
 * read from the variadic list.
 *
 * @param[in] handle   cuBLAS handle forwarded to CUDA_zherk().
 * @param[in] op_args  Pointer to the zherk_batch_args_t of the batch.
 * @param[in] uplo     Unused; the uplo stored in op_args is the one applied.
 * @param[in] m        Row index of the tile in the descriptors.
 * @param[in] n        Column index of the tile in the descriptors.
 * @param[in] ndata    Number of (descriptor, tile) pairs given; 2 are expected.
 * @param[in] descA    Descriptor of A.
 * @param[in] tileA    Tile (m, n) of A.
 * @param[in] ...      descC, tileC.
 *
 * @return 0 on success, -1 when fewer than two pieces of data are given.
 */
static inline int
zherk_batch_cuda( cublasHandle_t handle, void *op_args,
                  cham_uplo_t uplo, int m, int n, int ndata,
                  const CHAM_desc_t *descA, CHAM_tile_t *tileA, ... )
{
    zherk_batch_args_t *args = (zherk_batch_args_t*)op_args;
    const CHAM_desc_t  *descC;
    CHAM_tile_t        *tileC;
    va_list             ap;
    int                 tempnn, tempkk;

    /* A and C are needed: warn on any mismatch, give up only if data is missing */
    if ( ndata != 2 ) {
        fprintf( stderr, "zherk_batch_cuda: requires two pieces of data and %d have been given\n", ndata );
        if ( ndata < 2 ) {
            return -1;
        }
    }

    /* Get the second desc */
    va_start(ap, tileA);
    descC = va_arg(ap, const CHAM_desc_t *);
    tileC = va_arg(ap, CHAM_tile_t *);
    va_end(ap);

    /* The last row/column of tiles may be smaller than mb/nb */
    tempnn = n == descC->nt-1 ? descC->n - n * descC->nb : descC->nb;
    if ( args->trans == ChamNoTrans ) {
        tempkk = n == descA->nt-1 ? descA->n - n * descA->nb : descA->nb;
    }
    else {
        tempkk = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb;
    }

    CUDA_zherk( args->uplo, args->trans, tempnn, tempkk,
                &(args->alpha), tileA->mat, tileA->ld,
                &(args->beta),  tileC->mat, tileC->ld,
                handle );

    (void)uplo;

    return 0;
}
#else
#define zherk_batch_cuda NULL
#endif
/*
 * Map operator handed to chameleon_pmap() by CHAMELEON_zherk_batch_Tile().
 * The cpu/cuda entries are either the kernels of this file or NULL macros,
 * depending on the build configuration; no HIP kernel is provided.
 */
static cham_map_operator_t zherk_batch_map = {
    .name     = "zherk",
    .cpufunc  = zherk_batch_cpu,
    .cudafunc = zherk_batch_cuda,
    .hipfunc  = NULL,
};
/**
 ********************************************************************************
 *
 * @ingroup CHAMELEON_Complex64_t_Tile
 *
 *  CHAMELEON_zherk_batch_Tile - Performs multiple rank-k updates in parallel,
 *  one per tile of the descriptors.
 *
 *******************************************************************************
 *
 * @param[in] uplo
 *          = ChamUpper: Upper triangle of C tiles are stored;
 *          = ChamLower: Lower triangle of C tiles are stored.
 *
 * @param[in] trans
 *          Specifies whether the tiles of A are transposed or conjugate transposed:
 *          = ChamNoTrans:   tiles of A are not transposed;
 *          = ChamConjTrans: tiles of A are conjugate transposed.
 *
 * @param[in] alpha
 *          alpha specifies the scalar alpha
 *
 * @param[in] A
 *          A is a collection of mt-by-nt tiles of size A->mb by A->nb
 *
 * @param[in] beta
 *          beta specifies the scalar beta
 *
 * @param[in,out] C
 *          C is a collection of mt-by-nt tiles of size C->mb by C->nb
 *          On exit, each tile C[i,j] is overwritten by
 *          \f[ C = \alpha [ op( A ) \times conjg( op( A )' )] + \beta C \f],
 *
 *          where op( X ) is one of
 *
 *          op( X ) = X  or op( X ) = conjg( X' )
 *
 *******************************************************************************
 *
 * @return CHAMELEON_ERR_NOT_INITIALIZED if no CHAMELEON context is available
 * @return the status of the internal sequence once all tasks have completed
 *
 */
int CHAMELEON_zherk_batch_Tile( cham_uplo_t uplo, cham_trans_t trans,
                                double alpha, CHAM_desc_t *A,
                                double beta,  CHAM_desc_t *C )
{
    CHAM_context_t     *chamctxt;
    RUNTIME_sequence_t *sequence = NULL;
    RUNTIME_request_t   request  = RUNTIME_REQUEST_INITIALIZER;
    cham_map_data_t     data[2];
    zherk_batch_args_t  params = { uplo, trans, alpha, beta };
    int                 status;

    chamctxt = chameleon_context_self();
    if (chamctxt == NULL) {
        /* Report under the name of this entry point, not the non-batched one */
        chameleon_fatal_error("CHAMELEON_zherk_batch_Tile", "CHAMELEON not initialized");
        return CHAMELEON_ERR_NOT_INITIALIZED;
    }
    chameleon_sequence_create( chamctxt, &sequence );

    /* A is only read; C does not need to be read when beta is zero */
    data[0].access = ChamR;
    data[0].desc   = A;
    data[1].access = ( beta == 0. ) ? ChamW : ChamRW;
    data[1].desc   = C;

    /* The uplo applied by the kernels is the one stored in params */
    chameleon_pmap( ChamUpperLower, 2, data, &zherk_batch_map, &params, sequence, &request );

    CHAMELEON_Desc_Flush( A, sequence );
    CHAMELEON_Desc_Flush( C, sequence );

    /* params lives on the stack: wait for completion before returning */
    chameleon_sequence_wait( chamctxt, sequence );
    status = sequence->status;
    chameleon_sequence_destroy( chamctxt, sequence );
    return status;
}
/**
*
* @file zplghe_batch.c
*
* @copyright 2019-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon batch zplghe wrappers
*
* @version 1.3.0
* @author Mathieu Faverge
* @date 2024-04-03
* @precisions normal z -> s d c
*
*/
#include "control/common.h"
#if !defined(CHAMELEON_SIMULATION)
#include "coreblas/coreblas_ztile.h"
#if defined(CHAMELEON_USE_CUDA)
#include "gpucublas/gpucublas_z.h"
#endif
#endif
/**
 * @brief Parameters shared by every tile generation of a batched zplghe.
 *
 * One instance is built by CHAMELEON_zplghe_batch_Tile() and reaches the CPU
 * kernel through its opaque op_args pointer.
 */
typedef struct zplghe_batch_args_s {
    double                 bump; /**< Value added to the diagonal of each tile */
    unsigned long long int seed; /**< Seed of the random generation            */
} zplghe_batch_args_t;
#if !defined(CHAMELEON_SIMULATION)
/**
 * @brief CPU kernel of the batched zplghe: generates the tile (m, n) of A.
 *
 * Registered as the cpufunc of zplghe_batch_map.
 *
 * @param[in] op_args  Pointer to the zplghe_batch_args_t of the batch.
 * @param[in] uplo     Unused by this kernel.
 * @param[in] m        Row index of the tile in the descriptor.
 * @param[in] n        Column index of the tile in the descriptor.
 * @param[in] ndata    Number of (descriptor, tile) pairs given; 1 is expected.
 * @param[in] descA    Descriptor of A.
 * @param[in,out] tileA Tile (m, n) of A, filled by TCORE_zplghe().
 * @param[in] ...      Unused.
 *
 * @return 0 in all cases; an unexpected ndata is only reported on stderr.
 */
static inline int
zplghe_batch_cpu( void *op_args,
                  cham_uplo_t uplo, int m, int n, int ndata,
                  const CHAM_desc_t *descA, CHAM_tile_t *tileA, ... )
{
    zplghe_batch_args_t *args = (zplghe_batch_args_t*)op_args;
    int                  tempnn, m0;

    /* Only A is needed by this kernel */
    if ( ndata != 1 ) {
        fprintf( stderr, "zplghe_batch_cpu: requires one piece of data and %d have been given\n", ndata );
    }

    /* The last column of tiles may be narrower than nb */
    tempnn = n == descA->nt-1 ? descA->n - n * descA->nb : descA->nb;

    /**
     * Let's do the math to give the right bigM:
     *   jump for the first value is defined as j = m0 + n0 * bigM
     * We need to set (m*n) matrices of size A->mb*A->nb, and we want j, m0, n0 to be defined as:
     *   j = m0 = n0 = (n * A->mt + m) * (A->mb * A->nb)
     * Thus:
     *   bigM = 0;
     */
    m0 = ( n * descA->mt + m ) * (descA->mb * descA->nb );

    TCORE_zplghe( args->bump, tempnn, tempnn, tileA,
                  0, m0, m0, args->seed );

    (void)uplo;

    return 0;
}
#else
#define zplghe_batch_cpu NULL
#endif
/*
 * Map operator handed to chameleon_pmap() by CHAMELEON_zplghe_batch_Tile().
 * Only a CPU kernel is provided (or a NULL macro, depending on the build
 * configuration); there is no CUDA nor HIP implementation.
 */
static cham_map_operator_t zplghe_batch_map = {
    .name     = "zplghe",
    .cpufunc  = zplghe_batch_cpu,
    .cudafunc = NULL,
    .hipfunc  = NULL,
};
/**
 ********************************************************************************
 *
 * @ingroup CHAMELEON_Complex64_t_Tile
 *
 *  CHAMELEON_zplghe_batch_Tile - Performs multiple hermitian matrix generations
 *  in parallel, one per tile of the descriptor.
 *
 *******************************************************************************
 *
 * @param[in] bump
 *          The value to add to the diagonal of each tile to be sure
 *          they are positive definite matrices.
 *
 * @param[in,out] A
 *          A is a collection of mt-by-nt tiles of size A->mb by A->nb
 *          On exit, each tile is initialized as an hermitian matrix.
 *
 * @param[in] seed
 *          The seed used in the random generation.
 *
 *******************************************************************************
 *
 * @return CHAMELEON_ERR_NOT_INITIALIZED if no CHAMELEON context is available
 * @return the status of the internal sequence once all tasks have completed
 *
 */
int CHAMELEON_zplghe_batch_Tile( double bump, CHAM_desc_t *A,
                                 unsigned long long int seed )
{
    CHAM_context_t     *chamctxt;
    RUNTIME_sequence_t *sequence = NULL;
    RUNTIME_request_t   request  = RUNTIME_REQUEST_INITIALIZER;
    cham_map_data_t     data[1];
    zplghe_batch_args_t params = { bump, seed };
    int                 status;

    chamctxt = chameleon_context_self();
    if (chamctxt == NULL) {
        /* Report under the name of this entry point, not the non-batched one */
        chameleon_fatal_error("CHAMELEON_zplghe_batch_Tile", "CHAMELEON not initialized");
        return CHAMELEON_ERR_NOT_INITIALIZED;
    }
    chameleon_sequence_create( chamctxt, &sequence );

    /* Every tile is fully overwritten: write-only access */
    data[0].access = ChamW;
    data[0].desc   = A;

    chameleon_pmap( ChamUpperLower, 1, data,
                    &zplghe_batch_map, &params,
                    sequence, &request );

    CHAMELEON_Desc_Flush( A, sequence );

    /* params lives on the stack: wait for completion before returning */
    chameleon_sequence_wait( chamctxt, sequence );
    status = sequence->status;
    chameleon_sequence_destroy( chamctxt, sequence );
    return status;
}
/**
*
* @file zpotrf_batch.c
*
* @copyright 2019-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon batch zpotrf wrappers
*
* @version 1.3.0
* @author Mathieu Faverge
* @date 2024-04-03
* @precisions normal z -> s d c
*
*/
#include "control/common.h"
#if !defined(CHAMELEON_SIMULATION)
#include "coreblas/coreblas_ztile.h"
#if defined(CHAMELEON_USE_CUDA)
#include "gpucublas/gpucublas_z.h"
#endif
#endif
#if !defined(CHAMELEON_SIMULATION)
/**
 * @brief CPU kernel of the batched Cholesky: factorizes the tile (m, n) of A.
 *
 * Registered as the cpufunc of zpotrf_batch_map.
 *
 * @param[in] op_args  The cham_uplo_t to apply, stored directly in the pointer
 *                     value (no memory is dereferenced).
 * @param[in] uplo     Unused; the value carried by op_args is the one applied.
 * @param[in] m        Row index of the tile in the descriptor (unused).
 * @param[in] n        Column index of the tile in the descriptor.
 * @param[in] ndata    Number of (descriptor, tile) pairs given; 1 is expected.
 * @param[in] descA    Descriptor of A.
 * @param[in,out] tileA Tile (m, n) of A, factorized in place by TCORE_zpotrf().
 * @param[in] ...      Unused.
 *
 * @return 0 in all cases.
 *         NOTE(review): a factorization failure (info != 0) is only reported on
 *         stderr and is not propagated to the caller.
 */
static inline int
zpotrf_batch_cpu( void *op_args,
                  cham_uplo_t uplo, int m, int n, int ndata,
                  const CHAM_desc_t *descA, CHAM_tile_t *tileA, ... )
{
    cham_uplo_t luplo = (intptr_t)op_args;
    int         info  = 0;
    int         tempnn;

    /* Only A is needed by this kernel */
    if ( ndata != 1 ) {
        fprintf( stderr, "zpotrf_batch_cpu: requires one piece of data and %d have been given\n", ndata );
    }

    /* The last column of tiles may be narrower than nb */
    tempnn = n == descA->nt-1 ? descA->n - n * descA->nb : descA->nb;

    TCORE_zpotrf(
        luplo, tempnn, tileA, &info );

    if ( info != 0 ) {
        fprintf( stderr, "zpotrf_batch_cpu: Failed to correctly factorize the tile (info = %d)\n", info );
    }

    (void)uplo;

    return 0;
}
#else
#define zpotrf_batch_cpu NULL
#endif
/*
 * Map operator handed to chameleon_pmap() by CHAMELEON_zpotrf_batch_Tile().
 * Only a CPU kernel is provided (or a NULL macro, depending on the build
 * configuration); there is no CUDA nor HIP implementation.
 */
static cham_map_operator_t zpotrf_batch_map = {
    .name     = "zpotrf",
    .cpufunc  = zpotrf_batch_cpu,
    .cudafunc = NULL,
    .hipfunc  = NULL,
};
/**
 ********************************************************************************
 *
 * @ingroup CHAMELEON_Complex64_t_Tile
 *
 *  CHAMELEON_zpotrf_batch_Tile - Performs multiple Cholesky factorizations in
 *  parallel, one per tile of the descriptor.
 *
 *******************************************************************************
 *
 * @param[in] uplo
 *          = ChamUpper: Upper triangle of A tiles are stored;
 *          = ChamLower: Lower triangle of A tiles are stored.
 *
 * @param[in,out] A
 *          A is a collection of mt-by-nt tiles of size A->mb by A->nb
 *          On exit, each tile is factorized with Cholesky factorization.
 *
 *******************************************************************************
 *
 * @return CHAMELEON_ERR_NOT_INITIALIZED if no CHAMELEON context is available
 * @return the status of the internal sequence once all tasks have completed
 *
 */
int CHAMELEON_zpotrf_batch_Tile( cham_uplo_t uplo, CHAM_desc_t *A )
{
    CHAM_context_t     *chamctxt;
    RUNTIME_sequence_t *sequence = NULL;
    RUNTIME_request_t   request  = RUNTIME_REQUEST_INITIALIZER;
    cham_map_data_t     data[1];
    int                 status;

    chamctxt = chameleon_context_self();
    if (chamctxt == NULL) {
        /* Report under the name of this entry point, not the non-batched one */
        chameleon_fatal_error("CHAMELEON_zpotrf_batch_Tile", "CHAMELEON not initialized");
        return CHAMELEON_ERR_NOT_INITIALIZED;
    }
    chameleon_sequence_create( chamctxt, &sequence );

    data[0].access = ChamRW;
    data[0].desc   = A;

    /* uplo is the only kernel argument: it travels inside the pointer value itself */
    chameleon_pmap( ChamUpperLower, 1, data,
                    &zpotrf_batch_map, (void*)((intptr_t)uplo),
                    sequence, &request );

    CHAMELEON_Desc_Flush( A, sequence );

    chameleon_sequence_wait( chamctxt, sequence );
    status = sequence->status;
    chameleon_sequence_destroy( chamctxt, sequence );
    return status;
}
/**
*
* @file ztrsm_batch.c
*
* @copyright 2019-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon batch ztrsm wrappers
*
* @version 1.3.0
* @author Mathieu Faverge
* @date 2024-04-03
* @precisions normal z -> s d c
*
*/
#include "control/common.h"
#if !defined(CHAMELEON_SIMULATION)
#include "coreblas/coreblas_ztile.h"
#if defined(CHAMELEON_USE_CUDA)
#include "gpucublas/gpucublas_z.h"
#endif
#endif
/**
 * @brief Parameters shared by every tile-level TRSM of a batch.
 *
 * One instance is built by CHAMELEON_ztrsm_batch_Tile() and reaches the
 * CPU/CUDA kernels through their opaque op_args pointer.
 */
typedef struct ztrsm_batch_args_s {
    cham_side_t           side;   /**< Side on which the tiles of A are applied */
    cham_uplo_t           uplo;   /**< Triangle of the A tiles that is stored   */
    cham_trans_t          transA; /**< Operation applied to the tiles of A      */
    cham_diag_t           diag;   /**< Unit / non-unit diagonal of the A tiles  */
    CHAMELEON_Complex64_t alpha;  /**< Scalar alpha forwarded to the TRSM       */
} ztrsm_batch_args_t;
#if !defined(CHAMELEON_SIMULATION)
/**
 * @brief CPU kernel of the batched TRSM: solves with the tile (m, n) of A and
 * overwrites the tile (m, n) of B.
 *
 * Registered as the cpufunc of ztrsm_batch_map. The descriptor/tile of B are
 * read from the variadic list.
 *
 * @param[in] op_args  Pointer to the ztrsm_batch_args_t of the batch.
 * @param[in] uplo     Unused; the uplo stored in op_args is the one applied.
 * @param[in] m        Row index of the tile in the descriptors.
 * @param[in] n        Column index of the tile in the descriptors.
 * @param[in] ndata    Number of (descriptor, tile) pairs given; 2 are expected.
 * @param[in] descA    Descriptor of A (unused).
 * @param[in] tileA    Tile (m, n) of A.
 * @param[in] ...      descB, tileB.
 *
 * @return 0 on success, -1 when fewer than two pieces of data are given.
 */
static inline int
ztrsm_batch_cpu( void *op_args,
                 cham_uplo_t uplo, int m, int n, int ndata,
                 const CHAM_desc_t *descA, CHAM_tile_t *tileA, ... )
{
    ztrsm_batch_args_t *trsm = (ztrsm_batch_args_t*)op_args;
    const CHAM_desc_t  *descB;
    CHAM_tile_t        *tileB;
    va_list             varargs;
    int                 rows, cols;

    (void)descA;
    (void)uplo;

    if ( ndata != 2 ) {
        fprintf( stderr, "ztrsm_batch_cpu: requires two pieces of data and %d have been given\n", ndata );
        if ( ndata < 2 ) {
            return -1;
        }
    }

    /* Fetch the descriptor/tile pair of B */
    va_start( varargs, tileA );
    descB = va_arg( varargs, const CHAM_desc_t * );
    tileB = va_arg( varargs, CHAM_tile_t * );
    va_end( varargs );

    /* Dimensions of the B tile: the last row/column of tiles may be smaller */
    rows = descB->mb;
    if ( m == descB->mt - 1 ) {
        rows = descB->m - m * descB->mb;
    }
    cols = descB->nb;
    if ( n == descB->nt - 1 ) {
        cols = descB->n - n * descB->nb;
    }

    TCORE_ztrsm( trsm->side, trsm->uplo, trsm->transA, trsm->diag,
                 rows, cols, trsm->alpha, tileA, tileB );

    return 0;
}
#else
#define ztrsm_batch_cpu NULL
#endif
#if !defined(CHAMELEON_SIMULATION) && defined(CHAMELEON_USE_CUDA)
/**
 * @brief CUDA kernel of the batched TRSM: solves with the tile (m, n) of A and
 * overwrites the tile (m, n) of B.
 *
 * Registered as the cudafunc of ztrsm_batch_map. The descriptor/tile of B are
 * read from the variadic list. Both tiles are asserted to be full-rank.
 *
 * @param[in] handle   cuBLAS handle forwarded to CUDA_ztrsm().
 * @param[in] op_args  Pointer to the ztrsm_batch_args_t of the batch.
 * @param[in] uplo     Unused; the uplo stored in op_args is the one applied.
 * @param[in] m        Row index of the tile in the descriptors.
 * @param[in] n        Column index of the tile in the descriptors.
 * @param[in] ndata    Number of (descriptor, tile) pairs given; 2 are expected.
 * @param[in] descA    Descriptor of A (unused).
 * @param[in] tileA    Tile (m, n) of A.
 * @param[in] ...      descB, tileB.
 *
 * @return 0 on success, -1 when fewer than two pieces of data are given.
 */
static inline int
ztrsm_batch_cuda( cublasHandle_t handle, void *op_args,
                  cham_uplo_t uplo, int m, int n, int ndata,
                  const CHAM_desc_t *descA, CHAM_tile_t *tileA, ... )
{
    ztrsm_batch_args_t *args = (ztrsm_batch_args_t*)op_args;
    const CHAM_desc_t  *descB;
    CHAM_tile_t        *tileB;
    va_list             ap;
    int                 tempmm, tempnn;

    /* A and B are needed: warn on any mismatch, give up only if data is missing */
    if ( ndata != 2 ) {
        fprintf( stderr, "ztrsm_batch_cuda: requires two pieces of data and %d have been given\n", ndata );
        if ( ndata < 2 ) {
            return -1;
        }
    }

    /* Get the second desc */
    va_start(ap, tileA);
    descB = va_arg(ap, const CHAM_desc_t *);
    tileB = va_arg(ap, CHAM_tile_t *);
    va_end(ap);

    /* The raw mat/ld fields are handed to the GPU routine below */
    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
    assert( tileB->format & CHAMELEON_TILE_FULLRANK );

    /* The last row/column of tiles may be smaller than mb/nb */
    tempmm = m == descB->mt-1 ? descB->m - m * descB->mb : descB->mb;
    tempnn = n == descB->nt-1 ? descB->n - n * descB->nb : descB->nb;

    CUDA_ztrsm(
        args->side, args->uplo, args->transA, args->diag,
        tempmm, tempnn, (cuDoubleComplex*)&(args->alpha),
        tileA->mat, tileA->ld,
        tileB->mat, tileB->ld,
        handle );

    (void)descA;
    (void)descB;
    (void)uplo;

    return 0;
}
#else
#define ztrsm_batch_cuda NULL
#endif
/*
 * Map operator handed to chameleon_pmap() by CHAMELEON_ztrsm_batch_Tile().
 * The cpu/cuda entries are either the kernels of this file or NULL macros,
 * depending on the build configuration; no HIP kernel is provided.
 */
static cham_map_operator_t ztrsm_batch_map = {
    .name     = "ztrsm",
    .cpufunc  = ztrsm_batch_cpu,
    .cudafunc = ztrsm_batch_cuda,
    .hipfunc  = NULL,
};
/**
 ********************************************************************************
 *
 * @ingroup CHAMELEON_Complex64_t_Tile
 *
 *  CHAMELEON_ztrsm_batch_Tile - Performs multiple triangular solves in
 *  parallel, one per tile of the descriptors.
 *
 *******************************************************************************
 *
 * @param[in] side
 *          Specifies whether tiles of A appear on the left or on the right of tiles of X:
 *          = ChamLeft:  A[i,j] * X[i,j] = B[i,j]
 *          = ChamRight: X[i,j] * A[i,j] = B[i,j]
 *
 * @param[in] uplo
 *          Specifies whether the tiles of A are upper triangular or lower triangular:
 *          = ChamUpper: Upper triangle of tiles of A are stored;
 *          = ChamLower: Lower triangle of tiles of A are stored.
 *
 * @param[in] trans
 *          Specifies whether the tiles of A are transposed, not transposed or conjugate transposed:
 *          = ChamNoTrans:   tiles of A are not transposed;
 *          = ChamTrans:     tiles of A are transposed;
 *          = ChamConjTrans: tiles of A are conjugate transposed.
 *
 * @param[in] diag
 *          Specifies whether or not the tiles of A are unit triangular:
 *          = ChamNonUnit: tiles of A are non unit;
 *          = ChamUnit:    tiles of A are unit.
 *
 * @param[in] alpha
 *          alpha specifies the scalar alpha
 *
 * @param[in] A
 *          A is a collection of mt-by-nt tiles of size A->mb by A->nb
 *
 * @param[in,out] B
 *          B is a collection of mt-by-nt tiles of size B->mb by B->nb
 *          On exit, each tile B[i,j] is overwritten by X[i,j]
 *
 *******************************************************************************
 *
 * @return CHAMELEON_ERR_NOT_INITIALIZED if no CHAMELEON context is available
 * @return the status of the internal sequence once all tasks have completed
 *
 */
int CHAMELEON_ztrsm_batch_Tile( cham_side_t side, cham_uplo_t uplo,
                                cham_trans_t trans, cham_diag_t diag,
                                CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B )
{
    ztrsm_batch_args_t  kernel_args = { side, uplo, trans, diag, alpha };
    cham_map_data_t     operands[2];
    RUNTIME_request_t   request = RUNTIME_REQUEST_INITIALIZER;
    RUNTIME_sequence_t *seq     = NULL;
    CHAM_context_t     *ctxt    = chameleon_context_self();
    int                 rc;

    if ( ctxt == NULL ) {
        chameleon_fatal_error("CHAMELEON_ztrsm_batch_Tile", "CHAMELEON not initialized");
        return CHAMELEON_ERR_NOT_INITIALIZED;
    }

    /* A is only read, B is read and overwritten by the solution */
    operands[0].desc   = A;
    operands[0].access = ChamR;
    operands[1].desc   = B;
    operands[1].access = ChamRW;

    chameleon_sequence_create( ctxt, &seq );

    /* The uplo applied by the kernels is the one stored in kernel_args */
    chameleon_pmap( ChamUpperLower, 2, operands, &ztrsm_batch_map, &kernel_args, seq, &request );

    CHAMELEON_Desc_Flush( A, seq );
    CHAMELEON_Desc_Flush( B, seq );

    /* kernel_args lives on the stack: wait for completion before returning */
    chameleon_sequence_wait( ctxt, seq );
    rc = seq->status;
    chameleon_sequence_destroy( ctxt, seq );

    return rc;
}
......@@ -23,7 +23,7 @@
* @author Florent Pruvost
* @author Alycia Lisito
* @author Matthieu Kuhn
* @date 2024-03-14
* @date 2024-04-03
* @precisions normal z -> c d s
*
*/
......@@ -394,6 +394,15 @@ int CHAMELEON_zgram_Tile_Async( cham_uplo_t uplo, CHAM_desc_t *A, void *user_ws,
int CHAMELEON_zprint( FILE *file, const char *header, cham_uplo_t uplo, int M, int N, CHAMELEON_Complex64_t *A, int LDA );
int CHAMELEON_zprint_Tile( FILE *file, const char *header, cham_uplo_t uplo, CHAM_desc_t *descA );
/**
* Batch function prototypes - Tile interface
*/
int CHAMELEON_zgemm_batch_Tile( cham_trans_t transA, cham_trans_t transB, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C );
int CHAMELEON_zherk_batch_Tile( cham_uplo_t uplo, cham_trans_t trans, double alpha, CHAM_desc_t *A, double beta, CHAM_desc_t *C );
int CHAMELEON_zplghe_batch_Tile( double bump, CHAM_desc_t *A, unsigned long long int seed );
int CHAMELEON_zpotrf_batch_Tile( cham_uplo_t uplo, CHAM_desc_t *A );
int CHAMELEON_ztrsm_batch_Tile( cham_side_t side, cham_uplo_t uplo, cham_trans_t trans, cham_diag_t diag, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B );
END_C_DECLS
#endif /* _chameleon_z_h_ */
......@@ -17,7 +17,7 @@
# Univ. of California Berkeley,
# Univ. of Colorado Denver.
#
# @version 1.2.0
# @version 1.3.0
# @author Cedric Castagnede
# @author Emmanuel Agullo
# @author Mathieu Faverge
......@@ -25,7 +25,7 @@
# @author Florent Pruvost
# @author Alycia Lisito
# @author Matthieu Kuhn
# @date 2022-02-22
# @date 2024-04-03
#
###
......@@ -105,7 +105,14 @@ set(ZSRC_WO_STDAPI
testing_zplrnk.c
testing_zcesca.c
testing_zgram.c
)
#
# Batch kernels
#
testing_zgemm_batch.c
testing_zherk_batch.c
testing_zpotrf_batch.c
testing_ztrsm_batch.c
)
foreach(_precision ${CHAMELEON_PRECISION} )
......
/**
*
* @file testing_zgemm_batch.c
*
* @copyright 2019-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon zgemm_batch testing
*
* @version 1.3.0
* @author Mathieu Faverge
* @date 2024-04-03
* @precisions normal z -> c d s
*
*/
#include <chameleon.h>
#include <chameleon_lapack.h>
#include "testings.h"
#include "testing_zcheck.h"
#include <chameleon/flops.h>
#if defined(CHAMELEON_TESTINGS_VENDOR) || !defined(CHAMELEON_SIMULATION)
#include <coreblas.h>
#endif
/**
 * @brief Flop count of a batch of identical GEMMs.
 *
 * @param[in] nb  Number of GEMMs in the batch.
 * @param[in] M, N, K  Dimensions forwarded to flops_zgemm().
 *
 * @return flops_zgemm( M, N, K ) multiplied by nb.
 */
static cham_fixdbl_t
flops_zgemm_batch( int nb, int M, int N, int K )
{
return flops_zgemm( M, N, K ) * nb;
}
/**
 * @brief Timing driver for CHAMELEON_zgemm_batch_Tile().
 *
 * Builds the A, B and C descriptors, fills them with random values, times one
 * call to the batched GEMM and reports nb*ib times the flops of one MxNxK GEMM.
 * No numerical check is implemented.
 *
 * @param[in,out] args  Run arguments of the test.
 * @param[in] check     When non-zero, a message states that no check exists.
 *
 * @return The value returned by CHAMELEON_zgemm_batch_Tile().
 */
int
testing_zgemm_batch_desc( run_arg_list_t *args, int check )
{
    testdata_t test_data = { .args = args };
    int        hres      = 0;

    /*
     * Read arguments. The calls are kept in this exact order: the seed
     * defaults consume random().
     * NOTE(review): P is read but never used below.
     */
    int                   async  = parameters_getvalue_int( "async" );
    int                   nb     = run_arg_get_int( args, "nb", 10 );
    int                   ib     = run_arg_get_int( args, "ib", 10 );
    int                   P      = parameters_getvalue_int( "P" );
    cham_trans_t          transA = run_arg_get_trans( args, "transA", ChamNoTrans );
    cham_trans_t          transB = run_arg_get_trans( args, "transB", ChamNoTrans );
    int                   N      = run_arg_get_int( args, "N", 320 );
    int                   M      = run_arg_get_int( args, "M", N );
    int                   K      = run_arg_get_int( args, "K", N );
    CHAMELEON_Complex64_t alpha  = testing_zalea();
    CHAMELEON_Complex64_t beta   = testing_zalea();
    int                   seedA  = run_arg_get_int( args, "seedA", random() );
    int                   seedB  = run_arg_get_int( args, "seedB", random() );
    int                   seedC  = run_arg_get_int( args, "seedC", random() );

    /* Descriptors and tile dimensions */
    CHAM_desc_t *descA, *descB, *descC;
    int          Am, An, Bm, Bn;

    alpha = run_arg_get_complex64( args, "alpha", alpha );
    beta  = run_arg_get_complex64( args, "beta", beta );

    CHAMELEON_Set( CHAMELEON_TILE_SIZE, N );

    /* Tile dimensions of A and B depend on the requested transpositions */
    Am = ( transA == ChamNoTrans ) ? M : K;
    An = ( transA == ChamNoTrans ) ? K : M;
    Bm = ( transB == ChamNoTrans ) ? K : N;
    Bn = ( transB == ChamNoTrans ) ? N : K;

    /* Create the matrices (presumably nb-by-ib tiles each; confirm against parameters_desc_create) */
    parameters_desc_create( "A", &descA, ChamComplexDouble, Am, An, nb * Am, ib * An, nb * Am, ib * An );
    parameters_desc_create( "B", &descB, ChamComplexDouble, Bm, Bn, nb * Bm, ib * Bn, nb * Bm, ib * Bn );
    parameters_desc_create( "C", &descC, ChamComplexDouble, M,  N,  nb * M,  ib * N,  nb * M,  ib * N );

    /* Fill the matrices with random values */
    CHAMELEON_zplrnt_Tile( descA, seedA );
    CHAMELEON_zplrnt_Tile( descB, seedB );
    CHAMELEON_zplrnt_Tile( descC, seedC );

    /* Timed region */
    testing_start( &test_data );
    if ( async ) {
        fprintf( stderr, "Async unavailable yet\n" );
    }
    hres = CHAMELEON_zgemm_batch_Tile( transA, transB, alpha, descA, descB, beta, descC );
    test_data.hres = hres;
    testing_stop( &test_data, flops_zgemm_batch( nb*ib, M, N, K ) );

    /* No reference implementation to compare against */
    if ( check ) {
        fprintf( stderr, "Check is not available for zgemm_batch\n" );
    }

    parameters_desc_destroy( &descA );
    parameters_desc_destroy( &descB );
    parameters_desc_destroy( &descC );

    return hres;
}
/* Test descriptor registered by testing_zgemm_batch_init() */
testing_t test_zgemm_batch;

/* Names of the run arguments accepted by the test */
const char *zgemm_batch_params[] = {
    "nb", "ib", "transA", "transB", "m", "n", "k",
    "alpha", "beta", "seedA", "seedB", "seedC",
    NULL
};

/* Output columns: none without check, "RETURN" with check */
const char *zgemm_batch_output[] = { NULL };
const char *zgemm_batch_outchk[] = { "RETURN", NULL };
/**
 * @brief Registers the zgemm_batch test.
 *
 * Declared with the constructor attribute: it fills test_zgemm_batch and
 * passes it to testing_register(). Only the descriptor interface is provided
 * (fptr_std is NULL).
 */
void testing_zgemm_batch_init( void ) __attribute__( ( constructor ) );
void
testing_zgemm_batch_init( void )
{
    /* Identification */
    test_zgemm_batch.name   = "zgemm_batch";
    test_zgemm_batch.helper = "Perform nb*ib general matrix-matrix multiply of size MxNxK";

    /* Parameter and output tables */
    test_zgemm_batch.params = zgemm_batch_params;
    test_zgemm_batch.output = zgemm_batch_output;
    test_zgemm_batch.outchk = zgemm_batch_outchk;

    /* Entry points */
    test_zgemm_batch.fptr_desc = testing_zgemm_batch_desc;
    test_zgemm_batch.fptr_std  = NULL;
    test_zgemm_batch.next      = NULL;

    testing_register( &test_zgemm_batch );
}
/**
*
* @file testing_zherk_batch.c
*
* @copyright 2019-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon zherk_batch testing
*
* @version 1.3.0
* @author Mathieu Faverge
* @date 2024-04-03
* @precisions normal z -> c d s
*
*/
#include <chameleon.h>
#include <chameleon_lapack.h>
#include "testings.h"
#include "testing_zcheck.h"
#include <chameleon/flops.h>
#if defined(CHAMELEON_TESTINGS_VENDOR) || !defined(CHAMELEON_SIMULATION)
#include <coreblas.h>
#endif
/**
 * @brief Flop count of a batch of identical HERKs.
 *
 * @param[in] nb  Number of HERKs in the batch.
 * @param[in] K, N  Dimensions forwarded to flops_zherk().
 *
 * @return flops_zherk( K, N ) multiplied by nb.
 */
static cham_fixdbl_t
flops_zherk_batch( int nb, int K, int N )
{
return flops_zherk( K, N ) * nb;
}
/**
 * @brief Timing driver for CHAMELEON_zherk_batch_Tile().
 *
 * Builds the A and C descriptors, fills A with random values and C through
 * CHAMELEON_zplghe_batch_Tile(), times one call to the batched HERK and
 * reports nb*ib times the flops of one HERK. No numerical check is implemented.
 *
 * @param[in,out] args  Run arguments of the test.
 * @param[in] check     When non-zero, a message states that no check exists.
 *
 * @return The value returned by CHAMELEON_zherk_batch_Tile().
 */
int
testing_zherk_batch_desc( run_arg_list_t *args, int check )
{
    testdata_t test_data = { .args = args };
    int        hres      = 0;

    /*
     * Read arguments. The calls are kept in this exact order: the seed
     * defaults consume random().
     * NOTE(review): P is read but never used below.
     */
    int          async = parameters_getvalue_int( "async" );
    int          nb    = run_arg_get_int( args, "nb", 10 );
    int          ib    = run_arg_get_int( args, "ib", 10 );
    int          P     = parameters_getvalue_int( "P" );
    cham_trans_t trans = run_arg_get_trans( args, "trans", ChamNoTrans );
    cham_uplo_t  uplo  = run_arg_get_uplo( args, "uplo", ChamUpper );
    int          N     = run_arg_get_int( args, "N", 320 );
    int          K     = run_arg_get_int( args, "K", N );
    double       alpha = testing_dalea();
    double       beta  = testing_dalea();
    double       bump  = 0.;
    int          seedA = run_arg_get_int( args, "seedA", random() );
    int          seedC = run_arg_get_int( args, "seedC", random() );

    /* Descriptors and tile dimensions */
    CHAM_desc_t *descA, *descC;
    int          Am, An;

    alpha = run_arg_get_double( args, "alpha", alpha );
    beta  = run_arg_get_double( args, "beta", beta );
    bump  = run_arg_get_double( args, "bump", 0. );

    CHAMELEON_Set( CHAMELEON_TILE_SIZE, N );

    /* Tile dimensions of A depend on the requested transposition */
    Am = ( trans == ChamNoTrans ) ? N : K;
    An = ( trans == ChamNoTrans ) ? K : N;

    /* Create the matrices (presumably nb-by-ib tiles each; confirm against parameters_desc_create) */
    parameters_desc_create( "A", &descA, ChamComplexDouble, Am, An, nb * Am, ib * An, nb * Am, ib * An );
    parameters_desc_create( "C", &descC, ChamComplexDouble, N,  N,  nb * N,  ib * N,  nb * N,  ib * N );

    /* Random A, one hermitian matrix per tile of C */
    CHAMELEON_zplrnt_Tile( descA, seedA );
    CHAMELEON_zplghe_batch_Tile( bump, descC, seedC );

    /* Timed region */
    testing_start( &test_data );
    if ( async ) {
        fprintf( stderr, "Async unavailable yet\n" );
    }
    hres = CHAMELEON_zherk_batch_Tile( uplo, trans, alpha, descA, beta, descC );
    test_data.hres = hres;
    testing_stop( &test_data, flops_zherk_batch( nb*ib, K, N ) );

    /* No reference implementation to compare against */
    if ( check ) {
        fprintf( stderr, "Check is not available for zherk_batch\n" );
    }

    parameters_desc_destroy( &descA );
    parameters_desc_destroy( &descC );

    return hres;
}
/* Test descriptor registered by testing_zherk_batch_init() */
testing_t test_zherk_batch;

/* Names of the run arguments accepted by the test */
const char *zherk_batch_params[] = {
    "nb", "ib", "trans", "uplo", "n", "k",
    "alpha", "beta", "seedA", "seedC", "bump",
    NULL
};

/* Output columns: none without check, "RETURN" with check */
const char *zherk_batch_output[] = { NULL };
const char *zherk_batch_outchk[] = { "RETURN", NULL };
/**
* @brief Testing registration function
*/
void testing_zherk_batch_init( void ) __attribute__( ( constructor ) );
void
testing_zherk_batch_init( void )
{
test_zherk_batch.name = "zherk_batch";
test_zherk_batch.helper = "Perform nb*ib rank-k updates zherk( uplo, trans, N, K, ... )";
test_zherk_batch.params = zherk_batch_params;
test_zherk_batch.output = zherk_batch_output;
test_zherk_batch.outchk = zherk_batch_outchk;
test_zherk_batch.fptr_desc = testing_zherk_batch_desc;
test_zherk_batch.fptr_std = NULL;
test_zherk_batch.next = NULL;
testing_register( &test_zherk_batch );
}
/**
*
* @file testing_zpotrf_batch.c
*
* @copyright 2019-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon zpotrf_batch testing
*
* @version 1.3.0
* @author Mathieu Faverge
* @date 2024-04-03
* @precisions normal z -> c d s
*
*/
#include <chameleon.h>
#include <chameleon_lapack.h>
#include "testings.h"
#include "testing_zcheck.h"
#include <chameleon/flops.h>
#if defined(CHAMELEON_TESTINGS_VENDOR) || !defined(CHAMELEON_SIMULATION)
#include <coreblas.h>
#endif
/**
 * @brief Flop count of a batch of factorizations of order N.
 *
 * @param nb Multiplier applied to the flop count of a single problem
 *           (the caller in this file passes nb*ib).
 * @param N  Order forwarded to flops_zpotrf().
 *
 * @return flops_zpotrf( N ) multiplied by nb.
 */
static cham_fixdbl_t
flops_zpotrf_batch( int nb, int N )
{
return flops_zpotrf( N ) * nb;
}
/**
 * @brief Time a batch of Cholesky factorizations through the descriptor
 *        interface.
 *
 * Reads the test parameters, creates and fills the A descriptor, then times a
 * single call to CHAMELEON_zpotrf_batch_Tile(). No numerical check is
 * implemented: when @p check is non-zero only a message is printed on stderr.
 *
 * @param args  Argument list the per-run parameters are read from.
 * @param check Non-zero when a check is requested.
 *
 * @return The value returned by CHAMELEON_zpotrf_batch_Tile().
 */
int
testing_zpotrf_batch_desc( run_arg_list_t *args, int check )
{
    testdata_t test_data = { .args = args };
    int        hres      = 0;

    /* Read arguments */
    int         async = parameters_getvalue_int( "async" );
    int         nb    = run_arg_get_int( args, "nb", 10 );
    int         ib    = run_arg_get_int( args, "ib", 10 );
    int         P     = parameters_getvalue_int( "P" );
    cham_uplo_t uplo  = run_arg_get_uplo( args, "uplo", ChamUpper );
    int         N     = run_arg_get_int( args, "N", 320 );
    int         seedA = run_arg_get_int( args, "seedA", random() );
    /*
     * NOTE(review): Q is not used below. The call is kept because
     * parameters_compute_q() may validate or update the process grid
     * as a side effect - TODO confirm before removing.
     */
    int         Q     = parameters_compute_q( P );

    /* Descriptors */
    CHAM_desc_t *descA;

    (void)Q;

    CHAMELEON_Set( CHAMELEON_TILE_SIZE, N );

    /* Create the matrices */
    parameters_desc_create( "A", &descA, ChamComplexDouble, N, N, nb * N, ib * N, nb * N, ib * N );

    /* Fill the matrix with the batch plghe generator, passing N as its first argument */
    CHAMELEON_zplghe_batch_Tile( (double)N, descA, seedA );

    /* Start measurement */
    testing_start( &test_data );
    if ( async ) {
        fprintf( stderr, "Async unavailable yet\n" );
    }
    hres = CHAMELEON_zpotrf_batch_Tile( uplo, descA );

    /* Stop measurement */
    test_data.hres = hres;
    testing_stop( &test_data, flops_zpotrf_batch( nb * ib, N ) );

    /* Check the solution */
    if ( check ) {
        /* The message names this kernel (it used to mention gemm_batch) */
        fprintf( stderr, "Check is not available for zpotrf_batch\n" );
    }

    parameters_desc_destroy( &descA );

    return hres;
}
/* Test descriptor for zpotrf_batch; its fields are set in testing_zpotrf_batch_init() */
testing_t test_zpotrf_batch;

/* NULL-terminated list of the parameter names attached to the test */
const char *zpotrf_batch_params[] = {
    "nb",
    "ib",
    "uplo",
    "n",
    "seedA",
    NULL
};

/* NULL-terminated list assigned to the .output field (empty for this test) */
const char *zpotrf_batch_output[] = { NULL };

/* NULL-terminated list assigned to the .outchk field */
const char *zpotrf_batch_outchk[] = { "RETURN", NULL };
/**
* @brief Testing registration function
*/
void testing_zpotrf_batch_init( void ) __attribute__( ( constructor ) );
void
testing_zpotrf_batch_init( void )
{
test_zpotrf_batch.name = "zpotrf_batch";
test_zpotrf_batch.helper = "Perform nb*ib Cholesky factorization potrf( uplo, N, ... )";
test_zpotrf_batch.params = zpotrf_batch_params;
test_zpotrf_batch.output = zpotrf_batch_output;
test_zpotrf_batch.outchk = zpotrf_batch_outchk;
test_zpotrf_batch.fptr_desc = testing_zpotrf_batch_desc;
test_zpotrf_batch.fptr_std = NULL;
test_zpotrf_batch.next = NULL;
testing_register( &test_zpotrf_batch );
}
/**
*
* @file testing_ztrsm_batch.c
*
* @copyright 2019-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon ztrsm_batch testing
*
* @version 1.3.0
* @author Mathieu Faverge
* @date 2024-04-03
* @precisions normal z -> c d s
*
*/
#include <chameleon.h>
#include <chameleon_lapack.h>
#include "testings.h"
#include "testing_zcheck.h"
#include <chameleon/flops.h>
#if defined(CHAMELEON_TESTINGS_VENDOR) || !defined(CHAMELEON_SIMULATION)
#include <coreblas.h>
#endif
/**
 * @brief Flop count of a batch of triangular solves.
 *
 * @param nb   Multiplier applied to the flop count of a single problem
 *             (the caller in this file passes nb*ib).
 * @param side Side forwarded to flops_ztrsm().
 * @param M    First dimension forwarded to flops_ztrsm().
 * @param N    Second dimension forwarded to flops_ztrsm().
 *
 * @return flops_ztrsm( side, M, N ) multiplied by nb.
 */
static cham_fixdbl_t
flops_ztrsm_batch( int nb, cham_side_t side, int M, int N )
{
return flops_ztrsm( side, M, N ) * nb;
}
/**
 * @brief Time a batch of triangular solves through the descriptor interface.
 *
 * Reads the test parameters, creates and fills the A and B descriptors, then
 * times a single call to CHAMELEON_ztrsm_batch_Tile(). No numerical check is
 * implemented: when @p check is non-zero only a message is printed on stderr.
 *
 * @param args  Argument list the per-run parameters are read from.
 * @param check Non-zero when a check is requested.
 *
 * @return The value returned by CHAMELEON_ztrsm_batch_Tile().
 */
int
testing_ztrsm_batch_desc( run_arg_list_t *args, int check )
{
    testdata_t test_data = { .args = args };
    int        hres      = 0;

    /* Read arguments */
    int                   async = parameters_getvalue_int( "async" );
    int                   nb    = run_arg_get_int( args, "nb", 10 );
    int                   ib    = run_arg_get_int( args, "ib", 10 );
    int                   P     = parameters_getvalue_int( "P" );
    cham_trans_t          trans = run_arg_get_trans( args, "trans", ChamNoTrans );
    cham_side_t           side  = run_arg_get_side( args, "side", ChamLeft );
    cham_uplo_t           uplo  = run_arg_get_uplo( args, "uplo", ChamUpper );
    cham_diag_t           diag  = run_arg_get_diag( args, "diag", ChamNonUnit );
    int                   N     = run_arg_get_int( args, "N", 320 );
    int                   M     = run_arg_get_int( args, "M", N );
    CHAMELEON_Complex64_t alpha = testing_zalea();
    int                   seedA = run_arg_get_int( args, "seedA", random() );
    int                   seedB = run_arg_get_int( args, "seedB", random() );

    /* Descriptors (the unused descC, Bm and Bn locals have been dropped) */
    int          Am, An;
    CHAM_desc_t *descA, *descB;

    alpha = run_arg_get_complex64( args, "alpha", alpha );

    /* P is read but not used by this test */
    (void)P;

    CHAMELEON_Set( CHAMELEON_TILE_SIZE, N );

    /* Calculate the dimensions of A according to the side: A is square,
     * of order M when applied from the left and of order N otherwise */
    if ( side == ChamLeft ) {
        Am = M;
        An = M;
    }
    else {
        Am = N;
        An = N;
    }

    /* Create the matrices */
    parameters_desc_create( "A", &descA, ChamComplexDouble, Am, An, nb * Am, ib * An, nb * Am, ib * An );
    parameters_desc_create( "B", &descB, ChamComplexDouble, M, N, nb * M, ib * N, nb * M, ib * N );

    /* Fill the matrices with random values */
    CHAMELEON_zplrnt_Tile( descA, seedA );
    CHAMELEON_zplrnt_Tile( descB, seedB );

    /* Start measurement */
    testing_start( &test_data );
    if ( async ) {
        fprintf( stderr, "Async unavailable yet\n" );
    }
    hres = CHAMELEON_ztrsm_batch_Tile( side, uplo, trans, diag, alpha, descA, descB );

    /* Stop measurement */
    test_data.hres = hres;
    testing_stop( &test_data, flops_ztrsm_batch( nb * ib, side, M, N ) );

    /* Check the solution */
    if ( check ) {
        fprintf( stderr, "Check is not available for ztrsm_batch\n" );
    }

    parameters_desc_destroy( &descA );
    parameters_desc_destroy( &descB );

    return hres;
}
/* Test descriptor for ztrsm_batch; its fields are set in testing_ztrsm_batch_init() */
testing_t test_ztrsm_batch;

/* NULL-terminated list of the parameter names attached to the test */
const char *ztrsm_batch_params[] = {
    "nb",
    "ib",
    "side",
    "uplo",
    "trans",
    "diag",
    "m",
    "n",
    "alpha",
    "seedA",
    "seedB",
    NULL
};

/* NULL-terminated list assigned to the .output field (empty for this test) */
const char *ztrsm_batch_output[] = { NULL };

/* NULL-terminated list assigned to the .outchk field */
const char *ztrsm_batch_outchk[] = { "RETURN", NULL };
/**
 * @brief Fill the ztrsm_batch test descriptor and register it.
 *
 * Declared with the constructor attribute, so the registration is triggered
 * at load time without any explicit call.
 */
void testing_ztrsm_batch_init( void ) __attribute__( ( constructor ) );
void
testing_ztrsm_batch_init( void )
{
    test_ztrsm_batch.name   = "ztrsm_batch";
    /* Helper text: "trns" was a typo for the "trans" parameter */
    test_ztrsm_batch.helper = "Perform nb*ib triangular solve trsm( side, uplo, trans, diag, M, N, ... )";
    test_ztrsm_batch.params = ztrsm_batch_params;
    test_ztrsm_batch.output = ztrsm_batch_output;
    test_ztrsm_batch.outchk = ztrsm_batch_outchk;

    /* Only the descriptor-based entry point is provided */
    test_ztrsm_batch.fptr_desc = testing_ztrsm_batch_desc;
    test_ztrsm_batch.fptr_std  = NULL;
    test_ztrsm_batch.next      = NULL;

    testing_register( &test_ztrsm_batch );
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment