/**
 *
 * @copyright 2009-2014 The University of Tennessee and The University of
 *                      Tennessee Research Foundation. All rights reserved.
 * @copyright 2012-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
 *                      Univ. Bordeaux. All rights reserved.
 *
 * @author Hatem Ltaief
 * @author Jakub Kurzak
 * @author Mathieu Faverge
 * @author Emmanuel Agullo
 * @author Cedric Castagnede
 * @author Gwenole Lucas
 * @author Loris Lucido
 * @author Terry Cojean
 * @precisions normal z -> c d s
 *
 */
#include "chameleon_starpu_internal.h"
#if !defined(CHAMELEON_SIMULATION)
/**
 * CPU implementation of the zgemm codelet.
 *
 * Unpacks the three tile interfaces (A, B, C) and the codelet arguments,
 * then forwards to the CPU core kernel:
 *     C = alpha * op(A) * op(B) + beta * C
 *
 * Fix: the closing brace of the function was missing; scrape artifacts
 * removed from the body.
 */
static void
cl_zgemm_cpu_func( void *descr[], void *cl_arg )
{
    struct cl_zgemm_args_s *clargs = (struct cl_zgemm_args_s *)cl_arg;
    CHAM_tile_t *tileA;
    CHAM_tile_t *tileB;
    CHAM_tile_t *tileC;

    tileA = cti_interface_get(descr[0]);
    tileB = cti_interface_get(descr[1]);
    tileC = cti_interface_get(descr[2]);

    /* This codelet is instantiated for double complex only */
    assert( tileA->flttype == ChamComplexDouble );
    assert( tileB->flttype == ChamComplexDouble );
    assert( tileC->flttype == ChamComplexDouble );

    TCORE_zgemm( clargs->transA, clargs->transB,
                 clargs->m, clargs->n, clargs->k,
                 clargs->alpha, tileA, tileB,
                 clargs->beta, tileC );
}
#if defined(CHAMELEON_USE_CUDA)
/**
 * CUDA implementation of the zgemm codelet.
 *
 * Mirrors the HIP variant below: fetches the local cuBLAS handle, checks
 * that all tiles are stored full-rank, and calls the CUDA gemm wrapper.
 *
 * Fix: the function header (`static void`, opening brace), the trailing
 * `handle` argument of the CUDA_zgemm call, and the closing brace were
 * missing from the garbled source; restored from the intact HIP analog.
 */
static void
cl_zgemm_cuda_func( void *descr[], void *cl_arg )
{
    struct cl_zgemm_args_s *clargs = (struct cl_zgemm_args_s *)cl_arg;
    cublasHandle_t handle = starpu_cublas_get_local_handle();
    CHAM_tile_t *tileA;
    CHAM_tile_t *tileB;
    CHAM_tile_t *tileC;

    tileA = cti_interface_get(descr[0]);
    tileB = cti_interface_get(descr[1]);
    tileC = cti_interface_get(descr[2]);

    /* The GPU kernel addresses tile->mat directly: full-rank layout required */
    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
    assert( tileB->format & CHAMELEON_TILE_FULLRANK );
    assert( tileC->format & CHAMELEON_TILE_FULLRANK );

    CUDA_zgemm(
        clargs->transA, clargs->transB,
        clargs->m, clargs->n, clargs->k,
        (cuDoubleComplex*)&(clargs->alpha),
        tileA->mat, tileA->ld,
        tileB->mat, tileB->ld,
        (cuDoubleComplex*)&(clargs->beta),
        tileC->mat, tileC->ld,
        handle );

    return;
}
#endif /* defined(CHAMELEON_USE_CUDA) */
#if defined(CHAMELEON_USE_HIP)
/**
 * HIP implementation of the zgemm codelet.
 *
 * Retrieves the tile interfaces and the per-worker hipBLAS handle, then
 * delegates C = alpha * op(A) * op(B) + beta * C to the HIP gemm wrapper.
 */
static void
cl_zgemm_hip_func( void *descr[], void *cl_arg )
{
    struct cl_zgemm_args_s *clargs = (struct cl_zgemm_args_s *)cl_arg;
    hipblasHandle_t handle = starpu_hipblas_get_local_handle();

    CHAM_tile_t *tileA = cti_interface_get( descr[0] );
    CHAM_tile_t *tileB = cti_interface_get( descr[1] );
    CHAM_tile_t *tileC = cti_interface_get( descr[2] );

    /* The GPU kernel addresses tile->mat directly: full-rank layout required */
    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
    assert( tileB->format & CHAMELEON_TILE_FULLRANK );
    assert( tileC->format & CHAMELEON_TILE_FULLRANK );

    HIP_zgemm( clargs->transA, clargs->transB,
               clargs->m, clargs->n, clargs->k,
               (hipblasDoubleComplex*)&(clargs->alpha),
               tileA->mat, tileA->ld,
               tileB->mat, tileB->ld,
               (hipblasDoubleComplex*)&(clargs->beta),
               tileC->mat, tileC->ld,
               handle );
}
#endif /* defined(CHAMELEON_USE_HIP) */
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
/* Fix: the #endif closing this conditional was missing from the source. */
#if defined(CHAMELEON_USE_HIP)
CODELETS_GPU( zgemm, cl_zgemm_cpu_func, cl_zgemm_hip_func, STARPU_HIP_ASYNC )
#else
CODELETS( zgemm, cl_zgemm_cpu_func, cl_zgemm_cuda_func, STARPU_CUDA_ASYNC )
#endif
#if defined(CHAMELEON_STARPU_USE_INSERT)
/**
 * Insert a zgemm task whose computation is owned by tile A ("A static"
 * mapping): the task executes on A's rank, and C is accessed in a reduction
 * fashion when the runtime supports it.
 *
 * Fixes: declared the previously-undeclared `accessC` variable; removed the
 * scrape artifacts (bare line numbers and blame lines) that corrupted the
 * body; made the alpha==0 comparison explicitly cast, consistent with the
 * sibling INSERT_TASK_zgemm.
 */
void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
                              cham_trans_t transA, cham_trans_t transB,
                              int m, int n, int k, int nb,
                              CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
                              const CHAM_desc_t *B, int Bm, int Bn,
                              CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn )
{
    void (*callback)(void*);
    struct cl_zgemm_args_s *clargs = NULL;
    int exec = 0;
    int accessC;
    const char *cl_name = "zgemm_Astat";
    uint32_t where = cl_zgemm.where;

    /* alpha == 0: the gemm degenerates into a scaling of C by beta */
    if ( alpha == (CHAMELEON_Complex64_t)0. ) {
        INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
                             beta, C, Cm, Cn );
        return;
    }

    /* Handle cache */
    CHAMELEON_BEGIN_ACCESS_DECLARATION;
    /* Check A as write, since it will be the owner of the computation */
    CHAMELEON_ACCESS_W(A, Am, An);
    CHAMELEON_ACCESS_R(B, Bm, Bn);
    /* Check C as read, since it will be used in a reduction */
    CHAMELEON_ACCESS_R(C, Cm, Cn);
    exec = __chameleon_need_exec;
    CHAMELEON_END_ACCESS_DECLARATION;

    if ( exec ) {
        clargs = malloc( sizeof( struct cl_zgemm_args_s ) );
        clargs->transA = transA;
        clargs->transB = transB;
        clargs->m      = m;
        clargs->n      = n;
        clargs->k      = k;
        clargs->alpha  = alpha;
        clargs->beta   = beta;
    }

    /* Callback for profiling information */
    callback = options->profiling ? cl_zgemm_callback : NULL;

    /* Reduce the C access if needed */
    if ( beta == (CHAMELEON_Complex64_t)0. ) {
        accessC = STARPU_W;
    }
#if defined(HAVE_STARPU_MPI_REDUX)
    else if ( beta == (CHAMELEON_Complex64_t)1. ) {
        accessC = STARPU_MPI_REDUX;
    }
#endif
    else {
        accessC = STARPU_RW;
    }

    /* Refine name */
    cl_name = chameleon_codelet_name( cl_name, 3,
                                      A->get_blktile( A, Am, An ),
                                      B->get_blktile( B, Bm, Bn ),
                                      C->get_blktile( C, Cm, Cn ) );

    /* WARNING: CUDA 12.3 has an issue when k=1 in complex, thus we disable gemm on gpu in these cases */
#if defined(PRECISION_z) || defined(PRECISION_c)
    if ( k == 1 ) {
        where = STARPU_CPU;
    }
#endif

    /* Insert the task */
    rt_starpu_insert_task(
        &cl_zgemm,
        /* Task codelet arguments */
        STARPU_CL_ARGS, clargs, sizeof(struct cl_zgemm_args_s),
        /* Task handles */
        STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An),
        STARPU_R, RTBLKADDR(B, ChamComplexDouble, Bm, Bn),
        accessC,  RTBLKADDR(C, ChamComplexDouble, Cm, Cn),
        /* Common task arguments */
        STARPU_PRIORITY,        options->priority,
        STARPU_CALLBACK,        callback,
        STARPU_EXECUTE_ON_NODE, A->get_rankof(A, Am, An),
        STARPU_NAME,            cl_name,
        STARPU_EXECUTE_WHERE,   where,
        0 );
}
/**
 * Insert a zgemm task with the default mapping (task runs where C lives):
 *     C = alpha * op(A) * op(B) + beta * C
 *
 * Fixes: restored the missing opening brace of the function body, the
 * missing CHAMELEON_END_ACCESS_DECLARATION, the missing `accessC`
 * declaration, and the missing terminating `0 );` of the insert call plus
 * the closing brace; removed scrape blame artifacts.
 */
void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
                        cham_trans_t transA, cham_trans_t transB,
                        int m, int n, int k, int nb,
                        CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
                        const CHAM_desc_t *B, int Bm, int Bn,
                        CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn )
{
    void (*callback)(void*);
    struct cl_zgemm_args_s *clargs = NULL;
    int exec = 0;
    int accessC;
    const char *cl_name = "zgemm";
    uint32_t where = cl_zgemm.where;

    /* alpha == 0: the gemm degenerates into a scaling of C by beta */
    if ( alpha == (CHAMELEON_Complex64_t)0. ) {
        INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
                             beta, C, Cm, Cn );
        return;
    }

    /* Handle cache */
    CHAMELEON_BEGIN_ACCESS_DECLARATION;
    CHAMELEON_ACCESS_R(A, Am, An);
    CHAMELEON_ACCESS_R(B, Bm, Bn);
    CHAMELEON_ACCESS_RW(C, Cm, Cn);
    exec = __chameleon_need_exec;
    CHAMELEON_END_ACCESS_DECLARATION;

    if ( exec ) {
        clargs = malloc( sizeof( struct cl_zgemm_args_s ) );
        clargs->transA = transA;
        clargs->transB = transB;
        clargs->m      = m;
        clargs->n      = n;
        clargs->k      = k;
        clargs->alpha  = alpha;
        clargs->beta   = beta;
    }

    /* Callback for profiling information */
    callback = options->profiling ? cl_zgemm_callback : NULL;

    /* Reduce the C access if needed: W when C is overwritten, commutative RW
     * when the update is a pure accumulation (beta == 1) */
    accessC = ( beta == (CHAMELEON_Complex64_t)0. ) ? STARPU_W :
        (STARPU_RW | ((beta == (CHAMELEON_Complex64_t)1.) ? STARPU_COMMUTE : 0));

    /* Refine name */
    cl_name = chameleon_codelet_name( cl_name, 3,
                                      A->get_blktile( A, Am, An ),
                                      B->get_blktile( B, Bm, Bn ),
                                      C->get_blktile( C, Cm, Cn ) );

    /* WARNING: CUDA 12.3 has an issue when k=1 in complex, thus we disable gemm on gpu in these cases */
#if defined(PRECISION_z) || defined(PRECISION_c)
    if ( k == 1 ) {
        where = STARPU_CPU;
    }
#endif

    /* Insert the task */
    rt_starpu_insert_task(
        &cl_zgemm,
        /* Task codelet arguments */
        STARPU_CL_ARGS, clargs, sizeof(struct cl_zgemm_args_s),
        /* Task handles */
        STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An),
        STARPU_R, RTBLKADDR(B, ChamComplexDouble, Bm, Bn),
        accessC,  RTBLKADDR(C, ChamComplexDouble, Cm, Cn),
        /* Common task arguments */
        STARPU_PRIORITY,          options->priority,
        STARPU_CALLBACK,          callback,
        STARPU_EXECUTE_ON_WORKER, options->workerid,
        STARPU_POSSIBLY_PARALLEL, options->parallel,
        STARPU_NAME,              cl_name,
        STARPU_EXECUTE_WHERE,     where,
        0 );
}
#else
/**
 * Common implementation of the zgemm task insertion through the explicit
 * starpu_task_create()/starpu_task_submit() path.
 *
 * @param xrank   Rank on which the task must execute.
 * @param accessC StarPU access mode for the C tile (W / RW / RW|COMMUTE /
 *                MPI_REDUX), chosen by the public wrappers below.
 *
 * Fixes: repaired the HTML-mangled `&params` arguments (appeared as
 * `¶ms` in the scraped source); restored the closing brace of the
 * -ENODEV error branch; removed scrape artifacts (bare line numbers).
 */
void __INSERT_TASK_zgemm( const RUNTIME_option_t *options,
                          int xrank, int accessC,
                          cham_trans_t transA, cham_trans_t transB,
                          int m, int n, int k, int nb,
                          CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
                          const CHAM_desc_t *B, int Bm, int Bn,
                          CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn )
{
    /* alpha == 0: the gemm degenerates into a scaling of C by beta */
    if ( alpha == (CHAMELEON_Complex64_t)0. ) {
        INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
                             beta, C, Cm, Cn );
        return;
    }

    INSERT_TASK_COMMON_PARAMETERS( zgemm, 3 );

    /*
     * Register the data handles and initialize exchanges if needed
     */
    starpu_cham_exchange_init_params( options, &params, xrank );
    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R );
    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, STARPU_R );
    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC );

    /*
     * Not involved, let's return
     */
    if ( nbdata == 0 ) {
        return;
    }

    if ( params.do_execute )
    {
        int ret;
        struct starpu_task *task = starpu_task_create();
        task->cl = cl;

        /* WARNING: CUDA 12.3 has an issue when k=1 in complex, thus we disable gemm on gpu in these cases */
#if defined(PRECISION_z) || defined(PRECISION_c)
        if ( k == 1 ) {
            task->where = STARPU_CPU;
        }
#endif

        /* Set codelet parameters */
        clargs = malloc( sizeof( struct cl_zgemm_args_s ) );
        clargs->transA = transA;
        clargs->transB = transB;
        clargs->m      = m;
        clargs->n      = n;
        clargs->k      = k;
        clargs->alpha  = alpha;
        clargs->beta   = beta;

        task->cl_arg      = clargs;
        task->cl_arg_size = sizeof( struct cl_zgemm_args_s );
        task->cl_arg_free = 1; /* StarPU frees clargs with the task */

        /* Set common parameters */
        starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgemm_callback );

        /* Flops */
        task->flops = flops_zgemm( m, n, k );

        /* Refine name */
        task->name = chameleon_codelet_name( cl_name, 3,
                                             A->get_blktile( A, Am, An ),
                                             B->get_blktile( B, Bm, Bn ),
                                             C->get_blktile( C, Cm, Cn ) );

        ret = starpu_task_submit( task );
        if ( ret == -ENODEV ) {
            /* Task never ran: prevent the automatic destroy-on-completion
             * and release it manually before reporting the failure. */
            task->destroy = 0;
            starpu_task_destroy( task );
            chameleon_error( "INSERT_TASK_zgemm", "Failed to submit the task to StarPU" );
            return;
        }
    }

    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );

    (void)nb;
}
/**
 * Insert a zgemm task mapped on A's rank ("A static" variant).
 * C is accessed through an MPI reduction when the runtime supports it.
 */
void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
                              cham_trans_t transA, cham_trans_t transB,
                              int m, int n, int k, int nb,
                              CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
                              const CHAM_desc_t *B, int Bm, int Bn,
                              CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn )
{
    /* Pick the lightest access mode for C depending on beta */
    int accessC;
    if ( beta == (CHAMELEON_Complex64_t)0. ) {
        accessC = STARPU_W;        /* C fully overwritten */
    }
#if defined(HAVE_STARPU_MPI_REDUX)
    else if ( beta == (CHAMELEON_Complex64_t)1. ) {
        accessC = STARPU_MPI_REDUX; /* pure accumulation: use a reduction */
    }
#endif
    else {
        accessC = STARPU_RW;
    }

    /* A owns the computation, so the task is mapped on A's rank */
    __INSERT_TASK_zgemm( options,
                         A->get_rankof( A, Am, An ), accessC,
                         transA, transB, m, n, k, nb,
                         alpha, A, Am, An,
                         B, Bm, Bn,
                         beta, C, Cm, Cn );
}
/**
 * Insert a zgemm task with the default mapping (on C's rank):
 *     C = alpha * op(A) * op(B) + beta * C
 */
void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
                        cham_trans_t transA, cham_trans_t transB,
                        int m, int n, int k, int nb,
                        CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
                        const CHAM_desc_t *B, int Bm, int Bn,
                        CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn )
{
    /* Pick the lightest access mode for C depending on beta */
    int accessC = STARPU_RW;
    if ( beta == (CHAMELEON_Complex64_t)0. ) {
        accessC = STARPU_W;                  /* C fully overwritten */
    }
    else if ( beta == (CHAMELEON_Complex64_t)1. ) {
        accessC = STARPU_RW | STARPU_COMMUTE; /* accumulations may commute */
    }

    __INSERT_TASK_zgemm( options,
                         C->get_rankof( C, Cm, Cn ), accessC,
                         transA, transB, m, n, k, nb,
                         alpha, A, Am, An,
                         B, Bm, Bn,
                         beta, C, Cm, Cn );
}
#endif