/**
 *
 * @copyright 2009-2014 The University of Tennessee and The University of
 *                      Tennessee Research Foundation. All rights reserved.
 * @copyright 2012-2022 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
 *                      Univ. Bordeaux. All rights reserved.
 *
 * @version 1.2.0
 * @author Hatem Ltaief
 * @author Jakub Kurzak
 * @author Mathieu Faverge
 * @author Emmanuel Agullo
 * @author Cedric Castagnede
 * @author Gwenole Lucas
 * @date 2022-02-22
 * @precisions normal z -> c d s
 *
 */
#include "chameleon_starpu.h"
#include "runtime_codelet_z.h"
struct cl_zgemm_args_s {
cham_trans_t transA;
cham_trans_t transB;
int m;
int n;
int k;

Mathieu Faverge
committed
CHAM_tile_t *tileA;
CHAM_tile_t *tileB;

Mathieu Faverge
committed
CHAM_tile_t *tileC;
};
#if !defined(CHAMELEON_SIMULATION)
/**
 * CPU implementation of the zgemm codelet.
 *
 * @param[in] descr   Data interfaces of the task; entries 0, 1 and 2 are
 *                    converted to the A, B and C tiles with cti_interface_get().
 * @param[in] cl_arg  Pointer to the struct cl_zgemm_args_s attached to the task.
 */
static void
cl_zgemm_cpu_func( void *descr[], void *cl_arg )
{
    struct cl_zgemm_args_s *clargs = (struct cl_zgemm_args_s *)cl_arg;
    CHAM_tile_t *tileA;
    CHAM_tile_t *tileB;
    CHAM_tile_t *tileC;

    /* The tiles are taken from the task handles, not from clargs */
    tileA = cti_interface_get(descr[0]);
    tileB = cti_interface_get(descr[1]);
    tileC = cti_interface_get(descr[2]);

    TCORE_zgemm( clargs->transA, clargs->transB,
                 clargs->m, clargs->n, clargs->k,
                 clargs->alpha, tileA, tileB,
                 clargs->beta,  tileC );
}
#ifdef CHAMELEON_USE_CUDA
cl_zgemm_cuda_func( void *descr[], void *cl_arg )
struct cl_zgemm_args_s *clargs = (struct cl_zgemm_args_s *)cl_arg;
cublasHandle_t handle = starpu_cublas_get_local_handle();

Mathieu Faverge
committed
CHAM_tile_t *tileA;
CHAM_tile_t *tileB;
CHAM_tile_t *tileC;

Mathieu Faverge
committed
tileA = cti_interface_get(descr[0]);
tileB = cti_interface_get(descr[1]);
tileC = cti_interface_get(descr[2]);
assert( tileA->format & CHAMELEON_TILE_FULLRANK );
assert( tileB->format & CHAMELEON_TILE_FULLRANK );
assert( tileC->format & CHAMELEON_TILE_FULLRANK );

PRUVOST Florent
committed
CUDA_zgemm(
clargs->transA, clargs->transB,
clargs->m, clargs->n, clargs->k,
(cuDoubleComplex*)&(clargs->alpha),
tileA->mat, tileA->ld,
tileB->mat, tileB->ld,
(cuDoubleComplex*)&(clargs->beta),
tileC->mat, tileC->ld,
#endif /* defined(CHAMELEON_USE_CUDA) */
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
 * Codelet definition
 *
 * NOTE(review): CODELETS is defined outside this file. Presumably it is what
 * provides the cl_zgemm codelet and the cl_zgemm_callback symbol used by the
 * INSERT_TASK_zgemm* functions below - confirm against the runtime headers.
 */
CODELETS( zgemm, cl_zgemm_cpu_func, cl_zgemm_cuda_func, STARPU_CUDA_ASYNC )
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
/**
 * Submit a zgemm task whose execution node is the one owning the A tile.
 *
 * When alpha is zero no gemm task is submitted: the call is delegated to
 * INSERT_TASK_zlascal() with beta applied to the C tile.
 *
 * @param[in] options         Runtime options; the profiling and priority fields are read.
 * @param[in] transA, transB  Transposition flags stored in the codelet arguments.
 * @param[in] m, n, k         Dimensions stored in the codelet arguments.
 * @param[in] nb              Only forwarded to INSERT_TASK_zlascal().
 * @param[in] alpha           Scalar stored in the codelet arguments.
 * @param[in] A, Am, An       Descriptor and tile coordinates of A.
 * @param[in] B, Bm, Bn       Descriptor and tile coordinates of B.
 * @param[in] beta            Scalar stored in the codelet arguments; also selects
 *                            the access mode requested on C.
 * @param[in] C, Cm, Cn       Descriptor and tile coordinates of C.
 */
void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
                              cham_trans_t transA, cham_trans_t transB,
                              int m, int n, int k, int nb,
                              CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
                                                           const CHAM_desc_t *B, int Bm, int Bn,
                              CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
{
    if ( alpha == 0. ) {
        /* A void function must not return an expression in C: call, then return */
        INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
                             beta, C, Cm, Cn );
        return;
    }

    struct cl_zgemm_args_s *clargs = NULL;
    void (*callback)(void*);
    int   accessC;
    int   exec = 0;
    char *cl_name = "zgemm_Astat";

    /* Handle cache */
    CHAMELEON_BEGIN_ACCESS_DECLARATION;
    /* Check A as write, since it will be the owner of the computation */
    CHAMELEON_ACCESS_W(A, Am, An);
    CHAMELEON_ACCESS_R(B, Bm, Bn);
    /* Check C as read, since it will be used in a reduction */
    CHAMELEON_ACCESS_R(C, Cm, Cn);
    exec = __chameleon_need_exec;
    CHAMELEON_END_ACCESS_DECLARATION;

    /* The argument structure is only built when exec is set */
    if ( exec ) {
        /* NOTE(review): the malloc result is not checked before use */
        clargs = malloc( sizeof( struct cl_zgemm_args_s ) );
        clargs->transA = transA;
        clargs->transB = transB;
        clargs->m      = m;
        clargs->n      = n;
        clargs->k      = k;
        clargs->alpha  = alpha;
        clargs->tileA  = A->get_blktile( A, Am, An );
        clargs->tileB  = B->get_blktile( B, Bm, Bn );
        clargs->beta   = beta;
        clargs->tileC  = C->get_blktile( C, Cm, Cn );
    }

    /* Callback for profiling information */
    callback = options->profiling ? cl_zgemm_callback : NULL;

    /* Reduce the C access if needed */
    if ( beta == 0. ) {
        accessC = STARPU_W;
    }
#if defined(HAVE_STARPU_MPI_REDUX)
    else if ( beta == 1. ) {
        accessC = STARPU_MPI_REDUX;
    }
#endif
    else {
        accessC = STARPU_RW;
    }

#if defined(CHAMELEON_KERNELS_TRACE)
    /* clargs stays NULL when exec is 0; keep the plain codelet name in that case */
    if ( clargs != NULL ) {
        char *cl_fullname;
        chameleon_asprintf( &cl_fullname, "%s( %s, %s, %s )", cl_name, clargs->tileA->name, clargs->tileB->name, clargs->tileC->name );
        cl_name = cl_fullname;
    }
#endif

    /* Insert the task */
    rt_starpu_insert_task(
        &cl_zgemm,
        /* Task codelet arguments */
        STARPU_CL_ARGS, clargs, sizeof(struct cl_zgemm_args_s),

        /* Task handles */
        STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
        STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
        accessC,  RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),

        /* Common task arguments */
        STARPU_PRIORITY,        options->priority,
        STARPU_CALLBACK,        callback,
        /* Run on the node returned for the A tile */
        STARPU_EXECUTE_ON_NODE, A->get_rankof(A, Am, An),
#if defined(CHAMELEON_CODELETS_HAVE_NAME)
        STARPU_NAME,            cl_name,
#endif
        0 );
}
void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
cham_trans_t transA, cham_trans_t transB,
int m, int n, int k, int nb,
CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
const CHAM_desc_t *B, int Bm, int Bn,
CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn )

Mathieu Faverge
committed
if ( alpha == 0. ) {
return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
beta, C, Cm, Cn );
}
struct cl_zgemm_args_s *clargs = NULL;
void (*callback)(void*);
int accessC;
char *cl_name = "zgemm";
/* Handle cache */
CHAMELEON_BEGIN_ACCESS_DECLARATION;
CHAMELEON_ACCESS_R(A, Am, An);
CHAMELEON_ACCESS_R(B, Bm, Bn);
CHAMELEON_ACCESS_RW(C, Cm, Cn);
exec = __chameleon_need_exec;
if ( exec ) {
clargs = malloc( sizeof( struct cl_zgemm_args_s ) );
clargs->transA = transA;
clargs->transB = transB;
clargs->m = m;
clargs->n = n;
clargs->k = k;
clargs->alpha = alpha;
clargs->tileA = A->get_blktile( A, Am, An );
clargs->tileB = B->get_blktile( B, Bm, Bn );
clargs->beta = beta;
clargs->tileC = C->get_blktile( C, Cm, Cn );
}

Philippe SWARTVAGHER
committed
/* Callback for profiling information */
callback = options->profiling ? cl_zgemm_callback : NULL;
/* Reduce the C access if needed */
accessC = ( beta == 0. ) ? STARPU_W : (STARPU_RW | ((beta == 1.) ? STARPU_COMMUTE : 0));
#if defined(CHAMELEON_KERNELS_TRACE)
{
char *cl_fullname;
chameleon_asprintf( &cl_fullname, "%s( %s, %s, %s )", cl_name, clargs->tileA->name, clargs->tileB->name, clargs->tileC->name );
cl_name = cl_fullname;
}
#endif
/* Insert the task */

Mathieu Faverge
committed
rt_starpu_insert_task(
&cl_zgemm,
/* Task codelet arguments */
STARPU_CL_ARGS, clargs, sizeof(struct cl_zgemm_args_s),
/* Task handles */
STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),
/* Common task arguments */
STARPU_PRIORITY, options->priority,
STARPU_CALLBACK, callback,
STARPU_EXECUTE_ON_WORKER, options->workerid,
#if defined(CHAMELEON_CODELETS_HAVE_NAME)
STARPU_NAME, cl_name,