/**
 *
 * @copyright 2009-2014 The University of Tennessee and The University of
 *                      Tennessee Research Foundation. All rights reserved.
 * @copyright 2012-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
 *                      Univ. Bordeaux. All rights reserved.
 *
 * @version 1.3.0
 * @author Hatem Ltaief
 * @author Jakub Kurzak
 * @author Mathieu Faverge
 * @author Emmanuel Agullo
 * @author Cedric Castagnede
 * @date 2024-10-18
 * @precisions normal z -> c d s
 *
 */
#include "chameleon_starpu_internal.h"
/**
 * Scalar arguments of one zsyr2k task.
 *
 * The structure is filled by INSERT_TASK_zsyr2k() and read back by the
 * CPU/CUDA/HIP implementations of the codelet through their cl_arg pointer.
 */
struct cl_zsyr2k_args_s {
    cham_uplo_t           uplo;   /* uplo argument forwarded to the kernel  */
    cham_trans_t          trans;  /* trans argument forwarded to the kernel */
    int                   n;      /* n argument forwarded to the kernel     */
    int                   k;      /* k argument forwarded to the kernel     */
    CHAMELEON_Complex64_t alpha;  /* scalar passed ahead of tiles A and B   */
    CHAMELEON_Complex64_t beta;   /* scalar passed ahead of tile C          */
};
#if !defined(CHAMELEON_SIMULATION)
/**
 * CPU implementation of the zsyr2k codelet.
 *
 * @param[in,out] descr
 *          Data interfaces of the task. Entries 0, 1 and 2 are resolved with
 *          cti_interface_get() into the A, B and C tiles respectively.
 *
 * @param[in] cl_arg
 *          Pointer to the struct cl_zsyr2k_args_s attached to the task.
 */
static void cl_zsyr2k_cpu_func(void *descr[], void *cl_arg)
{
    struct cl_zsyr2k_args_s *clargs = (struct cl_zsyr2k_args_s *)cl_arg;
    CHAM_tile_t *tileA;
    CHAM_tile_t *tileB;
    CHAM_tile_t *tileC;

    tileA = cti_interface_get(descr[0]);
    tileB = cti_interface_get(descr[1]);
    tileC = cti_interface_get(descr[2]);

    TCORE_zsyr2k( clargs->uplo, clargs->trans,
                  clargs->n, clargs->k, clargs->alpha,
                  tileA, tileB, clargs->beta, tileC );
}
#if defined(CHAMELEON_USE_CUDA)
/**
 * CUDA implementation of the zsyr2k codelet.
 *
 * @param[in,out] descr
 *          Data interfaces of the task. Entries 0, 1 and 2 are resolved with
 *          cti_interface_get() into the A, B and C tiles respectively.
 *
 * @param[in] cl_arg
 *          Pointer to the struct cl_zsyr2k_args_s attached to the task.
 */
static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg)
{
    cublasHandle_t handle = starpu_cublas_get_local_handle();
    struct cl_zsyr2k_args_s *clargs = (struct cl_zsyr2k_args_s *)cl_arg;
    CHAM_tile_t *tileA;
    CHAM_tile_t *tileB;
    CHAM_tile_t *tileC;

    tileA = cti_interface_get(descr[0]);
    tileB = cti_interface_get(descr[1]);
    tileC = cti_interface_get(descr[2]);

    /* The scalars are handed over by address; the cuBLAS handle of the
     * local worker closes the argument list, as in the HIP variant. */
    CUDA_zsyr2k( clargs->uplo, clargs->trans,
                 clargs->n, clargs->k,
                 (cuDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld,
                                                     tileB->mat, tileB->ld,
                 (cuDoubleComplex*)&(clargs->beta),  tileC->mat, tileC->ld,
                 handle );
}
#endif /* defined(CHAMELEON_USE_CUDA) */
#if defined(CHAMELEON_USE_HIP)
static void cl_zsyr2k_hip_func(void *descr[], void *cl_arg)
{
hipblasHandle_t handle = starpu_hipblas_get_local_handle();
struct cl_zsyr2k_args_s *clargs = (struct cl_zsyr2k_args_s *)cl_arg;
CHAM_tile_t *tileA;
CHAM_tile_t *tileB;
CHAM_tile_t *tileC;
tileA = cti_interface_get(descr[0]);
tileB = cti_interface_get(descr[1]);
tileC = cti_interface_get(descr[2]);
HIP_zsyr2k( clargs->uplo, clargs->trans,
clargs->n, clargs->k,
(hipblasDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld,
tileB->mat, tileB->ld,
(hipblasDoubleComplex*)&(clargs->beta), tileC->mat, tileC->ld,
handle );
}
#endif /* defined(CHAMELEON_USE_HIP) */
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
 * Codelet definition
 *
 * When CHAMELEON_USE_HIP is defined, the codelet is declared with the HIP
 * kernel and the STARPU_HIP_ASYNC flag; in every other configuration it is
 * declared with the CUDA kernel name and the STARPU_CUDA_ASYNC flag.
 */
#if defined(CHAMELEON_USE_HIP)
CODELETS_GPU( zsyr2k, cl_zsyr2k_cpu_func, cl_zsyr2k_hip_func, STARPU_HIP_ASYNC )
#else
CODELETS( zsyr2k, cl_zsyr2k_cpu_func, cl_zsyr2k_cuda_func, STARPU_CUDA_ASYNC )
#endif
#if defined(CHAMELEON_STARPU_USE_INSERT)
/**
 * Submit a zsyr2k task on the tiles A(Am,An), B(Bm,Bn) and C(Cm,Cn) through
 * rt_starpu_insert_task().
 *
 * When alpha is zero no zsyr2k task is created: the call is replaced by
 * INSERT_TASK_zlascal() on C(Cm,Cn) with beta.
 *
 * @param[in] options  Runtime options; priority, profiling, workerid and
 *                     parallel are read here.
 * @param[in] uplo     Stored in the codelet arguments.
 * @param[in] trans    Stored in the codelet arguments.
 * @param[in] n        Stored in the codelet arguments.
 * @param[in] k        Stored in the codelet arguments.
 * @param[in] nb       Only forwarded to INSERT_TASK_zlascal().
 * @param[in] alpha    Scalar stored in the codelet arguments.
 * @param[in] A, Am, An  Descriptor and tile indices of the first read-only tile.
 * @param[in] B, Bm, Bn  Descriptor and tile indices of the second read-only tile.
 * @param[in] beta     Scalar stored in the codelet arguments; when it is zero
 *                     C is accessed in write-only mode.
 * @param[in,out] C, Cm, Cn  Descriptor and tile indices of the updated tile.
 */
void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options,
                         cham_uplo_t uplo, cham_trans_t trans,
                         int n, int k, int nb,
                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
                                                      const CHAM_desc_t *B, int Bm, int Bn,
                         CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
{
    if ( alpha == 0. ) {
        INSERT_TASK_zlascal( options, uplo, n, n, nb,
                             beta, C, Cm, Cn );
        return;
    }

    void (*callback)(void*);
    struct cl_zsyr2k_args_s *clargs  = NULL;
    int                      exec    = 0;
    const char              *cl_name = "zsyr2k";
    int                      accessC;

    /* Declare the accessed tiles and find out if the kernel runs locally */
    CHAMELEON_BEGIN_ACCESS_DECLARATION;
    CHAMELEON_ACCESS_R(A, Am, An);
    CHAMELEON_ACCESS_R(B, Bm, Bn);
    CHAMELEON_ACCESS_RW(C, Cm, Cn);
    exec = __chameleon_need_exec;
    /* NOTE(review): END macro restored as the counterpart of the BEGIN macro
     * above; confirm against the macro definitions. */
    CHAMELEON_END_ACCESS_DECLARATION;

    /* The arguments are only allocated when the task is executed here */
    if ( exec ) {
        clargs = malloc( sizeof( struct cl_zsyr2k_args_s ) );
        clargs->uplo  = uplo;
        clargs->trans = trans;
        clargs->n     = n;
        clargs->k     = k;
        clargs->alpha = alpha;
        clargs->beta  = beta;
    }

    /* Callback for profiling information */
    callback = options->profiling ? cl_zsyr2k_callback : NULL;

    /* Reduce the C access if needed */
    accessC = ( beta == 0. ) ? STARPU_W : STARPU_RW;

    /* Refine name */
    cl_name = chameleon_codelet_name( cl_name, 3,
                                      A->get_blktile( A, Am, An ),
                                      B->get_blktile( B, Bm, Bn ),
                                      C->get_blktile( C, Cm, Cn ) );

    rt_starpu_insert_task(
        &cl_zsyr2k,
        /* Task codelet arguments */
        STARPU_CL_ARGS, clargs, sizeof(struct cl_zsyr2k_args_s),
        STARPU_R,      RTBLKADDR(A, ChamComplexDouble, Am, An),
        STARPU_R,      RTBLKADDR(B, ChamComplexDouble, Bm, Bn),
        accessC,       RTBLKADDR(C, ChamComplexDouble, Cm, Cn),

        /* Common task arguments */
        STARPU_PRIORITY,          options->priority,
        STARPU_CALLBACK,          callback,
        STARPU_EXECUTE_ON_WORKER, options->workerid,
        STARPU_POSSIBLY_PARALLEL, options->parallel,
        STARPU_NAME,              cl_name,
        /* StarPU task-insert argument lists are terminated by 0 */
        0 );
}
#else
/**
 * Submit a zsyr2k task on the tiles A(Am,An), B(Bm,Bn) and C(Cm,Cn) by
 * building the StarPU task explicitly.
 *
 * When alpha is zero no zsyr2k task is created: the call is replaced by
 * INSERT_TASK_zlascal() on C(Cm,Cn) with beta.
 *
 * @param[in] options  Runtime options forwarded to the exchange and task
 *                     helpers.
 * @param[in] uplo     Stored in the codelet arguments.
 * @param[in] trans    Stored in the codelet arguments.
 * @param[in] n        Stored in the codelet arguments and used for the flops.
 * @param[in] k        Stored in the codelet arguments and used for the flops.
 * @param[in] nb       Only forwarded to INSERT_TASK_zlascal().
 * @param[in] alpha    Scalar stored in the codelet arguments.
 * @param[in] A, Am, An  Descriptor and tile indices of the first read-only tile.
 * @param[in] B, Bm, Bn  Descriptor and tile indices of the second read-only tile.
 * @param[in] beta     Scalar stored in the codelet arguments; when it is zero
 *                     C is accessed in write-only mode.
 * @param[in,out] C, Cm, Cn  Descriptor and tile indices of the updated tile.
 */
void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options,
                         cham_uplo_t uplo, cham_trans_t trans,
                         int n, int k, int nb,
                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
                                                      const CHAM_desc_t *B, int Bm, int Bn,
                         CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
{
    if ( alpha == 0. ) {
        INSERT_TASK_zlascal( options, uplo, n, n, nb,
                             beta, C, Cm, Cn );
        return;
    }

    /* NOTE(review): cl, cl_name, clargs, params, nbdata and descrs are not
     * declared in this function; presumably this macro provides them - confirm. */
    INSERT_TASK_COMMON_PARAMETERS( zsyr2k, 3 );
    int accessC;

    /* Reduce the C access if needed */
    accessC = ( beta == (CHAMELEON_Complex64_t)0. ) ? STARPU_W : STARPU_RW;

    /*
     * Set the data handles and initialize exchanges if needed
     */
    starpu_cham_exchange_init_params( options, &params, C->get_rankof( C, Cm, Cn ) );
    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R );
    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, STARPU_R );
    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC );

    /*
     * Not involved, let's return
     */
    if ( nbdata == 0 ) {
        return;
    }

    if ( params.do_execute )
    {
        int ret;
        struct starpu_task *task = starpu_task_create();
        task->cl = cl;

        /* Set codelet parameters */
        clargs = malloc( sizeof( struct cl_zsyr2k_args_s ) );
        clargs->uplo  = uplo;
        clargs->trans = trans;
        clargs->n     = n;
        clargs->k     = k;
        clargs->alpha = alpha;
        clargs->beta  = beta;

        task->cl_arg      = clargs;
        task->cl_arg_size = sizeof( struct cl_zsyr2k_args_s );
        /* Ownership of clargs goes to the task (see the StarPU documentation
         * of starpu_task::cl_arg_free) */
        task->cl_arg_free = 1;

        /* Set common parameters */
        starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zsyr2k_callback );

        /* Flops */
        task->flops = flops_zsyr2k( k, n );

        /* Refine name */
        task->name = chameleon_codelet_name( cl_name, 3,
                                             A->get_blktile( A, Am, An ),
                                             B->get_blktile( B, Bm, Bn ),
                                             C->get_blktile( C, Cm, Cn ) );

        ret = starpu_task_submit( task );
        if ( ret == -ENODEV ) {
            /* The task was not accepted: release it explicitly and report */
            task->destroy = 0;
            starpu_task_destroy( task );
            chameleon_error( "INSERT_TASK_zsyr2k", "Failed to submit the task to StarPU" );
            return;
        }
    }

    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );

    (void)nb;
}
#endif