Mentions légales du service

Skip to content
Snippets Groups Projects
Commit e83d2772 authored by PERESSONI Romain's avatar PERESSONI Romain Committed by Mathieu Faverge
Browse files

codelet_zsymm: Add stationnary A variant of INSERT_TASK_zsymm

parent 7cd326d1
No related branches found
No related tags found
1 merge request!336Add the A-stationnary symm and hemm (Works only for side == ChamLeft for now)
...@@ -210,6 +210,12 @@ void INSERT_TASK_zsymm( const RUNTIME_option_t *options, ...@@ -210,6 +210,12 @@ void INSERT_TASK_zsymm( const RUNTIME_option_t *options,
CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
const CHAM_desc_t *B, int Bm, int Bn, const CHAM_desc_t *B, int Bm, int Bn,
CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ); CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn );
void INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options,
cham_side_t side, cham_uplo_t uplo,
int m, int n, int nb,
CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
const CHAM_desc_t *B, int Bm, int Bn,
CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn );
void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options, void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options,
cham_uplo_t uplo, cham_trans_t trans, cham_uplo_t uplo, cham_trans_t trans,
int n, int k, int nb, int n, int k, int nb,
......
...@@ -20,23 +20,37 @@ ...@@ -20,23 +20,37 @@
#include "chameleon/tasks_z.h" #include "chameleon/tasks_z.h"
#include "coreblas/coreblas_ztile.h" #include "coreblas/coreblas_ztile.h"
void INSERT_TASK_zsymm( const RUNTIME_option_t *options, void
cham_side_t side, cham_uplo_t uplo, INSERT_TASK_zsymm( const RUNTIME_option_t *options,
int m, int n, int nb, cham_side_t side, cham_uplo_t uplo,
CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int m, int n, int nb,
const CHAM_desc_t *B, int Bm, int Bn, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) const CHAM_desc_t *B, int Bm, int Bn,
CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn )
{ {
CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn );
CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn );
#pragma omp task firstprivate( side, uplo, m, n, alpha, tileA, tileB, beta, tileC ) depend( in:tileA[0], tileB[0] ) depend( inout:tileC[0] ) #pragma omp task firstprivate( side, uplo, m, n, alpha, tileA, tileB, beta, tileC ) depend( in:tileA[0], tileB[0] ) depend( inout:tileC[0] )
TCORE_zsymm( side, uplo, TCORE_zsymm( side, uplo,
m, n, m, n,
alpha, tileA, alpha, tileA,
tileB, tileB,
beta, tileC ); beta, tileC );
(void)options; (void)options;
(void)nb; (void)nb;
} }
void
INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options,
cham_side_t side, cham_uplo_t uplo,
int m, int n, int nb,
CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
const CHAM_desc_t *B, int Bm, int Bn,
CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn )
{
INSERT_TASK_zsymm( options, side, uplo, m, n, nb,
alpha, A, Am, An, B, Bm, Bn,
beta, C, Cm, Cn );
}
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
static inline int static inline int
CORE_zsymm_parsec( parsec_execution_stream_t *context, CORE_zsymm_parsec( parsec_execution_stream_t *context,
parsec_task_t *this_task ) parsec_task_t *this_task )
{ {
cham_side_t side; cham_side_t side;
cham_uplo_t uplo; cham_uplo_t uplo;
...@@ -45,18 +45,19 @@ CORE_zsymm_parsec( parsec_execution_stream_t *context, ...@@ -45,18 +45,19 @@ CORE_zsymm_parsec( parsec_execution_stream_t *context,
CORE_zsymm( side, uplo, M, N, CORE_zsymm( side, uplo, M, N,
alpha, A, LDA, alpha, A, LDA,
B, LDB, B, LDB,
beta, C, LDC); beta, C, LDC );
(void)context; (void)context;
return PARSEC_HOOK_RETURN_DONE; return PARSEC_HOOK_RETURN_DONE;
} }
void INSERT_TASK_zsymm(const RUNTIME_option_t *options, void
cham_side_t side, cham_uplo_t uplo, INSERT_TASK_zsymm( const RUNTIME_option_t *options,
int m, int n, int nb, cham_side_t side, cham_uplo_t uplo,
CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int m, int n, int nb,
const CHAM_desc_t *B, int Bm, int Bn, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) const CHAM_desc_t *B, int Bm, int Bn,
CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn )
{ {
parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
...@@ -81,3 +82,16 @@ void INSERT_TASK_zsymm(const RUNTIME_option_t *options, ...@@ -81,3 +82,16 @@ void INSERT_TASK_zsymm(const RUNTIME_option_t *options,
(void)nb; (void)nb;
} }
void
INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options,
cham_side_t side, cham_uplo_t uplo,
int m, int n, int nb,
CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
const CHAM_desc_t *B, int Bm, int Bn,
CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn )
{
INSERT_TASK_zsymm( options, side, uplo, m, n, nb,
alpha, A, Am, An, B, Bm, Bn,
beta, C, Cm, Cn );
}
...@@ -30,28 +30,28 @@ void CORE_zsymm_quark(Quark *quark) ...@@ -30,28 +30,28 @@ void CORE_zsymm_quark(Quark *quark)
{ {
cham_side_t side; cham_side_t side;
cham_uplo_t uplo; cham_uplo_t uplo;
int M; int m;
int N; int n;
CHAMELEON_Complex64_t alpha; CHAMELEON_Complex64_t alpha;
CHAM_tile_t *tileA; CHAM_tile_t *tileA;
CHAM_tile_t *tileB; CHAM_tile_t *tileB;
CHAMELEON_Complex64_t beta; CHAMELEON_Complex64_t beta;
CHAM_tile_t *tileC; CHAM_tile_t *tileC;
quark_unpack_args_9(quark, side, uplo, M, N, alpha, tileA, tileB, beta, tileC); quark_unpack_args_9(quark, side, uplo, m, n, alpha, tileA, tileB, beta, tileC);
TCORE_zsymm(side, uplo, TCORE_zsymm( side, uplo,
M, N, m, n,
alpha, tileA, alpha, tileA, tileB,
tileB, beta, tileC );
beta, tileC);
} }
void INSERT_TASK_zsymm(const RUNTIME_option_t *options, void
cham_side_t side, cham_uplo_t uplo, INSERT_TASK_zsymm( const RUNTIME_option_t *options,
int m, int n, int nb, cham_side_t side, cham_uplo_t uplo,
CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int m, int n, int nb,
const CHAM_desc_t *B, int Bm, int Bn, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) const CHAM_desc_t *B, int Bm, int Bn,
CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn )
{ {
if ( alpha == 0. ) { if ( alpha == 0. ) {
return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
...@@ -74,3 +74,16 @@ void INSERT_TASK_zsymm(const RUNTIME_option_t *options, ...@@ -74,3 +74,16 @@ void INSERT_TASK_zsymm(const RUNTIME_option_t *options,
sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), accessC, sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), accessC,
0); 0);
} }
void
INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options,
cham_side_t side, cham_uplo_t uplo,
int m, int n, int nb,
CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
const CHAM_desc_t *B, int Bm, int Bn,
CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn )
{
INSERT_TASK_zsymm( options, side, uplo, m, n, nb,
alpha, A, Am, An, B, Bm, Bn,
beta, C, Cm, Cn );
}
...@@ -26,111 +26,229 @@ ...@@ -26,111 +26,229 @@
#include "chameleon_starpu.h" #include "chameleon_starpu.h"
#include "runtime_codelet_z.h" #include "runtime_codelet_z.h"
#if !defined(CHAMELEON_SIMULATION) struct cl_zsymm_args_s {
static void cl_zsymm_cpu_func(void *descr[], void *cl_arg)
{
cham_side_t side; cham_side_t side;
cham_uplo_t uplo; cham_uplo_t uplo;
int M; int m;
int N; int n;
CHAMELEON_Complex64_t alpha; CHAMELEON_Complex64_t alpha;
CHAM_tile_t *tileA; CHAM_tile_t *tileA;
CHAM_tile_t *tileB; CHAM_tile_t *tileB;
CHAMELEON_Complex64_t beta; CHAMELEON_Complex64_t beta;
CHAM_tile_t *tileC; CHAM_tile_t *tileC;
};
#if !defined(CHAMELEON_SIMULATION)
static void
cl_zsymm_cpu_func( void *descr[], void *cl_arg )
{
struct cl_zsymm_args_s *clargs = (struct cl_zsymm_args_s *)cl_arg;
CHAM_tile_t *tileA;
CHAM_tile_t *tileB;
CHAM_tile_t *tileC;
tileA = cti_interface_get(descr[0]); tileA = cti_interface_get(descr[0]);
tileB = cti_interface_get(descr[1]); tileB = cti_interface_get(descr[1]);
tileC = cti_interface_get(descr[2]); tileC = cti_interface_get(descr[2]);
starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta); TCORE_zsymm( clargs->side, clargs->uplo,
TCORE_zsymm(side, uplo, clargs->m, clargs->n,
M, N, clargs->alpha, tileA, tileB,
alpha, tileA, clargs->beta, tileC );
tileB,
beta, tileC);
} }
#ifdef CHAMELEON_USE_CUDA #ifdef CHAMELEON_USE_CUDA
static void cl_zsymm_cuda_func(void *descr[], void *cl_arg) static void
cl_zsymm_cuda_func( void *descr[], void *cl_arg )
{ {
cublasHandle_t handle = starpu_cublas_get_local_handle(); struct cl_zsymm_args_s *clargs = (struct cl_zsymm_args_s *)cl_arg;
cham_side_t side; cublasHandle_t handle = starpu_cublas_get_local_handle();
cham_uplo_t uplo;
int M;
int N;
cuDoubleComplex alpha;
CHAM_tile_t *tileA; CHAM_tile_t *tileA;
CHAM_tile_t *tileB; CHAM_tile_t *tileB;
cuDoubleComplex beta;
CHAM_tile_t *tileC; CHAM_tile_t *tileC;
tileA = cti_interface_get(descr[0]); tileA = cti_interface_get(descr[0]);
tileB = cti_interface_get(descr[1]); tileB = cti_interface_get(descr[1]);
tileC = cti_interface_get(descr[2]); tileC = cti_interface_get(descr[2]);
starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta); assert( tileA->format & CHAMELEON_TILE_FULLRANK );
assert( tileB->format & CHAMELEON_TILE_FULLRANK );
assert( tileC->format & CHAMELEON_TILE_FULLRANK );
CUDA_zsymm( CUDA_zsymm(
side, uplo, clargs->side, clargs->uplo,
M, N, clargs->m, clargs->n,
&alpha, tileA->mat, tileA->ld, (cuDoubleComplex*)&(clargs->alpha),
tileB->mat, tileB->ld, tileA->mat, tileA->ld,
&beta, tileC->mat, tileC->ld, tileB->mat, tileB->ld,
(cuDoubleComplex*)&(clargs->beta),
tileC->mat, tileC->ld,
handle ); handle );
} }
#endif /* CHAMELEON_USE_CUDA */ #endif /* defined(CHAMELEON_USE_CUDA) */
#endif /* !defined(CHAMELEON_SIMULATION) */ #endif /* !defined(CHAMELEON_SIMULATION) */
/* /*
* Codelet definition * Codelet definition
*/ */
CODELETS(zsymm, cl_zsymm_cpu_func, cl_zsymm_cuda_func, STARPU_CUDA_ASYNC) CODELETS( zsymm, cl_zsymm_cpu_func, cl_zsymm_cuda_func, STARPU_CUDA_ASYNC )
/** void INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options,
* cham_side_t side, cham_uplo_t uplo,
* @ingroup INSERT_TASK_Complex64_t int m, int n, int nb,
* CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
*/ const CHAM_desc_t *B, int Bm, int Bn,
void INSERT_TASK_zsymm(const RUNTIME_option_t *options, CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn )
cham_side_t side, cham_uplo_t uplo, {
int m, int n, int nb, if ( alpha == 0. ) {
CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
const CHAM_desc_t *B, int Bm, int Bn, beta, C, Cm, Cn );
CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) }
struct cl_zsymm_args_s *clargs = NULL;
void (*callback)(void*);
int accessC;
int exec = 0;
char *cl_name = "zsymm_Astat";
/* Handle cache */
CHAMELEON_BEGIN_ACCESS_DECLARATION;
/* Check A as write, since it will be the owner of the computation */
CHAMELEON_ACCESS_W(A, Am, An);
CHAMELEON_ACCESS_R(B, Bm, Bn);
/* Check C as read, since it will be used in a reduction */
CHAMELEON_ACCESS_R(C, Cm, Cn);
exec = __chameleon_need_exec;
CHAMELEON_END_ACCESS_DECLARATION;
if ( exec ) {
clargs = malloc( sizeof( struct cl_zsymm_args_s ) );
clargs->side = side;
clargs->uplo = uplo;
clargs->m = m;
clargs->n = n;
clargs->alpha = alpha;
clargs->tileA = A->get_blktile( A, Am, An );
clargs->tileB = B->get_blktile( B, Bm, Bn );
clargs->beta = beta;
clargs->tileC = C->get_blktile( C, Cm, Cn );
}
/* Callback for profiling information */
callback = options->profiling ? cl_zsymm_callback : NULL;
/* Reduce the C access if needed */
if ( beta == 0. ) {
accessC = STARPU_W;
}
#if defined(HAVE_STARPU_MPI_REDUX)
else if ( beta == 1. ) {
accessC = STARPU_MPI_REDUX;
}
#endif
else {
accessC = STARPU_RW;
}
#if defined(CHAMELEON_KERNELS_TRACE)
{
char *cl_fullname;
chameleon_asprintf( &cl_fullname, "%s( %s, %s, %s )", cl_name, clargs->tileA->name, clargs->tileB->name, clargs->tileC->name );
cl_name = cl_fullname;
}
#endif
/* Insert the task */
rt_starpu_insert_task(
&cl_zsymm,
/* Task codelet arguments */
STARPU_CL_ARGS, clargs, sizeof(struct cl_zsymm_args_s),
/* Task handles */
STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),
/* Common task arguments */
STARPU_PRIORITY, options->priority,
STARPU_CALLBACK, callback,
STARPU_EXECUTE_ON_NODE, A->get_rankof(A, Am, An),
#if defined(CHAMELEON_CODELETS_HAVE_NAME)
STARPU_NAME, cl_name,
#endif
0 );
}
void INSERT_TASK_zsymm( const RUNTIME_option_t *options,
cham_side_t side, cham_uplo_t uplo,
int m, int n, int nb,
CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
const CHAM_desc_t *B, int Bm, int Bn,
CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn )
{ {
if ( alpha == 0. ) { if ( alpha == 0. ) {
return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
beta, C, Cm, Cn ); beta, C, Cm, Cn );
} }
(void)nb; struct cl_zsymm_args_s *clargs = NULL;
struct starpu_codelet *codelet = &cl_zsymm; void (*callback)(void*);
void (*callback)(void*) = options->profiling ? cl_zsymm_callback : NULL; int accessC;
int accessC = ( beta == 0. ) ? STARPU_W : STARPU_RW; int exec = 0;
char *cl_name = "zsymm";
/* Handle cache */
CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_BEGIN_ACCESS_DECLARATION;
CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_R(A, Am, An);
CHAMELEON_ACCESS_R(B, Bm, Bn); CHAMELEON_ACCESS_R(B, Bm, Bn);
CHAMELEON_ACCESS_RW(C, Cm, Cn); CHAMELEON_ACCESS_RW(C, Cm, Cn);
exec = __chameleon_need_exec;
CHAMELEON_END_ACCESS_DECLARATION; CHAMELEON_END_ACCESS_DECLARATION;
if ( exec ) {
clargs = malloc( sizeof( struct cl_zsymm_args_s ) );
clargs->side = side;
clargs->uplo = uplo;
clargs->m = m;
clargs->n = n;
clargs->alpha = alpha;
clargs->tileA = A->get_blktile( A, Am, An );
clargs->tileB = B->get_blktile( B, Bm, Bn );
clargs->beta = beta;
clargs->tileC = C->get_blktile( C, Cm, Cn );
}
/* Callback for profiling information */
callback = options->profiling ? cl_zsymm_callback : NULL;
/* Reduce the C access if needed */
accessC = ( beta == 0. ) ? STARPU_W : (STARPU_RW | ((beta == 1.) ? STARPU_COMMUTE : 0));
#if defined(CHAMELEON_KERNELS_TRACE)
{
char *cl_fullname;
chameleon_asprintf( &cl_fullname, "%s( %s, %s, %s )", cl_name, clargs->tileA->name, clargs->tileB->name, clargs->tileC->name );
cl_name = cl_fullname;
}
#endif
/* Insert the task */
rt_starpu_insert_task( rt_starpu_insert_task(
codelet, &cl_zsymm,
STARPU_VALUE, &side, sizeof(int), /* Task codelet arguments */
STARPU_VALUE, &uplo, sizeof(int), STARPU_CL_ARGS, clargs, sizeof(struct cl_zsymm_args_s),
STARPU_VALUE, &m, sizeof(int),
STARPU_VALUE, &n, sizeof(int), /* Task handles */
STARPU_VALUE, &alpha, sizeof(CHAMELEON_Complex64_t), STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),
STARPU_VALUE, &beta, sizeof(CHAMELEON_Complex64_t),
accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), /* Common task arguments */
STARPU_PRIORITY, options->priority, STARPU_PRIORITY, options->priority,
STARPU_CALLBACK, callback, STARPU_CALLBACK, callback,
STARPU_EXECUTE_ON_WORKER, options->workerid, STARPU_EXECUTE_ON_WORKER, options->workerid,
#if defined(CHAMELEON_CODELETS_HAVE_NAME) #if defined(CHAMELEON_CODELETS_HAVE_NAME)
STARPU_NAME, "zsymm", STARPU_NAME, cl_name,
#endif #endif
0); 0 );
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment