Commit 5715a0a8 authored by Mathieu Faverge's avatar Mathieu Faverge

Update StarPU codelets

parent c69e5f0a
......@@ -11,7 +11,7 @@
*
* @version 1.0.0
* @author Mathieu Faverge
* @date 2016-12-15
* @date 2018-11-07
* @precisions normal z -> s d c
*
*/
......@@ -37,6 +37,7 @@ static void cl_ztpmlqt_cpu_func(void *descr[], void *cl_arg)
CHAMELEON_Complex64_t *B;
int ldb;
CHAMELEON_Complex64_t *WORK;
size_t lwork;
V = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
T = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
......@@ -45,13 +46,15 @@ static void cl_ztpmlqt_cpu_func(void *descr[], void *cl_arg)
WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */
starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib,
&ldv, &ldt, &lda, &ldb );
&ldv, &ldt, &lda, &ldb, &lwork );
CORE_ztpmlqt( side, trans, M, N, K, L, ib,
V, ldv, T, ldt, A, lda, B, ldb, WORK );
(void)lwork;
}
#if defined(CHAMELEON_USE_CUDA) && 0
#if defined(CHAMELEON_USE_CUDA)
static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg)
{
cham_side_t side;
......@@ -70,6 +73,7 @@ static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg)
cuDoubleComplex *B;
int ldb;
cuDoubleComplex *W;
size_t lwork;
V = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
T = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
......@@ -78,14 +82,14 @@ static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg)
W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */
starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib,
&ldv, &ldt, &lda, &ldb );
&ldv, &ldt, &lda, &ldb, &lwork );
RUNTIME_getStream(stream);
CUDA_ztpmlqt(
side, trans, M, N, K, L, ib,
V, ldv, T, ldt, A, lda, B, ldb,
W, stream );
W, lwork, stream );
#ifndef STARPU_CUDA_ASYNC
cudaStreamSynchronize( stream );
......@@ -97,8 +101,7 @@ static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg)
/*
* Codelet definition
*/
CODELETS_CPU(ztpmlqt, 5, cl_ztpmlqt_cpu_func)
//CODELETS(ztpmlqt, 5, cl_ztpmlqt_cpu_func, cl_ztpmlqt_cuda_func, STARPU_CUDA_ASYNC)
CODELETS(ztpmlqt, 5, cl_ztpmlqt_cpu_func, cl_ztpmlqt_cuda_func, STARPU_CUDA_ASYNC)
void
INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options,
......@@ -136,6 +139,7 @@ INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options,
STARPU_VALUE, &lda, sizeof(int),
STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
STARPU_VALUE, &ldb, sizeof(int),
STARPU_VALUE, &(options->ws_wsize), sizeof(size_t),
/* Other options */
STARPU_SCRATCH, options->ws_worker,
STARPU_PRIORITY, options->priority,
......
......@@ -11,7 +11,7 @@
*
* @version 1.0.0
* @author Mathieu Faverge
* @date 2016-12-15
* @date 2018-11-07
* @precisions normal z -> s d c
*
*/
......@@ -37,6 +37,7 @@ static void cl_ztpmqrt_cpu_func(void *descr[], void *cl_arg)
CHAMELEON_Complex64_t *B;
int ldb;
CHAMELEON_Complex64_t *WORK;
size_t lwork;
V = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
T = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
......@@ -45,10 +46,12 @@ static void cl_ztpmqrt_cpu_func(void *descr[], void *cl_arg)
WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */
starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib,
&ldv, &ldt, &lda, &ldb );
&ldv, &ldt, &lda, &ldb, &lwork );
CORE_ztpmqrt( side, trans, M, N, K, L, ib,
V, ldv, T, ldt, A, lda, B, ldb, WORK );
(void)lwork;
}
......@@ -71,22 +74,23 @@ static void cl_ztpmqrt_cuda_func(void *descr[], void *cl_arg)
cuDoubleComplex *B;
int ldb;
cuDoubleComplex *W;
size_t lwork;
V = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
T = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]);
W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */
W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 3*ib*nb */
starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib,
&ldv, &ldt, &lda, &ldb );
&ldv, &ldt, &lda, &ldb, &lwork );
RUNTIME_getStream(stream);
CUDA_ztpmqrt(
side, trans, M, N, K, L, ib,
V, ldv, T, ldt, A, lda, B, ldb,
W, stream );
W, lwork, stream );
#ifndef STARPU_CUDA_ASYNC
cudaStreamSynchronize( stream );
......@@ -102,12 +106,12 @@ CODELETS(ztpmqrt, 5, cl_ztpmqrt_cpu_func, cl_ztpmqrt_cuda_func, STARPU_CUDA_ASYN
void
INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options,
cham_side_t side, cham_trans_t trans,
int M, int N, int K, int L, int ib, int nb,
const CHAM_desc_t *V, int Vm, int Vn, int ldv,
const CHAM_desc_t *T, int Tm, int Tn, int ldt,
const CHAM_desc_t *A, int Am, int An, int lda,
const CHAM_desc_t *B, int Bm, int Bn, int ldb )
cham_side_t side, cham_trans_t trans,
int M, int N, int K, int L, int ib, int nb,
const CHAM_desc_t *V, int Vm, int Vn, int ldv,
const CHAM_desc_t *T, int Tm, int Tn, int ldt,
const CHAM_desc_t *A, int Am, int An, int lda,
const CHAM_desc_t *B, int Bm, int Bn, int ldb )
{
struct starpu_codelet *codelet = &cl_ztpmqrt;
void (*callback)(void*) = options->profiling ? cl_ztpmqrt_callback : NULL;
......@@ -136,6 +140,7 @@ INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options,
STARPU_VALUE, &lda, sizeof(int),
STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
STARPU_VALUE, &ldb, sizeof(int),
STARPU_VALUE, &(options->ws_wsize), sizeof(size_t),
/* Other options */
STARPU_SCRATCH, options->ws_worker,
STARPU_PRIORITY, options->priority,
......
......@@ -21,7 +21,7 @@
* @author Mathieu Faverge
* @author Emmanuel Agullo
* @author Cedric Castagnede
* @date 2010-11-15
* @date 2018-11-07
* @precisions normal z -> c d s
*
*/
......@@ -165,9 +165,10 @@ void INSERT_TASK_ztsmlq(const RUNTIME_option_t *options,
STARPU_VALUE, &ldv, sizeof(int),
STARPU_R, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn),
STARPU_VALUE, &ldt, sizeof(int),
/* max( ib*nb, 2*ib*nb ) */
/* max( ib*nb, 3*ib*nb ) */
STARPU_SCRATCH, options->ws_worker,
STARPU_VALUE, &ldwork, sizeof(int),
STARPU_VALUE, &ldwork, sizeof(int),
STARPU_VALUE, &(options->ws_wsize), sizeof(size_t),
STARPU_PRIORITY, options->priority,
STARPU_CALLBACK, callback,
#if defined(CHAMELEON_CODELETS_HAVE_NAME)
......@@ -198,6 +199,7 @@ static void cl_ztsmlq_cpu_func(void *descr[], void *cl_arg)
int ldt;
CHAMELEON_Complex64_t *WORK;
int ldwork;
size_t lwork;
A1 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
A2 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
......@@ -206,10 +208,12 @@ static void cl_ztsmlq_cpu_func(void *descr[], void *cl_arg)
WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */
starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib,
&lda1, &lda2, &ldv, &ldt, &ldwork);
&lda1, &lda2, &ldv, &ldt, &ldwork, &lwork );
CORE_ztsmlq(side, trans, m1, n1, m2, n2, k, ib,
A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
(void)lwork;
}
#if defined(CHAMELEON_USE_CUDA)
......@@ -231,9 +235,9 @@ static void cl_ztsmlq_cuda_func(void *descr[], void *cl_arg)
int ldv;
cuDoubleComplex *T;
int ldt;
cuDoubleComplex *W, *WC;
cuDoubleComplex *W;
int ldwork;
int ldworkc;
size_t lwork;
A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
......@@ -241,17 +245,14 @@ static void cl_ztsmlq_cuda_func(void *descr[], void *cl_arg)
T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]);
W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */
starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib,
&lda1, &lda2, &ldv, &ldt, &ldwork);
WC = W + ib * ldwork;
ldworkc = (side == ChamLeft) ? m1 : ib;
starpu_codelet_unpack_args( cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib,
&lda1, &lda2, &ldv, &ldt, &ldwork, &lwork );
RUNTIME_getStream(stream);
CUDA_ztsmlq( side, trans, m1, n1, m2, n2, k, ib,
A1, lda1, A2, lda2, V, ldv, T, ldt,
W, ldwork, WC, ldworkc, stream );
W, lwork, stream );
#ifndef STARPU_CUDA_ASYNC
cudaStreamSynchronize( stream );
......
......@@ -15,7 +15,7 @@
* @author Hatem Ltaief
* @author Mathieu Faverge
* @author Azzam Haidar
* @date 2010-11-15
* @date 2018-11-07
* @precisions normal z -> c d s
*
*/
......@@ -106,8 +106,8 @@ static void cl_ztsmlq_hetra1_cpu_func(void *descr[], void *cl_arg)
T = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]);
WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */
starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k,
&ib, &nb, &lda1, &lda2, &ldv, &ldt, &ldwork);
starpu_codelet_unpack_args( cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k,
&ib, &nb, &lda1, &lda2, &ldv, &ldt, &ldwork);
CORE_ztsmlq_hetra1(side, trans, m1, n1, m2, n2, k,
ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
}
......
......@@ -21,7 +21,7 @@
* @author Mathieu Faverge
* @author Emmanuel Agullo
* @author Cedric Castagnede
* @date 2010-11-15
* @date 2018-11-07
* @precisions normal z -> c d s
*
*/
......@@ -169,6 +169,7 @@ void INSERT_TASK_ztsmqr(const RUNTIME_option_t *options,
/* max( ib*nb, 2*ib*nb ) */
STARPU_SCRATCH, options->ws_worker,
STARPU_VALUE, &ldwork, sizeof(int),
STARPU_VALUE, &(options->ws_wsize), sizeof(size_t),
STARPU_PRIORITY, options->priority,
STARPU_CALLBACK, callback,
#if defined(CHAMELEON_USE_MPI)
......@@ -202,6 +203,7 @@ static void cl_ztsmqr_cpu_func(void *descr[], void *cl_arg)
int ldt;
CHAMELEON_Complex64_t *WORK;
int ldwork;
size_t lwork;
A1 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
A2 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
......@@ -210,10 +212,12 @@ static void cl_ztsmqr_cpu_func(void *descr[], void *cl_arg)
WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */
starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib,
&lda1, &lda2, &ldv, &ldt, &ldwork);
&lda1, &lda2, &ldv, &ldt, &ldwork, &lwork);
CORE_ztsmqr(side, trans, m1, n1, m2, n2, k, ib,
A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
(void)lwork;
}
#if defined(CHAMELEON_USE_CUDA)
......@@ -235,9 +239,9 @@ static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg)
int ldv;
cuDoubleComplex *T;
int ldt;
cuDoubleComplex *W, *WC;
cuDoubleComplex *W;
int ldwork;
int ldworkc;
size_t lwork;
A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
......@@ -245,22 +249,21 @@ static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg)
T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]);
W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */
starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib,
&lda1, &lda2, &ldv, &ldt, &ldwork);
WC = W + ib * (side == ChamLeft ? m1 : n1);
ldworkc = (side == ChamLeft) ? m2 : ib;
starpu_codelet_unpack_args( cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib,
&lda1, &lda2, &ldv, &ldt, &ldwork, &lwork );
RUNTIME_getStream(stream);
CUDA_ztsmqr(
side, trans, m1, n1, m2, n2, k, ib,
A1, lda1, A2, lda2, V, ldv, T, ldt,
W, ldwork, WC, ldworkc, stream );
side, trans, m1, n1, m2, n2, k, ib,
A1, lda1, A2, lda2, V, ldv, T, ldt,
W, lwork, stream );
#ifndef STARPU_CUDA_ASYNC
cudaStreamSynchronize( stream );
#endif
(void)ldwork;
}
#endif /* defined(CHAMELEON_USE_CUDA) */
#endif /* !defined(CHAMELEON_SIMULATION) */
......
......@@ -19,7 +19,7 @@
* @author Mathieu Faverge
* @author Emmanuel Agullo
* @author Cedric Castagnede
* @date 2010-11-15
* @date 2018-11-07
* @precisions normal z -> c d s
*
*/
......@@ -167,6 +167,7 @@ void INSERT_TASK_zttmqr(const RUNTIME_option_t *options,
/* max( ib*nb, 2*ib*nb ) */
STARPU_SCRATCH, options->ws_worker,
STARPU_VALUE, &ldwork, sizeof(int),
STARPU_VALUE, &(options->ws_wsize), sizeof(size_t),
STARPU_PRIORITY, options->priority,
STARPU_CALLBACK, callback,
#if defined(CHAMELEON_USE_MPI)
......@@ -200,6 +201,7 @@ static void cl_zttmqr_cpu_func(void *descr[], void *cl_arg)
int ldt;
CHAMELEON_Complex64_t *WORK;
int ldwork;
size_t lwork;
A1 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
A2 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
......@@ -208,10 +210,12 @@ static void cl_zttmqr_cpu_func(void *descr[], void *cl_arg)
WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */
starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib,
&lda1, &lda2, &ldv, &ldt, &ldwork);
&lda1, &lda2, &ldv, &ldt, &ldwork, &lwork );
CORE_zttmqr(side, trans, m1, n1, m2, n2, k, ib,
A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
(void)lwork;
}
#if defined(CHAMELEON_USE_CUDA)
......@@ -233,9 +237,9 @@ static void cl_zttmqr_cuda_func(void *descr[], void *cl_arg)
int ldv;
cuDoubleComplex *T;
int ldt;
cuDoubleComplex *W, *WC;
cuDoubleComplex *W;
int ldwork;
int ldworkc;
size_t lwork;
A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
......@@ -243,22 +247,21 @@ static void cl_zttmqr_cuda_func(void *descr[], void *cl_arg)
T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]);
W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */
starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib,
&lda1, &lda2, &ldv, &ldt, &ldwork);
WC = W + ib * (side == ChamLeft ? m1 : n1);
ldworkc = (side == ChamLeft) ? m2 : ib;
starpu_codelet_unpack_args( cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib,
&lda1, &lda2, &ldv, &ldt, &ldwork, &lwork );
RUNTIME_getStream(stream);
CUDA_zttmqr(
side, trans, m1, n1, m2, n2, k, ib,
A1, lda1, A2, lda2, V, ldv, T, ldt,
W, ldwork, WC, ldworkc, stream );
W, lwork, stream );
#ifndef STARPU_CUDA_ASYNC
cudaStreamSynchronize( stream );
#endif
(void)ldwork;
}
#endif /* defined(CHAMELEON_USE_CUDA) */
#endif /* !defined(CHAMELEON_SIMULATION) */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment