From 5715a0a813df0ba8b196a38bec14f488584859c1 Mon Sep 17 00:00:00 2001 From: Mathieu Faverge <mathieu.faverge@inria.fr> Date: Wed, 7 Nov 2018 19:05:50 +0100 Subject: [PATCH] Update StarPU codelets --- runtime/starpu/codelets/codelet_ztpmlqt.c | 18 ++++++++----- runtime/starpu/codelets/codelet_ztpmqrt.c | 27 +++++++++++-------- runtime/starpu/codelets/codelet_ztsmlq.c | 25 ++++++++--------- .../starpu/codelets/codelet_ztsmlq_hetra1.c | 6 ++--- runtime/starpu/codelets/codelet_ztsmqr.c | 27 ++++++++++--------- runtime/starpu/codelets/codelet_zttmqr.c | 23 +++++++++------- 6 files changed, 71 insertions(+), 55 deletions(-) diff --git a/runtime/starpu/codelets/codelet_ztpmlqt.c b/runtime/starpu/codelets/codelet_ztpmlqt.c index 714b45762..8dffa4ff2 100644 --- a/runtime/starpu/codelets/codelet_ztpmlqt.c +++ b/runtime/starpu/codelets/codelet_ztpmlqt.c @@ -11,7 +11,7 @@ * * @version 1.0.0 * @author Mathieu Faverge - * @date 2016-12-15 + * @date 2018-11-07 * @precisions normal z -> s d c * */ @@ -37,6 +37,7 @@ static void cl_ztpmlqt_cpu_func(void *descr[], void *cl_arg) CHAMELEON_Complex64_t *B; int ldb; CHAMELEON_Complex64_t *WORK; + size_t lwork; V = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); T = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); @@ -45,13 +46,15 @@ static void cl_ztpmlqt_cpu_func(void *descr[], void *cl_arg) WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */ starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib, - &ldv, &ldt, &lda, &ldb ); + &ldv, &ldt, &lda, &ldb, &lwork ); CORE_ztpmlqt( side, trans, M, N, K, L, ib, V, ldv, T, ldt, A, lda, B, ldb, WORK ); + + (void)lwork; } -#if defined(CHAMELEON_USE_CUDA) && 0 +#if defined(CHAMELEON_USE_CUDA) static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg) { cham_side_t side; @@ -70,6 +73,7 @@ static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex *B; int ldb; cuDoubleComplex *W; + size_t lwork; V = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); T = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); @@ -78,14 +82,14 @@ static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg) W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */ starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib, - &ldv, &ldt, &lda, &ldb ); + &ldv, &ldt, &lda, &ldb, &lwork ); RUNTIME_getStream(stream); CUDA_ztpmlqt( side, trans, M, N, K, L, ib, V, ldv, T, ldt, A, lda, B, ldb, - W, stream ); + W, lwork, stream ); #ifndef STARPU_CUDA_ASYNC cudaStreamSynchronize( stream ); @@ -97,8 +101,7 @@ static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg) /* * Codelet definition */ -CODELETS_CPU(ztpmlqt, 5, cl_ztpmlqt_cpu_func) -//CODELETS(ztpmlqt, 5, cl_ztpmlqt_cpu_func, cl_ztpmlqt_cuda_func, STARPU_CUDA_ASYNC) +CODELETS(ztpmlqt, 5, cl_ztpmlqt_cpu_func, cl_ztpmlqt_cuda_func, STARPU_CUDA_ASYNC) void INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options, @@ -136,6 +139,7 @@ INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options, STARPU_VALUE, &lda, sizeof(int), STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), STARPU_VALUE, &ldb, sizeof(int), + STARPU_VALUE, &(options->ws_wsize), sizeof(size_t), /* Other options */ STARPU_SCRATCH, options->ws_worker, STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_ztpmqrt.c b/runtime/starpu/codelets/codelet_ztpmqrt.c index 40f83ab2d..6684e59f8 100644 --- a/runtime/starpu/codelets/codelet_ztpmqrt.c +++ b/runtime/starpu/codelets/codelet_ztpmqrt.c @@ -11,7 +11,7 @@ * * @version 1.0.0 * @author Mathieu Faverge - * @date 2016-12-15 + * @date 2018-11-07 * @precisions normal z -> s d c * */ @@ -37,6 +37,7 @@ static void cl_ztpmqrt_cpu_func(void *descr[], void *cl_arg) CHAMELEON_Complex64_t *B; int ldb; CHAMELEON_Complex64_t *WORK; + size_t lwork; V = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); T = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); @@ -45,10 +46,12 @@ static void cl_ztpmqrt_cpu_func(void *descr[], void *cl_arg) WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */ starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib, - &ldv, &ldt, &lda, &ldb ); + &ldv, &ldt, &lda, &ldb, &lwork ); CORE_ztpmqrt( side, trans, M, N, K, L, ib, V, ldv, T, ldt, A, lda, B, ldb, WORK ); + + (void)lwork; } @@ -71,22 +74,23 @@ static void cl_ztpmqrt_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex *B; int ldb; cuDoubleComplex *W; + size_t lwork; V = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); T = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); - W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */ + W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 3*ib*nb */ starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib, - &ldv, &ldt, &lda, &ldb ); + &ldv, &ldt, &lda, &ldb, &lwork ); RUNTIME_getStream(stream); CUDA_ztpmqrt( side, trans, M, N, K, L, ib, V, ldv, T, ldt, A, lda, B, ldb, - W, stream ); + W, lwork, stream ); #ifndef STARPU_CUDA_ASYNC cudaStreamSynchronize( stream ); @@ -102,12 +106,12 @@ CODELETS(ztpmqrt, 5, cl_ztpmqrt_cpu_func, cl_ztpmqrt_cuda_func, STARPU_CUDA_ASYN void INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options, - cham_side_t side, cham_trans_t trans, - int M, int N, int K, int L, int ib, int nb, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ) + cham_side_t side, cham_trans_t trans, + int M, int N, int K, int L, int ib, int nb, + const CHAM_desc_t *V, int Vm, int Vn, int ldv, + const CHAM_desc_t *T, int Tm, int Tn, int ldt, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *B, int Bm, int Bn, int ldb ) { struct starpu_codelet *codelet = &cl_ztpmqrt; void (*callback)(void*) = options->profiling ? cl_ztpmqrt_callback : NULL; @@ -136,6 +140,7 @@ INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options, STARPU_VALUE, &lda, sizeof(int), STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), STARPU_VALUE, &ldb, sizeof(int), + STARPU_VALUE, &(options->ws_wsize), sizeof(size_t), /* Other options */ STARPU_SCRATCH, options->ws_worker, STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_ztsmlq.c b/runtime/starpu/codelets/codelet_ztsmlq.c index b0a2e38ec..ad1d0a7a0 100644 --- a/runtime/starpu/codelets/codelet_ztsmlq.c +++ b/runtime/starpu/codelets/codelet_ztsmlq.c @@ -21,7 +21,7 @@ * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede - * @date 2010-11-15 + * @date 2018-11-07 * @precisions normal z -> c d s * */ @@ -165,9 +165,10 @@ void INSERT_TASK_ztsmlq(const RUNTIME_option_t *options, STARPU_VALUE, &ldv, sizeof(int), STARPU_R, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), STARPU_VALUE, &ldt, sizeof(int), - /* max( ib*nb, 2*ib*nb ) */ + /* max( ib*nb, 3*ib*nb ) */ STARPU_SCRATCH, options->ws_worker, - STARPU_VALUE, &ldwork, sizeof(int), + STARPU_VALUE, &ldwork, sizeof(int), + STARPU_VALUE, &(options->ws_wsize), sizeof(size_t), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, #if defined(CHAMELEON_CODELETS_HAVE_NAME) @@ -198,6 +199,7 @@ static void cl_ztsmlq_cpu_func(void *descr[], void *cl_arg) int ldt; CHAMELEON_Complex64_t *WORK; int ldwork; + size_t lwork; A1 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); A2 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); @@ -206,10 +208,12 @@ static void cl_ztsmlq_cpu_func(void *descr[], void *cl_arg) WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */ starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, - &lda1, &lda2, &ldv, &ldt, &ldwork); + &lda1, &lda2, &ldv, &ldt, &ldwork, &lwork ); CORE_ztsmlq(side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); + + (void)lwork; } #if defined(CHAMELEON_USE_CUDA) @@ -231,9 +235,9 @@ static void cl_ztsmlq_cuda_func(void *descr[], void *cl_arg) int ldv; cuDoubleComplex *T; int ldt; - cuDoubleComplex *W, *WC; + cuDoubleComplex *W; int ldwork; - int ldworkc; + size_t lwork; A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); @@ -241,17 +245,14 @@ static void cl_ztsmlq_cuda_func(void *descr[], void *cl_arg) T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */ - starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, - &lda1, &lda2, &ldv, &ldt, &ldwork); - - WC = W + ib * ldwork; - ldworkc = (side == ChamLeft) ? m1 : ib; + starpu_codelet_unpack_args( cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, + &lda1, &lda2, &ldv, &ldt, &ldwork, &lwork ); RUNTIME_getStream(stream); CUDA_ztsmlq( side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, - W, ldwork, WC, ldworkc, stream ); + W, lwork, stream ); #ifndef STARPU_CUDA_ASYNC cudaStreamSynchronize( stream ); diff --git a/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c b/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c index 8996121be..d68e2bebf 100644 --- a/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c +++ b/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c @@ -15,7 +15,7 @@ * @author Hatem Ltaief * @author Mathieu Faverge * @author Azzam Haidar - * @date 2010-11-15 + * @date 2018-11-07 * @precisions normal z -> c d s * */ @@ -106,8 +106,8 @@ static void cl_ztsmlq_hetra1_cpu_func(void *descr[], void *cl_arg) T = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */ - starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, - &ib, &nb, &lda1, &lda2, &ldv, &ldt, &ldwork); + starpu_codelet_unpack_args( cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, + &ib, &nb, &lda1, &lda2, &ldv, &ldt, &ldwork); CORE_ztsmlq_hetra1(side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); } diff --git a/runtime/starpu/codelets/codelet_ztsmqr.c b/runtime/starpu/codelets/codelet_ztsmqr.c index c38a8fbd1..349aa129e 100644 --- a/runtime/starpu/codelets/codelet_ztsmqr.c +++ b/runtime/starpu/codelets/codelet_ztsmqr.c @@ -21,7 +21,7 @@ * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede - * @date 2010-11-15 + * @date 2018-11-07 * @precisions normal z -> c d s * */ @@ -169,6 +169,7 @@ void INSERT_TASK_ztsmqr(const RUNTIME_option_t *options, /* max( ib*nb, 2*ib*nb ) */ STARPU_SCRATCH, options->ws_worker, STARPU_VALUE, &ldwork, sizeof(int), + STARPU_VALUE, &(options->ws_wsize), sizeof(size_t), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, #if defined(CHAMELEON_USE_MPI) @@ -202,6 +203,7 @@ static void cl_ztsmqr_cpu_func(void *descr[], void *cl_arg) int ldt; CHAMELEON_Complex64_t *WORK; int ldwork; + size_t lwork; A1 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); A2 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); @@ -210,10 +212,12 @@ static void cl_ztsmqr_cpu_func(void *descr[], void *cl_arg) WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */ starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, - &lda1, &lda2, &ldv, &ldt, &ldwork); + &lda1, &lda2, &ldv, &ldt, &ldwork, &lwork); CORE_ztsmqr(side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); + + (void)lwork; } #if defined(CHAMELEON_USE_CUDA) @@ -235,9 +239,9 @@ static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg) int ldv; cuDoubleComplex *T; int ldt; - cuDoubleComplex *W, *WC; + cuDoubleComplex *W; int ldwork; - int ldworkc; + size_t lwork; A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); @@ -245,22 +249,21 @@ static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg) T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */ - starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, - &lda1, &lda2, &ldv, &ldt, &ldwork); - - WC = W + ib * (side == ChamLeft ? m1 : n1); - ldworkc = (side == ChamLeft) ? m2 : ib; + starpu_codelet_unpack_args( cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, + &lda1, &lda2, &ldv, &ldt, &ldwork, &lwork ); RUNTIME_getStream(stream); CUDA_ztsmqr( - side, trans, m1, n1, m2, n2, k, ib, - A1, lda1, A2, lda2, V, ldv, T, ldt, - W, ldwork, WC, ldworkc, stream ); + side, trans, m1, n1, m2, n2, k, ib, + A1, lda1, A2, lda2, V, ldv, T, ldt, + W, lwork, stream ); #ifndef STARPU_CUDA_ASYNC cudaStreamSynchronize( stream ); #endif + + (void)ldwork; } #endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_zttmqr.c b/runtime/starpu/codelets/codelet_zttmqr.c index d485d16b9..0f88020fc 100644 --- a/runtime/starpu/codelets/codelet_zttmqr.c +++ b/runtime/starpu/codelets/codelet_zttmqr.c @@ -19,7 +19,7 @@ * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede - * @date 2010-11-15 + * @date 2018-11-07 * @precisions normal z -> c d s * */ @@ -167,6 +167,7 @@ void INSERT_TASK_zttmqr(const RUNTIME_option_t *options, /* max( ib*nb, 2*ib*nb ) */ STARPU_SCRATCH, options->ws_worker, STARPU_VALUE, &ldwork, sizeof(int), + STARPU_VALUE, &(options->ws_wsize), sizeof(size_t), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, #if defined(CHAMELEON_USE_MPI) @@ -200,6 +201,7 @@ static void cl_zttmqr_cpu_func(void *descr[], void *cl_arg) int ldt; CHAMELEON_Complex64_t *WORK; int ldwork; + size_t lwork; A1 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); A2 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); @@ -208,10 +210,12 @@ static void cl_zttmqr_cpu_func(void *descr[], void *cl_arg) WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */ starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, - &lda1, &lda2, &ldv, &ldt, &ldwork); + &lda1, &lda2, &ldv, &ldt, &ldwork, &lwork ); CORE_zttmqr(side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); + + (void)lwork; } #if defined(CHAMELEON_USE_CUDA) @@ -233,9 +237,9 @@ static void cl_zttmqr_cuda_func(void *descr[], void *cl_arg) int ldv; cuDoubleComplex *T; int ldt; - cuDoubleComplex *W, *WC; + cuDoubleComplex *W; int ldwork; - int ldworkc; + size_t lwork; A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); @@ -243,22 +247,21 @@ static void cl_zttmqr_cuda_func(void *descr[], void *cl_arg) T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */ - starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, - &lda1, &lda2, &ldv, &ldt, &ldwork); - - WC = W + ib * (side == ChamLeft ? m1 : n1); - ldworkc = (side == ChamLeft) ? m2 : ib; + starpu_codelet_unpack_args( cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, + &lda1, &lda2, &ldv, &ldt, &ldwork, &lwork ); RUNTIME_getStream(stream); CUDA_zttmqr( side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, - W, ldwork, WC, ldworkc, stream ); + W, lwork, stream ); #ifndef STARPU_CUDA_ASYNC cudaStreamSynchronize( stream ); #endif + + (void)ldwork; } #endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ -- GitLab