diff --git a/runtime/starpu/codelets/codelet_ztsmqr.c b/runtime/starpu/codelets/codelet_ztsmqr.c
index 669b2d1b49e023331771e7fc3ae6e2f2c0d4ce81..ed4f7dbe9303d36cef5385dd33cddb54b9583fd4 100644
--- a/runtime/starpu/codelets/codelet_ztsmqr.c
+++ b/runtime/starpu/codelets/codelet_ztsmqr.c
@@ -299,7 +299,6 @@ static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg)
 #endif /* defined(CHAMELEON_USE_CUDA) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
-
 /*
  * Codelet definition
  */
diff --git a/runtime/starpu/codelets/codelet_zttmqr.c b/runtime/starpu/codelets/codelet_zttmqr.c
index 3368c6da7ecfb8bd00a7d269d98e88f487159144..6f2f600a8c83f1b299df8a951bddcb4abe019a7e 100644
--- a/runtime/starpu/codelets/codelet_zttmqr.c
+++ b/runtime/starpu/codelets/codelet_zttmqr.c
@@ -239,9 +239,62 @@ static void cl_zttmqr_cpu_func(void *descr[], void *cl_arg)
     CORE_zttmqr(side, trans, m1, n1, m2, n2, k, ib,
                 A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
 }
+
+#if defined(CHAMELEON_USE_CUDA)
+/*
+ * CUDA variant of the zttmqr codelet: fetches the tile pointers from the
+ * StarPU buffer descriptors, unpacks the scalar arguments and forwards
+ * everything to CUDA_zttmqr() on the stream returned by
+ * starpu_cuda_get_local_stream().
+ */
+static void cl_zttmqr_cuda_func(void *descr[], void *cl_arg)
+{
+    MORSE_enum side, trans;
+    int m1, n1, m2, n2, k, ib;
+    int lda1, lda2, ldv, ldt;
+    int ldwork, ldworkc;
+    cuDoubleComplex *A1, *A2, *V, *T;
+    cuDoubleComplex *W, *WC;
+    CUstream stream;
+
+    /* Scalar arguments carried by cl_arg */
+    starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib,
+                               &lda1, &lda2, &ldv, &ldt, &ldwork);
+
+    /* Tile pointers, one per StarPU buffer descriptor */
+    A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
+    A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
+    V  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
+    T  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]);
+    W  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */
+
+    /* WC points into the W buffer, ib*m1 (left) or ib*n1 (right) elements in */
+    if ( side == MorseLeft ) {
+        WC      = W + ib * m1;
+        ldworkc = m2;
+    }
+    else {
+        WC      = W + ib * n1;
+        ldworkc = ib;
+    }
+
+    /* Have subsequent cuBLAS calls run on the StarPU-provided stream */
+    stream = starpu_cuda_get_local_stream();
+    cublasSetKernelStream( stream );
+
+    CUDA_zttmqr( side, trans, m1, n1, m2, n2, k, ib,
+                 A1, lda1, A2, lda2, V, ldv, T, ldt,
+                 W, ldwork, WC, ldworkc, stream );
+
+#ifndef STARPU_CUDA_ASYNC
+    /* Without STARPU_CUDA_ASYNC, block until the stream has completed */
+    cudaStreamSynchronize( stream );
+#endif
+}
+#endif /* defined(CHAMELEON_USE_CUDA) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
-CODELETS_CPU(zttmqr, 5, cl_zttmqr_cpu_func)
+CODELETS(zttmqr, 5, cl_zttmqr_cpu_func, cl_zttmqr_cuda_func, STARPU_CUDA_ASYNC)