diff --git a/runtime/starpu/codelets/codelet_zunmlq.c b/runtime/starpu/codelets/codelet_zunmlq.c index 61a9ae8a5d94685a71ab0e5758e64023f76397c7..0f1c8a8bc277674f7a46528ab43f4559cfab9fdc 100644 --- a/runtime/starpu/codelets/codelet_zunmlq.c +++ b/runtime/starpu/codelets/codelet_zunmlq.c @@ -185,7 +185,7 @@ static void cl_zunmlq_cpu_func(void *descr[], void *cl_arg) A, lda, T, ldt, C, ldc, WORK, ldwork); } -#if defined(CHAMELEON_USE_MAGMA) +#if defined(CHAMELEON_USE_CUDA) static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg) { MORSE_starpu_ws_t *d_work; @@ -207,19 +207,24 @@ static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg) C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); WORK = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */ + stream = starpu_cuda_get_local_stream(); + cublasSetKernelStream( stream ); + CUDA_zunmlqt( side, trans, m, n, k, ib, - A, lda, T, ldt, C, ldc, WORK, ldwork ); + A, lda, T, ldt, C, ldc, WORK, ldwork, stream ); - cudaThreadSynchronize(); -} +#ifndef STARPU_CUDA_ASYNC + cudaStreamSynchronize( stream ); #endif +} +#endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -#if defined(CHAMELEON_USE_MAGMA) +#if defined(CHAMELEON_USE_CUDA) CODELETS(zunmlq, 4, cl_zunmlq_cpu_func, cl_zunmlq_cuda_func, 0) #else CODELETS_CPU(zunmlq, 4, cl_zunmlq_cpu_func) diff --git a/runtime/starpu/codelets/codelet_zunmqr.c b/runtime/starpu/codelets/codelet_zunmqr.c index e4e278fad8806aba04b55646b0eb1d70b15021b0..300f51d9b5f4fc99e8cd1813098e964585bb71b8 100644 --- a/runtime/starpu/codelets/codelet_zunmqr.c +++ b/runtime/starpu/codelets/codelet_zunmqr.c @@ -211,7 +211,7 @@ static void cl_zunmqr_cpu_func(void *descr[], void *cl_arg) A, lda, T, ldt, C, ldc, WORK, ldwork); } -#if defined(CHAMELEON_USE_MAGMA) +#if defined(CHAMELEON_USE_CUDA) static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg) { MORSE_starpu_ws_t *d_work; @@ -233,19 +233,24 @@ static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg) C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); WORK = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */ + stream = starpu_cuda_get_local_stream(); + cublasSetKernelStream( stream ); + CUDA_zunmqrt( side, trans, m, n, k, ib, - A, lda, T, ldt, C, ldc, WORK, ldwork ); + A, lda, T, ldt, C, ldc, WORK, ldwork, stream ); - cudaThreadSynchronize(); -} +#ifndef STARPU_CUDA_ASYNC + cudaStreamSynchronize( stream ); #endif +} +#endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -#if defined(CHAMELEON_USE_MAGMA) +#if defined(CHAMELEON_USE_CUDA) CODELETS(zunmqr, 4, cl_zunmqr_cpu_func, cl_zunmqr_cuda_func, 0) #else CODELETS_CPU(zunmqr, 4, cl_zunmqr_cpu_func)