diff --git a/compute/pzunmlqrh.c b/compute/pzunmlqrh.c index e26da4f036483a469d9b16b3d51b0a61d62bb094..ce3cbd1dba54d21a3834cac5308c42d6c8bec30a 100644 --- a/compute/pzunmlqrh.c +++ b/compute/pzunmlqrh.c @@ -77,7 +77,7 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans, */ ws_worker = A->nb * ib; -#if defined(CHAMELEON_USE_MAGMA) +#if defined(CHAMELEON_USE_CUDA) /* Worker space * * zunmlq = A->nb * ib @@ -118,7 +118,7 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans, MorseUpper, tempkmin, tempNn, A->nb, A(k, N), ldak, DIAG(k, N), ldak ); -#if defined(CHAMELEON_USE_MAGMA) +#if defined(CHAMELEON_USE_CUDA) MORSE_TASK_zlaset( &options, MorseLower, tempkmin, tempNn, @@ -230,7 +230,7 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans, MorseUpper, tempkmin, tempNn, A->nb, A(k, N), ldak, DIAG(k, N), ldak ); -#if defined(CHAMELEON_USE_MAGMA) +#if defined(CHAMELEON_USE_CUDA) MORSE_TASK_zlaset( &options, MorseLower, tempkmin, tempNn, @@ -307,7 +307,7 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans, MorseUpper, tempkmin, tempNn, A->nb, A(k, N), ldak, DIAG(k, N), ldak ); -#if defined(CHAMELEON_USE_MAGMA) +#if defined(CHAMELEON_USE_CUDA) MORSE_TASK_zlaset( &options, MorseLower, tempkmin, tempNn, @@ -346,7 +346,7 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans, MorseUpper, tempkmin, tempNn, A->nb, A(k, N), ldak, DIAG(k, N), ldak ); -#if defined(CHAMELEON_USE_MAGMA) +#if defined(CHAMELEON_USE_CUDA) MORSE_TASK_zlaset( &options, MorseLower, tempkmin, tempNn, diff --git a/cudablas/compute/cuda_zlarfb.c b/cudablas/compute/cuda_zlarfb.c index 99edcbe4adfe7e3655c2d4302854979354cfcdcc..a85475858ec48bbefd2f74212b1c34ee275db658 100644 --- a/cudablas/compute/cuda_zlarfb.c +++ b/cudablas/compute/cuda_zlarfb.c @@ -90,15 +90,22 @@ CUDA_zlarfb(MORSE_enum side, MORSE_enum trans, else uplo = MorseLower; + if (storev == MorseColumnwise) { + notransV = MorseNoTrans; + transV = MorseConjTrans; + } + else { + notransV = MorseConjTrans; + transV = 
MorseNoTrans; + } + if ( side == MorseLeft ) { // Form H C or H^H C // Comments assume H C. When forming H^H C, T gets transposed via transT. - transV = (storev == MorseColumnwise) ? MorseNoTrans : MorseConjTrans; - // W = C^H V cublasZgemm( CUBLAS_HANDLE - morse_lapack_const(MorseConjTrans), morse_lapack_const(transV), + morse_lapack_const(MorseConjTrans), morse_lapack_const(notransV), N, K, M, CUBLAS_SADDR(zone), C, LDC, V, LDV, @@ -114,7 +121,7 @@ CUDA_zlarfb(MORSE_enum side, MORSE_enum trans, // C = C - V W^H = C - V T V^H C = (I - V T V^H) C = H C cublasZgemm( CUBLAS_HANDLE - morse_lapack_const(transV), morse_lapack_const(MorseConjTrans), + morse_lapack_const(notransV), morse_lapack_const(MorseConjTrans), M, N, K, CUBLAS_SADDR(mzone), V, LDV, WORK, LDWORK, @@ -124,11 +131,9 @@ CUDA_zlarfb(MORSE_enum side, MORSE_enum trans, // Form C H or C H^H // Comments assume C H. When forming C H^H, T gets transposed via trans. - transV = (storev == MorseColumnwise) ? MorseConjTrans : MorseNoTrans; - // W = C V cublasZgemm( CUBLAS_HANDLE - morse_lapack_const(MorseNoTrans), morse_lapack_const(transV), + morse_lapack_const(MorseNoTrans), morse_lapack_const(notransV), M, K, N, CUBLAS_SADDR(zone), C, LDC, V, LDV, diff --git a/runtime/starpu/codelets/codelet_ztsmlq.c b/runtime/starpu/codelets/codelet_ztsmlq.c index f074b8ae8bcf8859c6167310bdefd02074332c62..88128117d6db2449768a70b933f8d62d5de49174 100644 --- a/runtime/starpu/codelets/codelet_ztsmlq.c +++ b/runtime/starpu/codelets/codelet_ztsmlq.c @@ -266,8 +266,4 @@ static void cl_ztsmlq_cuda_func(void *descr[], void *cl_arg) /* * Codelet definition */ -#if defined(CHAMELEON_USE_CUDA) CODELETS(ztsmlq, 5, cl_ztsmlq_cpu_func, cl_ztsmlq_cuda_func, STARPU_CUDA_ASYNC) -#else -CODELETS_CPU(ztsmlq, 5, cl_ztsmlq_cpu_func) -#endif diff --git a/runtime/starpu/codelets/codelet_ztsmqr.c b/runtime/starpu/codelets/codelet_ztsmqr.c index ea579179f2b1edc1193f3fc0adece0a220fae134..3d2839e487ba403ccce8bbdd9fb4e4c1784db4df 100644 --- 
a/runtime/starpu/codelets/codelet_ztsmqr.c +++ b/runtime/starpu/codelets/codelet_ztsmqr.c @@ -299,8 +299,4 @@ static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg) /* * Codelet definition */ -#if defined(CHAMELEON_USE_CUDA) CODELETS(ztsmqr, 5, cl_ztsmqr_cpu_func, cl_ztsmqr_cuda_func, STARPU_CUDA_ASYNC) -#else -CODELETS_CPU(ztsmqr, 5, cl_ztsmqr_cpu_func) -#endif diff --git a/runtime/starpu/codelets/codelet_zunmlq.c b/runtime/starpu/codelets/codelet_zunmlq.c index dae9bbe1e904cc29145d9020a90a5ac414d6effb..4c913d17873ab5f397f040cf035e014ca5be1e04 100644 --- a/runtime/starpu/codelets/codelet_zunmlq.c +++ b/runtime/starpu/codelets/codelet_zunmlq.c @@ -225,8 +225,4 @@ static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg) /* * Codelet definition */ -#if defined(CHAMELEON_USE_CUDA) -CODELETS(zunmlq, 4, cl_zunmlq_cpu_func, cl_zunmlq_cuda_func, 0) -#else -CODELETS_CPU(zunmlq, 4, cl_zunmlq_cpu_func) -#endif +CODELETS(zunmlq, 4, cl_zunmlq_cpu_func, cl_zunmlq_cuda_func, STARPU_CUDA_ASYNC) diff --git a/runtime/starpu/codelets/codelet_zunmqr.c b/runtime/starpu/codelets/codelet_zunmqr.c index 2c2a5d66851b477e74c8fa6ef8e2704b98098e57..61bec4f56863a7160553df46f1025ff3f2461af0 100644 --- a/runtime/starpu/codelets/codelet_zunmqr.c +++ b/runtime/starpu/codelets/codelet_zunmqr.c @@ -251,8 +251,4 @@ static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg) /* * Codelet definition */ -#if defined(CHAMELEON_USE_CUDA) -CODELETS(zunmqr, 4, cl_zunmqr_cpu_func, cl_zunmqr_cuda_func, 0) -#else -CODELETS_CPU(zunmqr, 4, cl_zunmqr_cpu_func) -#endif +CODELETS(zunmqr, 4, cl_zunmqr_cpu_func, cl_zunmqr_cuda_func, STARPU_CUDA_ASYNC)