From b3bd42953d79847301ac8186353bfec9b44499d5 Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Sun, 4 Dec 2016 23:30:11 +0000
Subject: [PATCH] All LQ/QR routines are working on GPUs again. Let's hope I
 will not have to fix them one more time.

---
 compute/pzunmlqrh.c                      | 10 +++++-----
 cudablas/compute/cuda_zlarfb.c           | 19 ++++++++++++-------
 runtime/starpu/codelets/codelet_ztsmlq.c |  4 ----
 runtime/starpu/codelets/codelet_ztsmqr.c |  4 ----
 runtime/starpu/codelets/codelet_zunmlq.c |  6 +-----
 runtime/starpu/codelets/codelet_zunmqr.c |  6 +-----
 6 files changed, 19 insertions(+), 30 deletions(-)

diff --git a/compute/pzunmlqrh.c b/compute/pzunmlqrh.c
index e26da4f03..ce3cbd1db 100644
--- a/compute/pzunmlqrh.c
+++ b/compute/pzunmlqrh.c
@@ -77,7 +77,7 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
      */
     ws_worker = A->nb * ib;
 
-#if defined(CHAMELEON_USE_MAGMA)
+#if defined(CHAMELEON_USE_CUDA)
     /* Worker space
      *
      * zunmlq = A->nb * ib
@@ -118,7 +118,7 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
                         MorseUpper, tempkmin, tempNn, A->nb,
                         A(k, N), ldak,
                         DIAG(k, N), ldak );
-#if defined(CHAMELEON_USE_MAGMA)
+#if defined(CHAMELEON_USE_CUDA)
                     MORSE_TASK_zlaset(
                         &options,
                         MorseLower, tempkmin, tempNn,
@@ -230,7 +230,7 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
                         MorseUpper, tempkmin, tempNn, A->nb,
                         A(k, N), ldak,
                         DIAG(k, N), ldak );
-#if defined(CHAMELEON_USE_MAGMA)
+#if defined(CHAMELEON_USE_CUDA)
                     MORSE_TASK_zlaset(
                         &options,
                         MorseLower, tempkmin, tempNn,
@@ -307,7 +307,7 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
                         MorseUpper, tempkmin, tempNn, A->nb,
                         A(k, N), ldak,
                         DIAG(k, N), ldak );
-#if defined(CHAMELEON_USE_MAGMA)
+#if defined(CHAMELEON_USE_CUDA)
                     MORSE_TASK_zlaset(
                         &options,
                         MorseLower, tempkmin, tempNn,
@@ -346,7 +346,7 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
                         MorseUpper, tempkmin, tempNn, A->nb,
                         A(k, N), ldak,
                         DIAG(k, N), ldak );
-#if defined(CHAMELEON_USE_MAGMA)
+#if defined(CHAMELEON_USE_CUDA)
                     MORSE_TASK_zlaset(
                         &options,
                         MorseLower, tempkmin, tempNn,
diff --git a/cudablas/compute/cuda_zlarfb.c b/cudablas/compute/cuda_zlarfb.c
index 99edcbe4a..a85475858 100644
--- a/cudablas/compute/cuda_zlarfb.c
+++ b/cudablas/compute/cuda_zlarfb.c
@@ -90,15 +90,22 @@ CUDA_zlarfb(MORSE_enum side, MORSE_enum trans,
     else
         uplo = MorseLower;
 
+    if (storev == MorseColumnwise) {
+        notransV = MorseNoTrans;
+        transV   = MorseConjTrans;
+    }
+    else {
+        notransV = MorseConjTrans;
+        transV   = MorseNoTrans;
+    }
+
     if ( side == MorseLeft ) {
         // Form H C or H^H C
         // Comments assume H C. When forming H^H C, T gets transposed via transT.
 
-        transV = (storev == MorseColumnwise) ? MorseNoTrans : MorseConjTrans;
-
         // W = C^H V
         cublasZgemm( CUBLAS_HANDLE
-                     morse_lapack_const(MorseConjTrans), morse_lapack_const(transV),
+                     morse_lapack_const(MorseConjTrans), morse_lapack_const(notransV),
                      N, K, M,
                      CUBLAS_SADDR(zone),  C, LDC,
                                           V, LDV,
@@ -114,7 +121,7 @@ CUDA_zlarfb(MORSE_enum side, MORSE_enum trans,
 
         // C = C - V W^H = C - V T V^H C = (I - V T V^H) C = H C
         cublasZgemm( CUBLAS_HANDLE
-                     morse_lapack_const(transV), morse_lapack_const(MorseConjTrans),
+                     morse_lapack_const(notransV), morse_lapack_const(MorseConjTrans),
                      M, N, K,
                      CUBLAS_SADDR(mzone), V,    LDV,
                                           WORK, LDWORK,
@@ -124,11 +131,9 @@ CUDA_zlarfb(MORSE_enum side, MORSE_enum trans,
         // Form C H or C H^H
         // Comments assume C H. When forming C H^H, T gets transposed via trans.
 
-        transV = (storev == MorseColumnwise) ? MorseConjTrans : MorseNoTrans;
-
         // W = C V
         cublasZgemm( CUBLAS_HANDLE
-                     morse_lapack_const(MorseNoTrans), morse_lapack_const(transV),
+                     morse_lapack_const(MorseNoTrans), morse_lapack_const(notransV),
                      M, K, N,
                      CUBLAS_SADDR(zone),  C, LDC,
                                           V, LDV,
diff --git a/runtime/starpu/codelets/codelet_ztsmlq.c b/runtime/starpu/codelets/codelet_ztsmlq.c
index f074b8ae8..88128117d 100644
--- a/runtime/starpu/codelets/codelet_ztsmlq.c
+++ b/runtime/starpu/codelets/codelet_ztsmlq.c
@@ -266,8 +266,4 @@ static void cl_ztsmlq_cuda_func(void *descr[], void *cl_arg)
 /*
  * Codelet definition
  */
-#if defined(CHAMELEON_USE_CUDA)
 CODELETS(ztsmlq, 5, cl_ztsmlq_cpu_func, cl_ztsmlq_cuda_func, STARPU_CUDA_ASYNC)
-#else
-CODELETS_CPU(ztsmlq, 5, cl_ztsmlq_cpu_func)
-#endif
diff --git a/runtime/starpu/codelets/codelet_ztsmqr.c b/runtime/starpu/codelets/codelet_ztsmqr.c
index ea579179f..3d2839e48 100644
--- a/runtime/starpu/codelets/codelet_ztsmqr.c
+++ b/runtime/starpu/codelets/codelet_ztsmqr.c
@@ -299,8 +299,4 @@ static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg)
 /*
  * Codelet definition
  */
-#if defined(CHAMELEON_USE_CUDA)
 CODELETS(ztsmqr, 5, cl_ztsmqr_cpu_func, cl_ztsmqr_cuda_func, STARPU_CUDA_ASYNC)
-#else
-CODELETS_CPU(ztsmqr, 5, cl_ztsmqr_cpu_func)
-#endif
diff --git a/runtime/starpu/codelets/codelet_zunmlq.c b/runtime/starpu/codelets/codelet_zunmlq.c
index dae9bbe1e..4c913d178 100644
--- a/runtime/starpu/codelets/codelet_zunmlq.c
+++ b/runtime/starpu/codelets/codelet_zunmlq.c
@@ -225,8 +225,4 @@ static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg)
 /*
  * Codelet definition
  */
-#if defined(CHAMELEON_USE_CUDA)
-CODELETS(zunmlq, 4, cl_zunmlq_cpu_func, cl_zunmlq_cuda_func, 0)
-#else
-CODELETS_CPU(zunmlq, 4, cl_zunmlq_cpu_func)
-#endif
+CODELETS(zunmlq, 4, cl_zunmlq_cpu_func, cl_zunmlq_cuda_func, STARPU_CUDA_ASYNC)
diff --git a/runtime/starpu/codelets/codelet_zunmqr.c b/runtime/starpu/codelets/codelet_zunmqr.c
index 2c2a5d668..61bec4f56 100644
--- a/runtime/starpu/codelets/codelet_zunmqr.c
+++ b/runtime/starpu/codelets/codelet_zunmqr.c
@@ -251,8 +251,4 @@ static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg)
 /*
  * Codelet definition
  */
-#if defined(CHAMELEON_USE_CUDA)
-CODELETS(zunmqr, 4, cl_zunmqr_cpu_func, cl_zunmqr_cuda_func, 0)
-#else
-CODELETS_CPU(zunmqr, 4, cl_zunmqr_cpu_func)
-#endif
+CODELETS(zunmqr, 4, cl_zunmqr_cpu_func, cl_zunmqr_cuda_func, STARPU_CUDA_ASYNC)
-- 
GitLab