From 31e42080ba824dc38a6888a55496684f0418e6e0 Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Sun, 4 Dec 2016 22:36:56 +0000
Subject: [PATCH] Make CUDA_zunmqrt/CUDA_zunmlqt call CUDA_zlarfb instead of MAGMA

---
 coreblas/compute/core_ztsmqr.c  |  6 ++---
 cudablas/compute/CMakeLists.txt |  6 ++---
 cudablas/compute/cuda_ztsmqr.c  |  1 -
 cudablas/compute/cuda_zunmlqt.c | 44 ++++++++++++++++-----------------
 cudablas/compute/cuda_zunmqrt.c | 44 ++++++++++++++++-----------------
 cudablas/include/cudablas_z.h   |  1 +
 6 files changed, 51 insertions(+), 51 deletions(-)

diff --git a/coreblas/compute/core_ztsmqr.c b/coreblas/compute/core_ztsmqr.c
index 0792655ee..10d4f2d03 100644
--- a/coreblas/compute/core_ztsmqr.c
+++ b/coreblas/compute/core_ztsmqr.c
@@ -254,10 +254,10 @@ int CORE_ztsmqr(MORSE_enum side, MORSE_enum trans,
         CORE_zparfb(
             side, trans, MorseForward, MorseColumnwise,
             mi, ni, M2, N2, kb, 0,
-            &A1[LDA1*jc+ic], LDA1,
+            A1 + LDA1*jc+ic, LDA1,
             A2, LDA2,
-            &V[LDV*i], LDV,
-            &T[LDT*i], LDT,
+            V + LDV*i, LDV,
+            T + LDT*i, LDT,
             WORK, LDWORK);
     }
     return MORSE_SUCCESS;
diff --git a/cudablas/compute/CMakeLists.txt b/cudablas/compute/CMakeLists.txt
index 7ef91aee0..d05705eb7 100644
--- a/cudablas/compute/CMakeLists.txt
+++ b/cudablas/compute/CMakeLists.txt
@@ -32,6 +32,7 @@ set(ZSRC
     cuda_zhemm.c
     cuda_zher2k.c
     cuda_zherk.c
+    cuda_zlarfb.c
     cuda_zparfb.c
     cuda_zsymm.c
     cuda_zsyr2k.c
@@ -40,6 +41,8 @@ set(ZSRC
     cuda_ztrsm.c
     cuda_ztsmlq.c
     cuda_ztsmqr.c
+    cuda_zunmlqt.c
+    cuda_zunmqrt.c
     )
 
 if( CHAMELEON_USE_MAGMA )
@@ -50,15 +53,12 @@ if( CHAMELEON_USE_MAGMA )
     cuda_zgessm.c
     cuda_zgetrf.c
     cuda_zlauum.c
-    cuda_zparfb.c
     cuda_zpotrf.c
     cuda_zssssm.c
     cuda_ztrtri.c
     cuda_ztslqt.c
     cuda_ztsqrt.c
     cuda_ztstrf.c
-    cuda_zunmlqt.c
-    cuda_zunmqrt.c
     )
 endif()
 
diff --git a/cudablas/compute/cuda_ztsmqr.c b/cudablas/compute/cuda_ztsmqr.c
index a234d2f21..e104a085f 100644
--- a/cudablas/compute/cuda_ztsmqr.c
+++ b/cudablas/compute/cuda_ztsmqr.c
@@ -132,7 +132,6 @@ int CUDA_ztsmqr(
             ni = N1 - i;
             jc = i;
         }
-
         /*
          * Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb)
          */
diff --git a/cudablas/compute/cuda_zunmlqt.c b/cudablas/compute/cuda_zunmlqt.c
index 55ae68ed6..c9a154733 100644
--- a/cudablas/compute/cuda_zunmlqt.c
+++ b/cudablas/compute/cuda_zunmlqt.c
@@ -24,14 +24,14 @@
  **/
 #include "cudablas/include/cudablas.h"
 
-#if defined(CHAMELEON_USE_MAGMA)
-int CUDA_zunmlqt(
-        magma_side_t side, magma_trans_t trans,
-        magma_int_t M, magma_int_t N, magma_int_t K, magma_int_t IB,
-        const magmaDoubleComplex *A,    magma_int_t LDA,
-        const magmaDoubleComplex *T,    magma_int_t LDT,
-        magmaDoubleComplex *C,    magma_int_t LDC,
-        magmaDoubleComplex *WORK, magma_int_t LDWORK )
+int
+CUDA_zunmlqt(MORSE_enum side, MORSE_enum trans,
+             int M, int N, int K, int IB,
+             const cuDoubleComplex *A,    int LDA,
+             const cuDoubleComplex *T,    int LDT,
+             cuDoubleComplex *C,    int LDC,
+             cuDoubleComplex *WORK, int LDWORK,
+             CUBLAS_STREAM_PARAM )
 {
     int i, kb;
     int i1, i3;
@@ -42,13 +42,13 @@ int CUDA_zunmlqt(
     int mi = M;
 
     /* Check input arguments */
-    if ((side != MagmaLeft) && (side != MagmaRight)) {
+    if ((side != MorseLeft) && (side != MorseRight)) {
         return -1;
     }
     /*
      * NQ is the order of Q and NW is the minimum dimension of WORK
      */
-    if (side == MagmaLeft) {
+    if (side == MorseLeft) {
         nq = M;
         nw = N;
     }
@@ -57,7 +57,7 @@ int CUDA_zunmlqt(
         nw = M;
     }
 
-    if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) {
+    if ((trans != MorseNoTrans) && (trans != MorseConjTrans)) {
         return -2;
     }
     if (M < 0) {
@@ -84,10 +84,10 @@ int CUDA_zunmlqt(
 
     /* Quick return */
     if ((M == 0) || (N == 0) || (K == 0))
-        return MAGMA_SUCCESS;
+        return MORSE_SUCCESS;
 
-    if (((side == MagmaLeft) && (trans == MagmaNoTrans))
-        || ((side == MagmaRight) && (trans != MagmaNoTrans))) {
+    if (((side == MorseLeft) && (trans == MorseNoTrans))
+        || ((side == MorseRight) && (trans != MorseNoTrans))) {
         i1 = 0;
         i3 = IB;
     }
@@ -106,7 +106,7 @@ int CUDA_zunmlqt(
     for(i = i1; (i >- 1) && (i < K); i+=i3 ) {
         kb = min(IB, K-i);
 
-        if (side == MagmaLeft) {
+        if (side == MorseLeft) {
             /*
              * H or H' is applied to C(i:m,1:n)
              */
@@ -121,13 +121,13 @@ int CUDA_zunmlqt(
             jc = i;
         }
 
-        magma_zlarfb_gpu( side, trans, MagmaForward, MagmaRowwise,
-                          mi, ni, kb,
-                          A + LDA * i  + i,  LDA,
-                          T + LDT * i,       LDT,
-                          C + LDC * jc + ic, LDC,
-                          WORK, LDWORK);
+        CUDA_zlarfb( side, trans, MorseForward, MorseRowwise,
+                     mi, ni, kb,
+                     A + LDA * i  + i,  LDA,
+                     T + LDT * i,       LDT,
+                     C + LDC * jc + ic, LDC,
+                     WORK, LDWORK, CUBLAS_STREAM_VALUE);
     }
     return MORSE_SUCCESS;
 }
-#endif
+
diff --git a/cudablas/compute/cuda_zunmqrt.c b/cudablas/compute/cuda_zunmqrt.c
index e868b1e92..6032cabc4 100644
--- a/cudablas/compute/cuda_zunmqrt.c
+++ b/cudablas/compute/cuda_zunmqrt.c
@@ -24,14 +24,14 @@
  **/
 #include "cudablas/include/cudablas.h"
 
-#if defined(CHAMELEON_USE_MAGMA)
-int CUDA_zunmqrt(
-        magma_side_t side, magma_trans_t trans,
-        magma_int_t M, magma_int_t N, magma_int_t K, magma_int_t IB,
-        const magmaDoubleComplex *A,    magma_int_t LDA,
-        const magmaDoubleComplex *T,    magma_int_t LDT,
-        magmaDoubleComplex *C,    magma_int_t LDC,
-        magmaDoubleComplex *WORK, magma_int_t LDWORK )
+int
+CUDA_zunmqrt(MORSE_enum side, MORSE_enum trans,
+             int M, int N, int K, int IB,
+             const cuDoubleComplex *A,    int LDA,
+             const cuDoubleComplex *T,    int LDT,
+             cuDoubleComplex *C,    int LDC,
+             cuDoubleComplex *WORK, int LDWORK,
+             CUBLAS_STREAM_PARAM )
 {
     int i, kb;
     int i1, i3;
@@ -42,13 +42,13 @@ int CUDA_zunmqrt(
     int mi = M;
 
     /* Check input arguments */
-    if ((side != MagmaLeft) && (side != MagmaRight)) {
+    if ((side != MorseLeft) && (side != MorseRight)) {
         return -1;
     }
     /*
      * NQ is the order of Q and NW is the minimum dimension of WORK
      */
-    if (side == MagmaLeft) {
+    if (side == MorseLeft) {
         nq = M;
         nw = N;
     }
@@ -57,7 +57,7 @@ int CUDA_zunmqrt(
         nw = M;
     }
 
-    if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) {
+    if ((trans != MorseNoTrans) && (trans != MorseConjTrans)) {
         return -2;
     }
     if (M < 0) {
@@ -84,10 +84,10 @@ int CUDA_zunmqrt(
 
     /* Quick return */
     if ((M == 0) || (N == 0) || (K == 0))
-        return MAGMA_SUCCESS;
+        return MORSE_SUCCESS;
 
-    if (((side == MagmaLeft) && (trans != MagmaNoTrans))
-        || ((side == MagmaRight) && (trans == MagmaNoTrans))) {
+    if (((side == MorseLeft) && (trans != MorseNoTrans))
+        || ((side == MorseRight) && (trans == MorseNoTrans))) {
         i1 = 0;
         i3 = IB;
     }
@@ -99,7 +99,7 @@ int CUDA_zunmqrt(
     for(i = i1; (i >- 1) && (i < K); i+=i3 ) {
         kb = min(IB, K-i);
 
-        if (side == MagmaLeft) {
+        if (side == MorseLeft) {
             /*
              * H or H' is applied to C(i:m,1:n)
              */
@@ -114,14 +114,14 @@ int CUDA_zunmqrt(
             jc = i;
         }
 
-        magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
-                          mi, ni, kb,
-                          A + LDA * i  + i,  LDA,
-                          T + LDT * i,       LDT,
-                          C + LDC * jc + ic, LDC,
-                          WORK, LDWORK);
+        CUDA_zlarfb( side, trans, MorseForward, MorseColumnwise,
+                     mi, ni, kb,
+                     A + LDA * i  + i,  LDA,
+                     T + LDT * i,       LDT,
+                     C + LDC * jc + ic, LDC,
+                     WORK, LDWORK,
+                     CUBLAS_STREAM_VALUE );
     }
 
     return MORSE_SUCCESS;
 }
-#endif
diff --git a/cudablas/include/cudablas_z.h b/cudablas/include/cudablas_z.h
index 064583858..6f622b44e 100644
--- a/cudablas/include/cudablas_z.h
+++ b/cudablas/include/cudablas_z.h
@@ -54,6 +54,7 @@ int CUDA_zgeqrt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComple
 int CUDA_zgessm( char storev, magma_int_t m, magma_int_t n, magma_int_t k, magma_int_t ib, magma_int_t *ipiv, cuDoubleComplex *dL1, magma_int_t lddl1, cuDoubleComplex *dL, magma_int_t lddl, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info);
 int CUDA_zgetrf_incpiv( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, magma_int_t *ipiv, cuDoubleComplex *dwork, magma_int_t lddwork, magma_int_t *info);
 int CUDA_zgetrf_nopiv( magma_int_t m, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info);
+int CUDA_zlarfb(MORSE_enum side, MORSE_enum trans, MORSE_enum direct, MORSE_enum storev, int M, int N, int K, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM );
 int CUDA_zlauum( char uplo, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info);
 int CUDA_zpotrf( magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info);
 int CUDA_zssssm( magma_storev_t storev, magma_int_t m1, magma_int_t n1, magma_int_t m2, magma_int_t n2, magma_int_t k, magma_int_t ib, magmaDoubleComplex *dA1, magma_int_t ldda1, magmaDoubleComplex *dA2, magma_int_t ldda2, magmaDoubleComplex *dL1, magma_int_t lddl1, magmaDoubleComplex *dL2, magma_int_t lddl2, magma_int_t *IPIV, magma_int_t *info);
-- 
GitLab