From 91e7e27a2b84b306dab4e5a21122adf5c00884a6 Mon Sep 17 00:00:00 2001
From: Florent Pruvost <florent.pruvost@inria.fr>
Date: Thu, 17 Sep 2015 08:16:19 +0000
Subject: [PATCH] add cudablas library to make calls to cuda kernels (magma
 here, cublas will follow), no more calls to magma in runtime/starpu codelets

---
 CMakeLists.txt                                |  13 +
 control/common.h                              |   4 +-
 example/basic_zposv/CMakeLists.txt            |   4 +
 example/lapack_to_morse/CMakeLists.txt        |   4 +
 runtime/starpu/CMakeLists.txt                 |   5 +
 runtime/starpu/codelets/codelet_zcallback.c   |  11 +-
 runtime/starpu/codelets/codelet_zgelqt.c      | 163 +----
 runtime/starpu/codelets/codelet_zgeqrt.c      | 283 +-------
 runtime/starpu/codelets/codelet_zgessm.c      |  21 +-
 .../starpu/codelets/codelet_zgetrf_incpiv.c   |  13 +-
 .../starpu/codelets/codelet_zgetrf_nopiv.c    |   2 +-
 runtime/starpu/codelets/codelet_zlauum.c      |   8 +-
 runtime/starpu/codelets/codelet_zpotrf.c      |  11 +-
 runtime/starpu/codelets/codelet_zssssm.c      |   2 +-
 runtime/starpu/codelets/codelet_ztrtri.c      |  10 +-
 runtime/starpu/codelets/codelet_ztslqt.c      | 166 +----
 runtime/starpu/codelets/codelet_ztsmlq.c      | 132 +---
 runtime/starpu/codelets/codelet_ztsmqr.c      | 684 +-----------------
 runtime/starpu/codelets/codelet_ztsqrt.c      | 192 +----
 runtime/starpu/codelets/codelet_ztstrf.c      |  15 +-
 runtime/starpu/codelets/codelet_zunmlq.c      | 113 +--
 runtime/starpu/codelets/codelet_zunmqr.c      | 107 +--
 testing/CMakeLists.txt                        |   5 +
 timing/CMakeLists.txt                         |   7 +-
 24 files changed, 114 insertions(+), 1861 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3784f1019..323a92151 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -789,6 +789,9 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/cmake_modules/")
             )
             link_directories(${MAGMA_LIBRARY_DIRS})
         endif()
+        if(CHAMELEON_USE_CUDA OR CHAMELEON_USE_MAGMA)
+            list(APPEND CHAMELEON_DEP -lcudablas)
+        endif()
 
         list(APPEND CHAMELEON_DEP
         -lcoreblas
@@ -842,6 +845,16 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/cmake_modules/")
     #------------------------------------------------------------------------------
 
 
+    ###############################################################################
+    # Cudablas library (kernels for CUDAs) #
+    ########################################
+
+    if(CHAMELEON_USE_CUDA)
+        add_subdirectory(cudablas)
+    endif()
+    #------------------------------------------------------------------------------
+
+
     ###############################################################################
     # Main library #
     ################
diff --git a/control/common.h b/control/common.h
index 539e5e1cc..283286e8b 100644
--- a/control/common.h
+++ b/control/common.h
@@ -81,8 +81,10 @@
 #ifndef LAPACK_NAME
 #define LAPACK_NAME(a, b) lapackef77_##a
 #endif
-#include "coreblas/include/lapacke.h"
 #include "coreblas/include/coreblas.h"
+#if defined(CHAMELEON_USE_CUDA)
+#include "cudablas/include/cudablas.h"
+#endif
 
 #include "morse.h"
 
diff --git a/example/basic_zposv/CMakeLists.txt b/example/basic_zposv/CMakeLists.txt
index e545f7581..ec479cc05 100644
--- a/example/basic_zposv/CMakeLists.txt
+++ b/example/basic_zposv/CMakeLists.txt
@@ -69,6 +69,10 @@ endif()
 
 if(NOT CHAMELEON_SIMULATION)
 
+    if(CHAMELEON_USE_CUDA OR CHAMELEON_USE_MAGMA)
+        list(APPEND libs_for_examples
+        cudablas)
+    endif()
     if(CHAMELEON_USE_CUDA)
         list(APPEND libs_for_examples
              ${CUDA_LIBRARIES}
diff --git a/example/lapack_to_morse/CMakeLists.txt b/example/lapack_to_morse/CMakeLists.txt
index e531b183e..b40720e51 100644
--- a/example/lapack_to_morse/CMakeLists.txt
+++ b/example/lapack_to_morse/CMakeLists.txt
@@ -69,6 +69,10 @@ unset(libs_for_step0)
 
 if(NOT CHAMELEON_SIMULATION)
 
+    if(CHAMELEON_USE_CUDA OR CHAMELEON_USE_MAGMA)
+        list(APPEND libs_for_ltm
+        cudablas)
+    endif()
     if(CHAMELEON_USE_CUDA)
         list(APPEND libs_for_ltm
              ${CUDA_LIBRARIES}
diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt
index 6063f65d3..fbb9b504d 100644
--- a/runtime/starpu/CMakeLists.txt
+++ b/runtime/starpu/CMakeLists.txt
@@ -188,6 +188,11 @@ add_dependencies(chameleon_starpu
   control_include
   runtime_starpu_include
 )
+if (CHAMELEON_USE_CUDA)
+    add_dependencies(chameleon_starpu
+      cudablas_include
+    )
+endif()
 
 # installation
 # ------------
diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c
index 3fcf9d85b..3d7bce440 100644
--- a/runtime/starpu/codelets/codelet_zcallback.c
+++ b/runtime/starpu/codelets/codelet_zcallback.c
@@ -3,24 +3,25 @@
  * @copyright (c) 2009-2014 The University of Tennessee and The University
  *                          of Tennessee Research Foundation.
  *                          All rights reserved.
- * @copyright (c) 2012-2014 Inria. All rights reserved.
+ * @copyright (c) 2012-2015 Inria. All rights reserved.
  * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
  *
  **/
 
 /**
  *
- *  @file codelet_zpotrf.c
+ *  @file codelet_zcallback.c
  *
- *  MAGMA codelets kernel
- *  MAGMA is a software package provided by Univ. of Tennessee,
+ *  MORSE codelets kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
  *  Univ. of California Berkeley and Univ. of Colorado Denver,
  *  and INRIA Bordeaux Sud-Ouest
  *
  *  @version 2.3.1
  *  @author Mathieu Faverge
  *  @author Cedric Augonnet
- *  @date 2011-06-01
+ *  @author Florent Pruvost
+ *  @date 2015-09-16
  *  @precisions normal z -> c d s
  *
  **/
diff --git a/runtime/starpu/codelets/codelet_zgelqt.c b/runtime/starpu/codelets/codelet_zgelqt.c
index 665e88e95..f268f6363 100644
--- a/runtime/starpu/codelets/codelet_zgelqt.c
+++ b/runtime/starpu/codelets/codelet_zgelqt.c
@@ -150,154 +150,6 @@ static void cl_zgelqt_cpu_func(void *descr[], void *cl_arg)
 }
 
 #if defined(CHAMELEON_USE_MAGMA)
-magma_int_t
-magma_zgelqt_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
-                  magmaDoubleComplex *da, magma_int_t ldda,
-                  magmaDoubleComplex *v,  magma_int_t ldv,
-                  magmaDoubleComplex *dt, magma_int_t lddt,
-                  magmaDoubleComplex *t,  magma_int_t ldt,
-                  magmaDoubleComplex *dd,
-                  magmaDoubleComplex *d,  magma_int_t ldd,
-                  magmaDoubleComplex *tau,
-                  magmaDoubleComplex *hwork,
-                  magmaDoubleComplex *dwork)
-{
-#define da_ref(a_1,a_2) ( da+(a_2)*(ldda) + (a_1))
-#define v_ref(a_1,a_2)  ( v+(a_2)*(ldv) + (a_1))
-#define dt_ref(a_1,a_2) ( dt+(a_2)*(lddt) + (a_1))
-#define t_ref(a_1,a_2)  ( t+(a_2)*(ldt) + (a_1))
-
-    int i, k, ib, lddwork, old_i, old_ib, rows, cols;
-    double _Complex one=1.;
-    CUstream stream;
-
-    stream = starpu_cuda_get_local_stream();
-    cublasSetKernelStream( stream );
-
-    if (m < 0) {
-        return -1;
-    } else if (n < 0) {
-        return -2;
-    } else if (ldda < max(1,m)) {
-        return -4;
-    }
-
-    k = min(m,n);
-    if (k == 0) {
-        hwork[0] = *((magmaDoubleComplex*) &one);
-        return MAGMA_SUCCESS;
-    }
-
-    lddwork= m;
-
-    /* lower parts of little T must be zero: memset to 0 for simplicity */
-    memset(t_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex));
-    cudaMemset(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex));
-
-    /* copy first panel of A on the host */
-    cublasGetMatrix(min(m, nb), n, sizeof(magmaDoubleComplex),
-                    da_ref(0, 0), ldda,
-                    v, ldv);
-
-    /* Use blocked code initially */
-    for (i = 0; i < k; i += nb) {
-
-        ib = min(k-i, nb);
-        if (i+nb >= m) ib = min(m-i, nb);
-        cols = n-i;
-
-        if (i > 0){
-
-            /* copy panel of A from device to host */
-            cublasGetMatrix(ib, n, sizeof(magmaDoubleComplex),
-                            da_ref(i, 0), ldda,
-                            v, ldv);
-
-            /* Apply H' to A(i+2*ib:m, i:n) from the right */
-            rows = m-old_i-2*old_ib;
-            if (rows > 0){
-                magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
-                                  rows, n-old_i, old_ib,
-                                  da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt,
-                                  da_ref(old_i+2*old_ib, old_i), ldda,
-                                  dwork, lddwork);
-            }
-
-            /* copy the lower diag tile into d_A */
-            magma_zgemerge_gpu(MagmaRight, MagmaUnit, old_ib, old_ib,
-                               dd, ldd, da_ref(old_i, old_i), ldda, stream);
-
-        }
-
-        /* Form the triangular factor of the block reflector on the host
-         H = H'(i+ib-1) . . . H(i+1) H(i) */
-        CORE_zgelqt(ib, cols, ib,
-                    (double _Complex*) v_ref(0,i), ldv,
-                    (double _Complex*) t_ref(0,0), ldt,
-                    (double _Complex*) tau+i,
-                    (double _Complex*) hwork);
-
-        if ( i + ib < m ){
-            /* put 0s in the lower triangular part of a panel (and 1s on the
-             diagonal); copy the lower triangular in d */
-            CORE_zgesplit(MorseRight, MorseUnit, ib, min(cols,ib),
-                          (double _Complex*) v_ref(0,i), ldv,
-                          (double _Complex*) d, ldd);
-
-            /* copy from host to device a tile diag */
-            cublasSetMatrix( ib, min(cols,ib), sizeof(magmaDoubleComplex),
-                             d, ldd, dd, ldd );
-        }
-
-        /* Send the triangular factor T to the GPU */
-        cublasSetMatrix( ib, ib, sizeof(magmaDoubleComplex),
-                         t_ref(0,0), ldt, dt_ref(0,i), lddt );
-
-        /* A panel (with zeros in lower tri of its diag) is ready to be used
-         in input of zlarfb_gpu: we send the panel to the gpu */
-        cublasSetMatrix( ib, cols, sizeof(magmaDoubleComplex),
-                         v_ref(0,i), ldv, da_ref(i,i), ldda );
-
-        if (i + ib < m) {
-
-            if (i+2*ib < m){
-                rows = ib;
-            }
-            else{
-                rows = m-i-ib;
-            }
-            /* Apply H' to A(i+ib:i+2*ib, i:n) from the right */
-            magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
-                              rows, cols, ib, da_ref(i,i), ldda, dt_ref(0,i),
-                              lddt, da_ref(i+ib,i), ldda, dwork, lddwork);
-            old_i = i;
-            old_ib = ib;
-            if (i+nb >= k){
-                /* Apply H' to A(i+2*ib:m, i:n) from the right */
-                rows = m-old_i-2*old_ib;
-                if (rows > 0){
-                    magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
-                                      rows, cols, old_ib,
-                                      da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt,
-                                      da_ref(old_i+2*old_ib, old_i), ldda,
-                                      dwork, lddwork);
-                }
-                /* copy the upper diag tile into d_A */
-                magma_zgemerge_gpu(MagmaRight, MagmaUnit, old_ib, old_ib,
-                                   dd, ldd, da_ref(old_i, old_i), ldda, stream);
-            }
-        }
-
-    }
-
-#undef da_ref
-#undef v_ref
-#undef dt_ref
-#undef t_ref
-
-    return MAGMA_SUCCESS;
-} /* magma_zgelqt_gpu */
-
 static void cl_zgelqt_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_starpu_ws_t *h_work;
@@ -307,6 +159,7 @@ static void cl_zgelqt_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex *h_A, *h_T, *h_D, *h_W, *h_TAU;
     cuDoubleComplex *d_A, *d_T, *d_D, *d_W;
     int lda, ldt;
+    CUstream stream;
 
     starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work);
 
@@ -326,11 +179,15 @@ static void cl_zgelqt_cuda_func(void *descr[], void *cl_arg)
     h_W   = h_TAU + max(m,n);
     h_D   = h_W   + ib*ib;
 
-    magma_zgelqt_gpu( m, n, ib,
-                      d_A, lda, h_A, ib,
-                      d_T, ldt, h_T, ib,
-                      d_D, h_D, ib, h_TAU,
-                      h_W, d_W);
+    stream = starpu_cuda_get_local_stream();
+    cublasSetKernelStream( stream );
+
+    CUDA_zgelqt(
+            m, n, ib,
+            d_A, lda, h_A, ib,
+            d_T, ldt, h_T, ib,
+            d_D, h_D, ib, h_TAU,
+            h_W, d_W, stream);
 
     cudaThreadSynchronize();
 }
diff --git a/runtime/starpu/codelets/codelet_zgeqrt.c b/runtime/starpu/codelets/codelet_zgeqrt.c
index 7257676cd..880ca82ef 100644
--- a/runtime/starpu/codelets/codelet_zgeqrt.c
+++ b/runtime/starpu/codelets/codelet_zgeqrt.c
@@ -152,274 +152,6 @@ static void cl_zgeqrt_cpu_func(void *descr[], void *cl_arg)
 
 
 #if defined(CHAMELEON_USE_MAGMA)
-
-#if defined(CHAMELEON_USE_CUBLAS_V2)
-magma_int_t
-magma_zgemerge_gpu(magma_side_t side, magma_diag_t diag,
-                   magma_int_t M, magma_int_t N,
-                   magmaDoubleComplex *A, magma_int_t LDA,
-                   magmaDoubleComplex *B, magma_int_t LDB,
-                   CUstream stream)
-{
-    int i, j;
-    magmaDoubleComplex *cola, *colb;
-    cublasHandle_t handle;
-    cublasStatus_t stat;
-
-    stat = cublasCreate(&handle);
-    if (stat != CUBLAS_STATUS_SUCCESS) {
-        printf ("CUBLAS initialization failed\n");
-        assert( stat == CUBLAS_STATUS_SUCCESS );
-    }
-
-    stat = cublasSetStream(handle, stream);
-    if (stat != CUBLAS_STATUS_SUCCESS) {
-        printf ("cublasSetStream failed\n");
-        assert( stat == CUBLAS_STATUS_SUCCESS );
-    }
-
-    if (M < 0) {
-        return -1;
-    }
-    if (N < 0) {
-        return -2;
-    }
-    if ( (LDA < max(1,M)) && (M > 0) ) {
-        return -5;
-    }
-    if ( (LDB < max(1,M)) && (M > 0) ) {
-        return -7;
-    }
-
-    if (side == MagmaLeft){
-        for(i=0; i<N; i++){
-            cola = A + i*LDA;
-            colb = B + i*LDB;
-//            cublasZcopy(handle, i+1, cola, 1, colb, 1);
-            cudaMemcpyAsync(colb , cola,
-                            (i+1)*sizeof(cuDoubleComplex),
-                            cudaMemcpyDeviceToDevice, stream);
-        }
-    }else{
-        for(i=0; i<N; i++){
-            cola = A + i*LDA;
-            colb = B + i*LDB;
-//            cublasZcopy(handle, M-i, cola + i, 1, colb + i, 1);
-            cudaMemcpyAsync(colb+i , cola+i,
-                            (M-i)*sizeof(cuDoubleComplex),
-                            cudaMemcpyDeviceToDevice, stream);
-        }
-    }
-
-    cublasDestroy(handle);
-
-    return MAGMA_SUCCESS;
-}
-#else /* CHAMELEON_USE_CUBLAS_V2 */
-magma_int_t
-magma_zgemerge_gpu(magma_side_t side, magma_diag_t diag,
-                   magma_int_t M, magma_int_t N,
-                   magmaDoubleComplex *A, magma_int_t LDA,
-                   magmaDoubleComplex *B, magma_int_t LDB,
-                   CUstream stream)
-{
-    int i, j;
-    magmaDoubleComplex *cola, *colb;
-
-    if (M < 0) {
-        return -1;
-    }
-    if (N < 0) {
-        return -2;
-    }
-    if ( (LDA < max(1,M)) && (M > 0) ) {
-        return -5;
-    }
-    if ( (LDB < max(1,M)) && (M > 0) ) {
-        return -7;
-    }
-
-    if (side == MagmaLeft){
-        for(i=0; i<N; i++){
-            cola = A + i*LDA;
-            colb = B + i*LDB;
-//            cublasZcopy(i+1, cola, 1, colb, 1);
-            cudaMemcpyAsync(colb , cola,
-                            (i+1)*sizeof(cuDoubleComplex),
-                            cudaMemcpyDeviceToDevice, stream);
-        }
-    }else{
-        for(i=0; i<N; i++){
-            cola = A + i*LDA;
-            colb = B + i*LDB;
-//            cublasZcopy(M-i, cola + i, 1, colb + i, 1);
-            cudaMemcpyAsync(colb+i , cola+i,
-                            (M-i)*sizeof(cuDoubleComplex),
-                            cudaMemcpyDeviceToDevice, stream);
-        }
-    }
-
-    return MAGMA_SUCCESS;
-}
-#endif /* CHAMELEON_USE_CUBLAS_V2 */
-
-magma_int_t
-magma_zgeqrt_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
-                  magmaDoubleComplex *da, magma_int_t ldda,
-                  magmaDoubleComplex *v,  magma_int_t ldv,
-                  magmaDoubleComplex *dt, magma_int_t lddt,
-                  magmaDoubleComplex *t,  magma_int_t ldt,
-                  magmaDoubleComplex *dd,
-                  magmaDoubleComplex *d,  magma_int_t ldd,
-                  magmaDoubleComplex *tau,
-                  magmaDoubleComplex *hwork,
-                  magmaDoubleComplex *dwork)
-{
-#define da_ref(a_1,a_2) ( da+(a_2)*(ldda) + (a_1))
-#define v_ref(a_1,a_2)  ( v+(a_2)*(ldv) + (a_1))
-#define dt_ref(a_1,a_2) ( dt+(a_2)*(lddt) + (a_1))
-#define t_ref(a_1,a_2)  ( t+(a_2)*(ldt) + (a_1))
-
-    int i, k, ib, lddwork, old_i, old_ib, rows, cols;
-    double _Complex one=1.;
-    CUstream stream;
-
-    stream = starpu_cuda_get_local_stream();
-    cublasSetKernelStream( stream );
-//    int lwkopt = n * nb;
-//    hwork[0] = *((magmaDoubleComplex*) &lwkopt);
-
-    if (m < 0) {
-        return -1;
-    } else if (n < 0) {
-        return -2;
-    } else if (ldda < max(1,m)) {
-        return -4;
-    }
-
-    k = min(m,n);
-    if (k == 0) {
-        hwork[0] = *((magmaDoubleComplex*) &one);
-        return MAGMA_SUCCESS;
-    }
-
-    lddwork= k;
-
-    /* lower parts of little T must be zero: memset to 0 for simplicity */
-    memset(t_ref(0,0), 0, nb*nb*sizeof(magmaDoubleComplex));
-    cudaMemsetAsync(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex), stream);
-
-    /* copy first panel of A on the host */
-//	cublasGetMatrix(m, min(nb,n), sizeof(magmaDoubleComplex),
-//                    da_ref(0, 0), ldda,
-//                    v, ldv);
-    cudaMemcpy( v, da_ref(0,0),
-                m*min(nb,n)*sizeof(magmaDoubleComplex),
-                cudaMemcpyDeviceToHost);
-
-    /* Use blocked code initially */
-    for (i = 0; i < k; i += nb) {
-
-        ib = min(k-i, nb);
-        if (i+nb>=n) ib = min(n-i, nb);
-        rows = m-i;
-
-        if (i>0){
-
-            /* copy panel of A from device to host */
-//        	cublasGetMatrix(m, ib, sizeof(magmaDoubleComplex),
-//                            da_ref(0, i), ldda,
-//                            v, ldv);
-            /* copy panel of A from device to host */
-            cudaMemcpy( v, da_ref(0,i),
-                        m*ib*sizeof(magmaDoubleComplex),
-                        cudaMemcpyDeviceToHost);
-
-            /* Apply H' to A(i:m,i+2*ib:n) from the left */
-            cols = n-old_i-2*old_ib;
-            if (cols > 0){
-                magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
-                                  m-old_i, cols, old_ib,
-                                  da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt,
-                                  da_ref(old_i, old_i+2*old_ib), ldda,
-                                  dwork, cols);
-            }
-
-            /* copy the upper diag tile into d_A */
-            magma_zgemerge_gpu(MagmaLeft, MagmaUnit, old_ib, old_ib,
-                               dd, ldd, da_ref(old_i, old_i), ldda, stream);
-
-        }
-
-        /* Form the triangular factor of the block reflector on the host
-         H = H(i) H(i+1) . . . H(i+ib-1) */
-        CORE_zgeqrt(rows, ib, ib,
-                    (double _Complex*) v_ref(i,0), ldv,
-                    (double _Complex*) t_ref(0,0), ldt,
-                    (double _Complex*) tau+i,
-                    (double _Complex*) hwork);
-
-        if ( i + ib < n ){
-            /* put 0s in the upper triangular part of a panel (and 1s on the
-             diagonal); copy the upper triangular in d */
-            CORE_zgesplit(MorseLeft, MorseUnit, min(rows,ib), ib,
-                          (double _Complex*) v_ref(i,0), ldv,
-                          (double _Complex*) d, ldd);
-
-            /* copy from host to device a tile diag */
-            cublasSetMatrix( min(rows,ib), ib, sizeof(magmaDoubleComplex),
-                             d, ldd, dd, ldd );
-        }
-
-        /* Send the triangular factor T to the GPU */
-        cublasSetMatrix( ib, ib, sizeof(magmaDoubleComplex),
-                         t_ref(0,0), ldt, dt_ref(0,i), lddt );
-
-        /* A panel (with zeros in upper tri of its diag) is ready to be used
-         in input of zlarfb_gpu: we send the panel to the gpu */
-        cublasSetMatrix( rows, ib, sizeof(magmaDoubleComplex),
-                         v_ref(i,0), ldv, da_ref(i,i), ldda );
-
-        if (i + ib < n) {
-
-            if (i+2*ib < n){
-                cols = ib;
-            }
-            else{
-                cols = n-i-ib;
-            }
-            /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
-            magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
-                              rows, cols, ib, da_ref(i,i), ldda, dt_ref(0,i),
-                              lddt, da_ref(i,i+ib), ldda, dwork, cols);
-            old_i = i;
-            old_ib = ib;
-            if (i+nb>=k){
-                /* Apply H' to A(i:m,i+2*ib:n) from the left */
-                cols = n-old_i-2*old_ib;
-                if (cols > 0){
-                    magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
-                                      rows, cols, old_ib,
-                                      da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt,
-                                      da_ref(old_i, old_i+2*old_ib), ldda,
-                                      dwork, cols);
-                }
-                /* copy the upper diag tile into d_A */
-                magma_zgemerge_gpu(MagmaLeft, MagmaUnit, old_ib, old_ib,
-                                   dd, ldd, da_ref(old_i, old_i), ldda, stream);
-            }
-        }
-
-    }
-
-#undef da_ref
-#undef v_ref
-#undef dt_ref
-#undef t_ref
-
-    return MAGMA_SUCCESS;
-} /* magma_zgeqrt_gpu */
-
 static void cl_zgeqrt_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_starpu_ws_t *h_work;
@@ -429,6 +161,7 @@ static void cl_zgeqrt_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex *h_A, *h_T, *h_D, *h_W, *h_TAU;
     cuDoubleComplex *d_A, *d_T, *d_D, *d_W;
     int lda, ldt;
+    CUstream stream;
 
     starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work);
 
@@ -448,11 +181,15 @@ static void cl_zgeqrt_cuda_func(void *descr[], void *cl_arg)
     h_W   = h_TAU + max(m,n);
     h_D   = h_W   + ib*ib;
 
-    magma_zgeqrt_gpu( m, n, ib,
-                      d_A, lda, h_A, m,
-                      d_T, ldt, h_T, ib,
-                      d_D, h_D, ib, h_TAU,
-                      h_W, d_W);
+    stream = starpu_cuda_get_local_stream();
+    cublasSetKernelStream( stream );
+
+    CUDA_zgeqrt(
+            m, n, ib,
+            d_A, lda, h_A, m,
+            d_T, ldt, h_T, ib,
+            d_D, h_D, ib, h_TAU,
+            h_W, d_W, stream);
 
     cudaThreadSynchronize();
 }
diff --git a/runtime/starpu/codelets/codelet_zgessm.c b/runtime/starpu/codelets/codelet_zgessm.c
index a63c803d0..7a74296e5 100644
--- a/runtime/starpu/codelets/codelet_zgessm.c
+++ b/runtime/starpu/codelets/codelet_zgessm.c
@@ -146,9 +146,6 @@ static void cl_zgessm_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex *dL, *dD, *dA;
     int lddl, lddd, ldda;
     int info = 0;
-
-    int ret;
-
     /*
      *  hwork => nb*nb
      */
@@ -157,21 +154,9 @@ static void cl_zgessm_cuda_func(void *descr[], void *cl_arg)
     dA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
     starpu_codelet_unpack_args(cl_arg, &m, &n, &k, &ib, &IPIV, &lddl, &lddd, &ldda);
 
-    /* The kernel is just using the inverted part or nothing */
-    if ( lddl >= 2*ib ) {
-      dL += ib;
-      ret = magma_zgessm_gpu( MagmaColMajor, m, n, k, ib,
-			      IPIV, dL, lddl, dD, lddd, dA, ldda, &info );
-    }
-    else {
-      ret = magma_zgessm_gpu( MagmaColMajor, m, n, k, ib,
-			      IPIV, NULL, 1, dD, lddd, dA, ldda, &info );
-    }
-
-    if (ret != MAGMA_SUCCESS) {
-        fprintf(stderr, "Error in MAGMA: %d\n", ret);
-        exit(-1);
-    }
+    CUDA_zgessm(
+            MagmaColMajor, m, n, k, ib,
+            IPIV, dL, lddl, dD, lddd, dA, ldda, &info );
 
     cudaThreadSynchronize();
 
diff --git a/runtime/starpu/codelets/codelet_zgetrf_incpiv.c b/runtime/starpu/codelets/codelet_zgetrf_incpiv.c
index 489a48665..acd0ee798 100644
--- a/runtime/starpu/codelets/codelet_zgetrf_incpiv.c
+++ b/runtime/starpu/codelets/codelet_zgetrf_incpiv.c
@@ -214,12 +214,13 @@ static void cl_zgetrf_incpiv_cuda_func(void *descr[], void *cl_arg)
       hL = NULL;
     }
 
-    magma_zgetrf_incpiv_gpu( MagmaColMajor, m, n, ib,
-                             hA, lda, dA, lda,
-                             hL, ib,  dL, ldl,
-                             IPIV,
-                             dwork, lda,
-                             &info );
+    CUDA_zgetrf_incpiv(
+            MagmaColMajor, m, n, ib,
+            hA, lda, dA, lda,
+            hL, ib,  dL, ldl,
+            IPIV,
+            dwork, lda,
+            &info );
 
     cudaThreadSynchronize();
 }
diff --git a/runtime/starpu/codelets/codelet_zgetrf_nopiv.c b/runtime/starpu/codelets/codelet_zgetrf_nopiv.c
index 3db007331..0792adaba 100644
--- a/runtime/starpu/codelets/codelet_zgetrf_nopiv.c
+++ b/runtime/starpu/codelets/codelet_zgetrf_nopiv.c
@@ -136,7 +136,7 @@ static void cl_zgetrf_nopiv_cuda_func(void *descr[], void *cl_arg)
 
     starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &iinfo);
     dA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
-    magma_zgetrf_nopiv_gpu( m, n, dA, lda, &info );
+    CUDA_zgetrf_nopiv( m, n, dA, lda, &info );
     cudaThreadSynchronize();
 }
 #endif
diff --git a/runtime/starpu/codelets/codelet_zlauum.c b/runtime/starpu/codelets/codelet_zlauum.c
index 0c3ba14bd..2e206cc13 100644
--- a/runtime/starpu/codelets/codelet_zlauum.c
+++ b/runtime/starpu/codelets/codelet_zlauum.c
@@ -77,7 +77,6 @@ static void cl_zlauum_cpu_func(void *descr[], void *cl_arg)
 static void cl_zlauum_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum uplo;
-    int ret;
     int info = 0;
     int N;
     cuDoubleComplex *A;
@@ -85,12 +84,7 @@ static void cl_zlauum_cuda_func(void *descr[], void *cl_arg)
 
     A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     starpu_codelet_unpack_args(cl_arg, &uplo, &N, &LDA);
-    ret = magma_zlauum_gpu( uplo, N, A, LDA, &info);
-    if (ret != MAGMA_SUCCESS) {
-        fprintf(stderr, "Error in MAGMA: %d\n", ret);
-        exit(-1);
-    }
-
+    CUDA_zlauum( uplo, N, A, LDA, &info);
     cudaThreadSynchronize();
     return;
 }
diff --git a/runtime/starpu/codelets/codelet_zpotrf.c b/runtime/starpu/codelets/codelet_zpotrf.c
index 939a9e1da..705c81f77 100644
--- a/runtime/starpu/codelets/codelet_zpotrf.c
+++ b/runtime/starpu/codelets/codelet_zpotrf.c
@@ -89,8 +89,6 @@ static void cl_zpotrf_cuda_func(void *descr[], void *cl_arg)
     /* cuDoubleComplex *hA; */
     int lda;
     int iinfo;
-
-    int ret;
     int info = 0;
 
     A  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
@@ -107,14 +105,7 @@ static void cl_zpotrf_cuda_func(void *descr[], void *cl_arg)
 //         exit(-1);
 //     }
 
-    ret = magma_zpotrf_gpu(
-        uplo,
-        n, A, lda, &info);
-/*	hA, stream );*/
-     if (ret != MAGMA_SUCCESS) {
-        fprintf(stderr, "Error in MAGMA: %d\n", ret);
-        exit(-1);
-    }
+    CUDA_zpotrf( uplo, n, A, lda, &info);
 
     cudaThreadSynchronize();
 //     cudaStreamDestroy( stream[1] );
diff --git a/runtime/starpu/codelets/codelet_zssssm.c b/runtime/starpu/codelets/codelet_zssssm.c
index 019daf9f1..6837607a8 100644
--- a/runtime/starpu/codelets/codelet_zssssm.c
+++ b/runtime/starpu/codelets/codelet_zssssm.c
@@ -202,7 +202,7 @@ static void cl_zssssm_cuda_func(void *descr[], void *cl_arg)
         dL1 += ib;
     }
 
-    magma_zssssm_gpu(
+    CUDA_zssssm(
         MagmaColMajor, m1, n1, m2, n2, k, ib,
         dA1, lda1, dA2, lda2,
         dL1, ldl1, dL2, ldl2,
diff --git a/runtime/starpu/codelets/codelet_ztrtri.c b/runtime/starpu/codelets/codelet_ztrtri.c
index 4f7145c59..20f6ff44c 100644
--- a/runtime/starpu/codelets/codelet_ztrtri.c
+++ b/runtime/starpu/codelets/codelet_ztrtri.c
@@ -90,19 +90,11 @@ static void cl_ztrtri_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex *A;
     int LDA;
     int iinfo;
-
-    int ret;
     int info = 0;
 
     A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     starpu_codelet_unpack_args(cl_arg, &uplo, &diag, &N, &LDA, &iinfo);
-    ret = magma_ztrtri_gpu( uplo, diag,
-			    N, A, LDA, &info);
-     if (ret != MAGMA_SUCCESS) {
-        fprintf(stderr, "Error in MAGMA: %d\n", ret);
-        exit(-1);
-    }
-
+    CUDA_ztrtri( uplo, diag, N, A, LDA, &info);
     cudaThreadSynchronize();
     return;
 }
diff --git a/runtime/starpu/codelets/codelet_ztslqt.c b/runtime/starpu/codelets/codelet_ztslqt.c
index 6957cce31..264d50c51 100644
--- a/runtime/starpu/codelets/codelet_ztslqt.c
+++ b/runtime/starpu/codelets/codelet_ztslqt.c
@@ -171,159 +171,6 @@ static void cl_ztslqt_cpu_func(void *descr[], void *cl_arg)
 }
 
 #if defined(CHAMELEON_USE_MAGMA)
-
-magma_int_t
-magma_ztslqt_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
-                  magmaDoubleComplex *da1, magma_int_t ldda1,
-                  magmaDoubleComplex *da2, magma_int_t ldda2,
-                  magmaDoubleComplex *a2,  magma_int_t lda2,
-                  magmaDoubleComplex *dt,  magma_int_t lddt,
-                  magmaDoubleComplex *t,  magma_int_t ldt,
-                  magmaDoubleComplex *dd,
-                  magmaDoubleComplex *d,  magma_int_t ldd,
-                  magmaDoubleComplex *tau,
-                  magmaDoubleComplex *hwork,
-                  magmaDoubleComplex *dwork,
-                  CUstream stream)
-{
-#define da1_ref(a_1,a_2) ( da1+(a_2)*ldda1 + (a_1))
-#define da2_ref(a_1,a_2) ( da2+(a_2)*ldda2 + (a_1))
-#define a2_ref(a_1,a_2) ( a2+(a_2)*lda2 + (a_1))
-#define t_ref(a_1,a_2) ( t+(a_2)*ldt + (a_1))
-#define dt_ref(a_1,a_2) ( dt+(a_2)*lddt + (a_1))
-#define d_ref(a_1,a_2) ( d+(a_2)*ldd + (a_1))
-
-    int i, k, lddwork, old_i, old_ib, rows, cols;
-    int ib;
-    double _Complex one=1.;
-
-    if (m < 0) {
-        return -1;
-    } else if (n < 0) {
-        return -2;
-    } else if (ldda2 < max(1,m)) {
-        return -4;
-    }
-
-    k = min(m,n);
-    if (k == 0) {
-        hwork[0] = *((magmaDoubleComplex*) &one);
-        return MAGMA_SUCCESS;
-    }
-
-    lddwork= m;
-
-    /* lower parts of little T must be zero: memset all to 0 for simplicity */
-    memset(t, 0, nb*n*sizeof(magmaDoubleComplex));
-    cudaMemset(dt, 0, nb*n*sizeof(magmaDoubleComplex));
-
-    k = min(m, nb); // m can be lower than IB
-    /* copy the first diag tile of A1 from device to host: da1 -> d */
-    cublasGetMatrix(k, k, sizeof(magmaDoubleComplex),
-                    da1_ref(0, 0), ldda1,
-                    d, ldd);
-
-    /* copy first panel of A2 from device to host: da2 -> a2 */
-    cublasGetMatrix(k, n, sizeof(magmaDoubleComplex),
-                    da2_ref(0, 0), ldda2,
-                    a2, lda2);
-
-    /* This is only blocked code for now */
-    for (i = 0; i < m; i += nb) {
-
-        ib = min(m-i, nb);
-        cols = n;
-
-        /* Send the next panel (diagonal block of A1 & block column of A2)
-           to the CPU (in work_a1 and work_a2) and apply tsmqr update on the
-           remaining non updated panels */
-        if (i>0) {
-
-            /* copy the diag tile of A1 from device to host: da1 -> d */
-            cublasGetMatrix(ib, ib, sizeof(magmaDoubleComplex),
-                            da1_ref(i, i), ldda1,
-                            d, ldd);
-
-            /* copy panel of A2 from device to host: da2 -> a2 */
-            cublasGetMatrix(ib, cols, sizeof(magmaDoubleComplex),
-                            da2_ref(i, 0), ldda2,
-                            a2, lda2);
-
-            /* Apply H' to A(i+2*ib:m,i:n) from the left */
-            rows = m-old_i-2*old_ib;
-            if (rows > 0){
-                magma_ztsmlq_gpu( MagmaRight, MagmaConjTrans,
-                                  rows, old_ib, rows, cols, old_ib, old_ib,
-                                  da1_ref(old_i+2*old_ib, old_i), ldda1,
-                                  da2_ref(old_i+2*old_ib, 0), ldda2,
-                                  da2_ref(old_i, 0), ldda2,
-                                  dt_ref(0, old_i), lddt,
-                                  dwork, lddwork,
-                                  dwork + nb * lddwork, nb,
-                                  stream );
-            }
-
-        }
-
-        /* compute LQ factorization of the panel of A2 ib x cols */
-        CORE_ztslqt(ib, cols, ib,
-                    (double _Complex*) d, ldd,
-                    (double _Complex*) a2, lda2,
-                    (double _Complex*) t, ldt,
-                    (double _Complex*) tau,
-                    (double _Complex*) hwork);
-
-        /* Send the panel from A2 back to the GPU */
-        cublasSetMatrix(ib, cols, sizeof(magmaDoubleComplex),
-                        a2, lda2,
-                        da2_ref(i, 0), ldda2);
-
-        /* Send the triangular factor T from hwork to the GPU */
-        cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex),
-                        t, ldt,
-                        dt_ref(0, i), lddt);
-
-        /* get back the diag tile in A1 from host to device: d -> da1 */
-        cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex),
-                        d, ldd,
-                        da1_ref(i, i), ldda1);
-
-        /* tsmlq update on one panel forward (look ahead 1) */
-        if (i + ib < m) {
-
-            if (i+2*ib < m){
-                rows = ib;
-            }
-            else{
-                rows = m-i-ib;
-            }
-
-            /* Apply H' to A(i+ib:i+2*ib,i:n) from the right */
-            magma_ztsmlq_gpu( MagmaRight, MagmaConjTrans,
-                              rows, ib, rows, cols, ib, ib,
-                              da1_ref(i+ib, i), ldda1,
-                              da2_ref(i+ib, 0), ldda2,
-                              da2_ref(i, 0), ldda2,
-                              dt_ref(0, i), lddt,
-                              dwork, lddwork,
-                              dwork + nb * lddwork, nb,
-                              stream );
-            old_i = i;
-            old_ib = ib;
-        }
-    }
-
-#undef da1_ref
-#undef da2_ref
-#undef a2_ref
-#undef t_ref
-#undef dt_ref
-#undef d_ref
-
-    return MAGMA_SUCCESS;
-
-} /* magma_ztslqt_gpu */
-
 static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_starpu_ws_t *h_work;
@@ -353,12 +200,13 @@ static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg)
     h_D   = h_W   + ib*m;
 
     stream = starpu_cuda_get_local_stream();
-    magma_ztslqt_gpu( m, n, ib,
-                      d_A1, lda1, d_A2, lda2,
-                      h_A2, ib,
-                      d_T, ldt, h_T, ib,
-                      d_D, h_D, ib, h_TAU,
-                      h_W, d_W, stream);
+    CUDA_ztslqt(
+            m, n, ib,
+            d_A1, lda1, d_A2, lda2,
+            h_A2, ib,
+            d_T, ldt, h_T, ib,
+            d_D, h_D, ib, h_TAU,
+            h_W, d_W, stream);
 
     cudaThreadSynchronize();
 }
diff --git a/runtime/starpu/codelets/codelet_ztsmlq.c b/runtime/starpu/codelets/codelet_ztsmlq.c
index 03b74668e..0b78ac57d 100644
--- a/runtime/starpu/codelets/codelet_ztsmlq.c
+++ b/runtime/starpu/codelets/codelet_ztsmlq.c
@@ -213,135 +213,6 @@ static void cl_ztsmlq_cpu_func(void *descr[], void *cl_arg)
 }
 
 #if defined(CHAMELEON_USE_MAGMA)
-
-magma_int_t
-magma_ztsmlq_gpu( magma_side_t side, magma_trans_t trans,
-                  magma_int_t M1, magma_int_t N1,
-                  magma_int_t M2, magma_int_t N2,
-                  magma_int_t K, magma_int_t IB,
-                  magmaDoubleComplex *A1, magma_int_t LDA1,
-                  magmaDoubleComplex *A2, magma_int_t LDA2,
-                  const magmaDoubleComplex *V, magma_int_t LDV,
-                  const magmaDoubleComplex *T, magma_int_t LDT,
-                        magmaDoubleComplex *WORK,  magma_int_t LDWORK,
-                        magmaDoubleComplex *WORKC, magma_int_t LDWORKC,
-                  CUstream stream)
-{
-    int i, i1, i3;
-    int NW;
-    int kb;
-    int ic = 0;
-    int jc = 0;
-    int mi = M1;
-    int ni = N1;
-
-    /* Check input arguments */
-    if ((side != MagmaLeft) && (side != MagmaRight)) {
-        return -1;
-    }
-
-    /* NW is the minimum dimension of WORK */
-    if (side == MagmaLeft) {
-        NW = IB;
-    }
-    else {
-        NW = N1;
-    }
-
-    if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) {
-        return -2;
-    }
-    if (M1 < 0) {
-        return -3;
-    }
-    if (N1 < 0) {
-        return -4;
-    }
-    if ( (M2 < 0) ||
-         ( (M2 != M1) && (side == MagmaRight) ) ){
-        return -5;
-    }
-    if ( (N2 < 0) ||
-         ( (N2 != N1) && (side == MagmaLeft) ) ){
-        return -6;
-    }
-    if ((K < 0) ||
-        ( (side == MagmaLeft)  && (K > M1) ) ||
-        ( (side == MagmaRight) && (K > N1) ) ) {
-        return -7;
-    }
-    if (IB < 0) {
-        return -8;
-    }
-    if (LDA1 < max(1,M1)){
-        return -10;
-    }
-    if (LDA2 < max(1,M2)){
-        return -12;
-    }
-    if (LDV < max(1,K)){
-        return -14;
-    }
-    if (LDT < max(1,IB)){
-        return -16;
-    }
-    if (LDWORK < max(1,NW)){
-        return -18;
-    }
-
-    /* Quick return */
-    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0))
-        return MAGMA_SUCCESS;
-
-    if (((side == MagmaLeft) && (trans == MagmaNoTrans))
-        || ((side == MagmaRight) && (trans != MagmaNoTrans))) {
-        i1 = 0;
-        i3 = IB;
-    }
-    else {
-        i1 = ((K-1) / IB)*IB;
-        i3 = -IB;
-    }
-
-    if (trans == MagmaNoTrans) {
-        trans = MagmaConjTrans;
-    }
-    else {
-        trans = MagmaNoTrans;
-    }
-
-    for(i = i1; (i > -1) && (i < K); i += i3) {
-        kb = min(IB, K-i);
-
-        if (side == MagmaLeft) {
-            /*
-             * H or H' is applied to C(i:m,1:n)
-             */
-            mi = M1 - i;
-            ic = i;
-        }
-        else {
-            /*
-             * H or H' is applied to C(1:m,i:n)
-             */
-            ni = N1 - i;
-            jc = i;
-        }
-
-        /*
-         * Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb)
-         */
-        magma_zparfb_gpu( side, trans, MagmaForward, MagmaRowwise,
-                          mi, ni, M2, N2, kb, 0,
-                          A1 + LDA1*jc+ic, LDA1,
-                          A2, LDA2,
-                          V + i, LDV,
-                          T + LDT*i, LDT,
-                          WORK, LDWORK, WORKC, LDWORKC, stream );
-    }
-    return MAGMA_SUCCESS;
-}
-
 static void cl_ztsmlq_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum side;
@@ -380,7 +251,7 @@ static void cl_ztsmlq_cuda_func(void *descr[], void *cl_arg)
     stream = starpu_cuda_get_local_stream();
     cublasSetKernelStream( stream );
 
-    magma_ztsmlq_gpu( side, trans, m1, n1, m2, n2, k, ib,
+    CUDA_ztsmlq( side, trans, m1, n1, m2, n2, k, ib,
                       A1, lda1, A2, lda2, V, ldv, T, ldt,
                       W, ldwork, WC, ldworkc, stream );
 
@@ -388,7 +259,6 @@ static void cl_ztsmlq_cuda_func(void *descr[], void *cl_arg)
     cudaStreamSynchronize( stream );
 #endif
 }
-
 #endif
 
 /*
diff --git a/runtime/starpu/codelets/codelet_ztsmqr.c b/runtime/starpu/codelets/codelet_ztsmqr.c
index 46a586181..56dee7ab5 100644
--- a/runtime/starpu/codelets/codelet_ztsmqr.c
+++ b/runtime/starpu/codelets/codelet_ztsmqr.c
@@ -242,683 +242,6 @@ static void cl_ztsmqr_cpu_func(void *descr[], void *cl_arg)
 
 
 #if defined(CHAMELEON_USE_MAGMA)
-
-#if defined(CHAMELEON_USE_CUBLAS_V2)
-magma_int_t
-magma_zparfb_gpu(magma_side_t side, magma_trans_t trans,
-		         magma_direct_t direct, magma_storev_t storev,
-		         magma_int_t M1, magma_int_t N1,
-		         magma_int_t M2, magma_int_t N2,
-		         magma_int_t K, magma_int_t L,
-		               magmaDoubleComplex *A1, magma_int_t LDA1,
-		               magmaDoubleComplex *A2, magma_int_t LDA2,
-		         const magmaDoubleComplex *V, magma_int_t LDV,
-		         const magmaDoubleComplex *T, magma_int_t LDT,
-		               magmaDoubleComplex *WORK, magma_int_t LDWORK,
-		               magmaDoubleComplex *WORKC, magma_int_t LDWORKC,
-		               CUstream stream)
-
-{
-#if defined(PRECISION_z) || defined(PRECISION_c)
-    cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0);
-    cuDoubleComplex zone  = make_cuDoubleComplex(1.0, 0.0);
-    cuDoubleComplex mzone = make_cuDoubleComplex(-1.0, 0.0);
-#else
-    double zzero = 0.0;
-    double zone  = 1.0;
-    double mzone = -1.0;
-#endif /* defined(PRECISION_z) || defined(PRECISION_c) */
-
-    int j;
-    magma_trans_t transW;
-    magma_trans_t transA2;
-    cublasHandle_t handle;
-    cublasStatus_t stat;
-    cublasOperation_t cublasTrans;
-    cublasOperation_t cublasTransW;
-    cublasOperation_t cublasTransA2;
-
-    stat = cublasCreate(&handle);
-    if (stat != CUBLAS_STATUS_SUCCESS) {
-        printf ("CUBLAS initialization failed\n");
-        assert( stat == CUBLAS_STATUS_SUCCESS );
-    }
-
-    stat = cublasSetStream(handle, stream);
-    if (stat != CUBLAS_STATUS_SUCCESS) {
-        printf ("cublasSetStream failed\n");
-        assert( stat == CUBLAS_STATUS_SUCCESS );
-    }
-
-    if (trans == MagmaNoTrans){
-        cublasTrans = CUBLAS_OP_N;
-    }else if(trans == MagmaTrans){
-        cublasTrans = CUBLAS_OP_T;
-    }else if(trans == MagmaConjTrans){
-        cublasTrans = CUBLAS_OP_C;
-    }else{
-        fprintf(stderr, "Error in magma_zparfb_gpu: bad trans parameter %d\n", trans);
-    }
-
-    /* Check input arguments */
-    if ((side != MagmaLeft) && (side != MagmaRight)) {
-        return -1;
-    }
-    if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) {
-        return -2;
-    }
-    if ((direct != MagmaForward) && (direct != MagmaBackward)) {
-        return -3;
-    }
-    if ((storev != MagmaColumnwise) && (storev != MagmaRowwise)) {
-        return -4;
-    }
-    if (M1 < 0) {
-        return -5;
-    }
-    if (N1 < 0) {
-        return -6;
-    }
-    if ((M2 < 0) ||
-        ( (side == MagmaRight) && (M1 != M2) ) ) {
-        return -7;
-    }
-    if ((N2 < 0) ||
-        ( (side == MagmaLeft) && (N1 != N2) ) ) {
-        return -8;
-    }
-    if (K < 0) {
-        return -9;
-    }
-
-    /* Quick return */
-    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0))
-        return MAGMA_SUCCESS;
-
-    if (direct == MagmaForward) {
-
-        if (side == MagmaLeft) {
-
-            /*
-             * Column or Rowwise / Forward / Left
-             * ----------------------------------
-             *
-             * Form  H * A  or  H' * A  where  A = ( A1 )
-             *                                     ( A2 )
-             */
-
-            /*
-             * W = A1 + V' * A2:
-             *      W = A1
-             *      W = W + V' * A2
-             *
-             */
-            cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex),
-                               A1,   LDA1   * sizeof(cuDoubleComplex),
-                               K * sizeof(cuDoubleComplex), N1,
-                               cudaMemcpyDeviceToDevice, stream );
-
-            transW  = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans;
-            transA2 = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans;
-
-            if (transW == MagmaNoTrans){
-                cublasTransW = CUBLAS_OP_N;
-            }else if(transW == MagmaTrans){
-                cublasTransW = CUBLAS_OP_T;
-            }else if(transW == MagmaConjTrans){
-                cublasTransW = CUBLAS_OP_C;
-            }else{
-                fprintf(stderr, "Error in magma_zparfb_gpu: bad transW parameter %d\n", transW);
-            }
-            if (transA2 == MagmaNoTrans){
-                cublasTransA2 = CUBLAS_OP_N;
-            }else if(transA2 == MagmaTrans){
-                cublasTransA2 = CUBLAS_OP_T;
-            }else if(transA2 == MagmaConjTrans){
-                cublasTransA2 = CUBLAS_OP_C;
-            }else{
-                fprintf(stderr, "Error in magma_zparfb_gpu: bad transA2 parameter %d\n", transA2);
-            }
-
-            cublasZgemm(handle, cublasTransW, CUBLAS_OP_N,
-                        K, N1, M2,
-                        (const cuDoubleComplex *) &zone,
-                        (const cuDoubleComplex*)V     /* K*M2  */, LDV,
-                        (const cuDoubleComplex*)A2    /* M2*N1 */, LDA2,
-                        (const cuDoubleComplex *) &zone,
-                        (cuDoubleComplex*)WORK  /* K*N1  */, LDWORK);
-
-            if (WORKC == NULL) {
-                /* W = op(T) * W */
-                cublasZtrmm( handle,
-                    CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER,
-                    cublasTrans, CUBLAS_DIAG_NON_UNIT,
-                    K, N2,
-                    (const cuDoubleComplex *) &zone,
-                    (const cuDoubleComplex*)T, LDT,
-                    (cuDoubleComplex*)WORK, LDWORK,
-                    (cuDoubleComplex*)WORK, LDWORK);
-
-
-                /* A1 = A1 - W = A1 - op(T) * W */
-                for(j = 0; j < N1; j++) {
-                    cublasZaxpy(handle, K, (const cuDoubleComplex *) &mzone,
-                                (const cuDoubleComplex*)(WORK + LDWORK*j), 1,
-                                (cuDoubleComplex*)(A1 + LDA1*j), 1);
-                }
-
-                /* A2 = A2 - op(V) * W  */
-                cublasZgemm(handle, cublasTransA2, CUBLAS_OP_N,
-                            M2, N2, K,
-                            (const cuDoubleComplex *) &mzone,
-                            (const cuDoubleComplex*)V     /* M2*K  */, LDV,
-                            (const cuDoubleComplex*)WORK  /* K*N2  */, LDWORK,
-                            (const cuDoubleComplex *) &zone,
-                            (cuDoubleComplex*)A2    /* m2*N2 */, LDA2);
-
-            } else {
-                /* Wc = V * op(T) */
-                cublasZgemm( handle, cublasTransA2, cublasTrans,
-                             M2, K, K,
-                             (const cuDoubleComplex *) &zone,  V,     LDV,
-                                    T,     LDT,
-                             (const cuDoubleComplex *) &zzero, WORKC, LDWORKC );
-
-                /* A1 = A1 - opt(T) * W */
-                cublasZgemm( handle, cublasTrans, CUBLAS_OP_N,
-                             K, N1, K,
-                             (const cuDoubleComplex *) &mzone,
-                             (const cuDoubleComplex *)T,    LDT,
-                             (const cuDoubleComplex *)WORK, LDWORK,
-                             (const cuDoubleComplex *) &zone,
-                             (cuDoubleComplex*)A1,   LDA1 );
-
-                /* A2 = A2 - Wc * W */
-                cublasZgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N,
-                             M2, N2, K,
-                             (const cuDoubleComplex *) &mzone,
-                             (const cuDoubleComplex *)WORKC, LDWORKC,
-                             (const cuDoubleComplex *)WORK,  LDWORK,
-                             (const cuDoubleComplex *) &zone,
-                             (cuDoubleComplex *)A2,    LDA2 );
-            }
-        }
-        else {
-            /*
-             * Column or Rowwise / Forward / Right
-             * -----------------------------------
-             *
-             * Form  H * A  or  H' * A  where A  = ( A1 A2 )
-             *
-             */
-
-            /*
-             * W = A1 + A2 * V':
-             *      W = A1
-             *      W = W + A2 * V'
-             *
-             */
-            cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex),
-                               A1,   LDA1   * sizeof(cuDoubleComplex),
-                               M1 * sizeof(cuDoubleComplex), K,
-                               cudaMemcpyDeviceToDevice, stream );
-
-            transW  = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans;
-            transA2 = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans;
-
-            if (transW == MagmaNoTrans){
-                cublasTransW = CUBLAS_OP_N;
-            }else if(transW == MagmaTrans){
-                cublasTransW = CUBLAS_OP_T;
-            }else if(transW == MagmaConjTrans){
-                cublasTransW = CUBLAS_OP_C;
-            }else{
-                fprintf(stderr, "Error in magma_zparfb_gpu: bad transW parameter %d\n", transW);
-            }
-            if (transA2 == MagmaNoTrans){
-                cublasTransA2 = CUBLAS_OP_N;
-            }else if(transA2 == MagmaTrans){
-                cublasTransA2 = CUBLAS_OP_T;
-            }else if(transA2 == MagmaConjTrans){
-                cublasTransA2 = CUBLAS_OP_C;
-            }else{
-                fprintf(stderr, "Error in magma_zparfb_gpu: bad transA2 parameter %d\n", transA2);
-            }
-
-            cublasZgemm(handle, CUBLAS_OP_N, cublasTransW,
-                        M1, K, N2,
-                        (const cuDoubleComplex *) &zone,
-                        (const cuDoubleComplex*)A2    /* M1*N2 */, LDA2,
-                        (const cuDoubleComplex*)V     /* N2*K  */, LDV,
-                        (const cuDoubleComplex *) &zone,
-                        (cuDoubleComplex*)WORK  /* M1*K  */, LDWORK);
-
-            if (WORKC == NULL) {
-                /* W = W * op(T) */
-                cublasZtrmm( handle,
-                    CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER,
-                    cublasTrans, CUBLAS_DIAG_NON_UNIT,
-                    M2, K,
-                    (const cuDoubleComplex *) &zone,
-                    (const cuDoubleComplex*)T, LDT,
-                    (cuDoubleComplex*)WORK, LDWORK,
-                    (cuDoubleComplex*)WORK, LDWORK);
-
-
-                /* A1 = A1 - W = A1 - W * op(T) */
-                for(j = 0; j < K; j++) {
-                    cublasZaxpy(handle, M1, (const cuDoubleComplex *) &mzone,
-                        (const cuDoubleComplex*)(WORK + LDWORK*j), 1,
-                        (cuDoubleComplex*)(A1 + LDA1*j), 1);
-                }
-
-                /* A2 = A2 - W * op(V)  */
-                cublasZgemm(handle, CUBLAS_OP_N, cublasTransA2,
-                            M2, N2, K,
-                            (const cuDoubleComplex *) &mzone,
-                            (const cuDoubleComplex*)WORK  /* M2*K  */, LDWORK,
-                            (const cuDoubleComplex*)V     /* K*N2  */, LDV,
-                            (const cuDoubleComplex *) &zone,
-                            (cuDoubleComplex*)A2    /* M2*N2 */, LDA2);
-
-            } else {
-                /* A1 = A1 - W * opt(T) */
-                cublasZgemm( handle, CUBLAS_OP_N, cublasTrans,
-                    M1, K, K,
-                    (const cuDoubleComplex *) &mzone,
-                    (const cuDoubleComplex *)WORK, LDWORK,
-                    (const cuDoubleComplex *)T,    LDT,
-                    (const cuDoubleComplex *) &zone,
-                    (cuDoubleComplex *)A1,   LDA1 );
-
-                /* Wc = op(T) * V */
-                cublasZgemm( handle, cublasTrans, cublasTransA2,
-                             K, N2, K,
-                             (const cuDoubleComplex *) &zone,
-                             (const cuDoubleComplex *)T,     LDT,
-                             (const cuDoubleComplex *)V,     LDV,
-                             (const cuDoubleComplex *) &zzero,
-                             (cuDoubleComplex *)WORKC, LDWORKC );
-
-                /* A2 = A2 - W * Wc */
-                cublasZgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N,
-                             M2, N2, K,
-                             (const cuDoubleComplex *) &mzone,
-                             (const cuDoubleComplex *)WORK,  LDWORK,
-                             (const cuDoubleComplex *)WORKC, LDWORKC,
-                             (const cuDoubleComplex *) &zone,
-                             (cuDoubleComplex *)A2,    LDA2 );
-            }
-        }
-    }
-    else {
-        fprintf(stderr, "Not implemented (Backward / Left or Right)");
-        return MAGMA_ERR_NOT_SUPPORTED;
-    }
-
-    cublasDestroy(handle);
-
-    return MAGMA_SUCCESS;
-}
-#else /* CHAMELEON_USE_CUBLAS_V2 */
-magma_int_t
-magma_zparfb_gpu(magma_side_t side, magma_trans_t trans,
-                 magma_direct_t direct, magma_storev_t storev,
-                 magma_int_t M1, magma_int_t N1,
-                 magma_int_t M2, magma_int_t N2,
-                 magma_int_t K, magma_int_t L,
-                       magmaDoubleComplex *A1, magma_int_t LDA1,
-                       magmaDoubleComplex *A2, magma_int_t LDA2,
-                 const magmaDoubleComplex *V, magma_int_t LDV,
-                 const magmaDoubleComplex *T, magma_int_t LDT,
-                       magmaDoubleComplex *WORK, magma_int_t LDWORK,
-                       magmaDoubleComplex *WORKC, magma_int_t LDWORKC,
-                       CUstream stream)
-{
-#if defined(PRECISION_z) || defined(PRECISION_c)
-    cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0);
-    cuDoubleComplex zone  = make_cuDoubleComplex(1.0, 0.0);
-    cuDoubleComplex mzone = make_cuDoubleComplex(-1.0, 0.0);
-#else
-    double zzero = 0.0;
-    double zone  = 1.0;
-    double mzone = -1.0;
-#endif /* defined(PRECISION_z) || defined(PRECISION_c) */
-
-    int j;
-    magma_trans_t transW;
-    magma_trans_t transA2;
-
-    /* Check input arguments */
-    if ((side != MagmaLeft) && (side != MagmaRight)) {
-        return -1;
-    }
-    if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) {
-        return -2;
-    }
-    if ((direct != MagmaForward) && (direct != MagmaBackward)) {
-        return -3;
-    }
-    if ((storev != MagmaColumnwise) && (storev != MagmaRowwise)) {
-        return -4;
-    }
-    if (M1 < 0) {
-        return -5;
-    }
-    if (N1 < 0) {
-        return -6;
-    }
-    if ((M2 < 0) ||
-        ( (side == MagmaRight) && (M1 != M2) ) ) {
-        return -7;
-    }
-    if ((N2 < 0) ||
-        ( (side == MagmaLeft) && (N1 != N2) ) ) {
-        return -8;
-    }
-    if (K < 0) {
-        return -9;
-    }
-
-    /* Quick return */
-    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0))
-        return MAGMA_SUCCESS;
-
-    if (direct == MagmaForward) {
-
-        if (side == MagmaLeft) {
-
-            /*
-             * Column or Rowwise / Forward / Left
-             * ----------------------------------
-             *
-             * Form  H * A  or  H' * A  where  A = ( A1 )
-             *                                     ( A2 )
-             */
-
-            /*
-             * W = A1 + V' * A2:
-             *      W = A1
-             *      W = W + V' * A2
-             *
-             */
-            cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex),
-                               A1,   LDA1   * sizeof(cuDoubleComplex),
-                               K * sizeof(cuDoubleComplex), N1,
-                               cudaMemcpyDeviceToDevice, stream );
-
-            transW  = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans;
-            transA2 = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans;
-
-            cublasZgemm(morse_lapack_const(transW), morse_lapack_const(MagmaNoTrans),
-                        K, N1, M2,
-                        zone,
-                        (cuDoubleComplex*)V     /* K*M2  */, LDV,
-                        (cuDoubleComplex*)A2    /* M2*N1 */, LDA2,
-                        zone,
-                        (cuDoubleComplex*)WORK  /* K*N1  */, LDWORK);
-
-            if (WORKC == NULL) {
-                /* W = op(T) * W */
-                cublasZtrmm( morse_lapack_const(MagmaLeft), morse_lapack_const(MagmaUpper),
-                             morse_lapack_const(trans), morse_lapack_const(MagmaNonUnit),
-                             K, N2,
-                             zone,
-                             (cuDoubleComplex*)T, LDT,
-                             (cuDoubleComplex*)WORK, LDWORK);
-
-
-                /* A1 = A1 - W = A1 - op(T) * W */
-                for(j = 0; j < N1; j++) {
-                    cublasZaxpy(K, mzone,
-                                (cuDoubleComplex*)(WORK + LDWORK*j), 1,
-                                (cuDoubleComplex*)(A1 + LDA1*j), 1);
-                }
-
-                /* A2 = A2 - op(V) * W  */
-                cublasZgemm(morse_lapack_const(transA2), morse_lapack_const(MagmaNoTrans),
-                            M2, N2, K,
-                            mzone,
-                            (cuDoubleComplex*)V     /* M2*K  */, LDV,
-                            (cuDoubleComplex*)WORK  /* K*N2  */, LDWORK,
-                            zone,
-                            (cuDoubleComplex*)A2    /* m2*N2 */, LDA2);
-
-            } else {
-                /* Wc = V * op(T) */
-                cublasZgemm( morse_lapack_const(transA2), morse_lapack_const(trans),
-                             M2, K, K,
-                             zone,  V,     LDV,
-                                    T,     LDT,
-                             zzero, WORKC, LDWORKC );
-
-                /* A1 = A1 - opt(T) * W */
-                cublasZgemm( morse_lapack_const(trans), morse_lapack_const(MagmaNoTrans),
-                             K, N1, K,
-                             mzone, T,    LDT,
-                                    WORK, LDWORK,
-                             zone,  A1,   LDA1 );
-
-                /* A2 = A2 - Wc * W */
-                cublasZgemm( morse_lapack_const(MagmaNoTrans), morse_lapack_const(MagmaNoTrans),
-                             M2, N2, K,
-                             mzone, WORKC, LDWORKC,
-                                    WORK,  LDWORK,
-                             zone,  A2,    LDA2 );
-            }
-        }
-        else {
-            /*
-             * Column or Rowwise / Forward / Right
-             * -----------------------------------
-             *
-             * Form  H * A  or  H' * A  where A  = ( A1 A2 )
-             *
-             */
-
-            /*
-             * W = A1 + A2 * V':
-             *      W = A1
-             *      W = W + A2 * V'
-             *
-             */
-            cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex),
-                               A1,   LDA1   * sizeof(cuDoubleComplex),
-                               M1 * sizeof(cuDoubleComplex), K,
-                               cudaMemcpyDeviceToDevice, stream );
-
-            transW  = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans;
-            transA2 = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans;
-
-            cublasZgemm(morse_lapack_const(MagmaNoTrans), morse_lapack_const(transW),
-                        M1, K, N2,
-                        zone,
-                        (cuDoubleComplex*)A2    /* M1*N2 */, LDA2,
-                        (cuDoubleComplex*)V     /* N2*K  */, LDV,
-                        zone,
-                        (cuDoubleComplex*)WORK  /* M1*K  */, LDWORK);
-
-            if (WORKC == NULL) {
-                /* W = W * op(T) */
-                cublasZtrmm( morse_lapack_const(MagmaRight), morse_lapack_const(MagmaUpper),
-                             morse_lapack_const(trans), morse_lapack_const(MagmaNonUnit),
-                             M2, K,
-                             zone,
-                             (cuDoubleComplex*)T, LDT,
-                             (cuDoubleComplex*)WORK, LDWORK);
-
-
-                /* A1 = A1 - W = A1 - W * op(T) */
-                for(j = 0; j < K; j++) {
-                    cublasZaxpy(M1, mzone,
-                                (cuDoubleComplex*)(WORK + LDWORK*j), 1,
-                                (cuDoubleComplex*)(A1 + LDA1*j), 1);
-                }
-
-                /* A2 = A2 - W * op(V)  */
-                cublasZgemm(morse_lapack_const(MagmaNoTrans), morse_lapack_const(transA2),
-                            M2, N2, K,
-                            mzone,
-                            (cuDoubleComplex*)WORK  /* M2*K  */, LDWORK,
-                            (cuDoubleComplex*)V     /* K*N2  */, LDV,
-                            zone,
-                            (cuDoubleComplex*)A2    /* M2*N2 */, LDA2);
-
-            } else {
-                /* A1 = A1 - W * opt(T) */
-                cublasZgemm( morse_lapack_const(MagmaNoTrans), morse_lapack_const(trans),
-                             M1, K, K,
-                             mzone, WORK, LDWORK,
-                                    T,    LDT,
-                             zone,  A1,   LDA1 );
-
-                /* Wc = op(T) * V */
-                cublasZgemm( morse_lapack_const(trans), morse_lapack_const(transA2),
-                             K, N2, K,
-                             zone,  T,     LDT,
-                                    V,     LDV,
-                             zzero, WORKC, LDWORKC );
-
-                /* A2 = A2 - W * Wc */
-                cublasZgemm( morse_lapack_const(MagmaNoTrans), morse_lapack_const(MagmaNoTrans),
-                             M2, N2, K,
-                             mzone, WORK,  LDWORK,
-                                    WORKC, LDWORKC,
-                             zone,  A2,    LDA2 );
-            }
-        }
-    }
-    else {
-        fprintf(stderr, "Not implemented (Backward / Left or Right)");
-        return MAGMA_ERR_NOT_SUPPORTED;
-    }
-
-    return MAGMA_SUCCESS;
-}
-#endif /* CHAMELEON_USE_CUBLAS_V2 */
-
-magma_int_t
-magma_ztsmqr_gpu( magma_side_t side, magma_trans_t trans,
-                  magma_int_t M1, magma_int_t N1,
-                  magma_int_t M2, magma_int_t N2,
-                  magma_int_t K, magma_int_t IB,
-                  magmaDoubleComplex *A1, magma_int_t LDA1,
-                  magmaDoubleComplex *A2, magma_int_t LDA2,
-                  const magmaDoubleComplex *V, magma_int_t LDV,
-                  const magmaDoubleComplex *T, magma_int_t LDT,
-                        magmaDoubleComplex *WORK,  magma_int_t LDWORK,
-                        magmaDoubleComplex *WORKC, magma_int_t LDWORKC,
-                  CUstream stream)
-{
-    int i, i1, i3;
-    int NQ, NW;
-    int kb;
-    int ic = 0;
-    int jc = 0;
-    int mi = M1;
-    int ni = N1;
-
-    /* Check input arguments */
-    if ((side != MagmaLeft) && (side != MagmaRight)) {
-        return -1;
-    }
-
-    /* NQ is the order of Q */
-    if (side == MagmaLeft) {
-        NQ = M2;
-        NW = IB;
-    }
-    else {
-        NQ = N2;
-        NW = M1;
-    }
-
-    if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) {
-        return -2;
-    }
-    if (M1 < 0) {
-        return -3;
-    }
-    if (N1 < 0) {
-        return -4;
-    }
-    if ( (M2 < 0) ||
-         ( (M2 != M1) && (side == MagmaRight) ) ){
-        return -5;
-    }
-    if ( (N2 < 0) ||
-         ( (N2 != N1) && (side == MagmaLeft) ) ){
-        return -6;
-    }
-    if ((K < 0) ||
-        ( (side == MagmaLeft)  && (K > M1) ) ||
-        ( (side == MagmaRight) && (K > N1) ) ) {
-        return -7;
-    }
-    if (IB < 0) {
-        return -8;
-    }
-    if (LDA1 < max(1,M1)){
-        return -10;
-    }
-    if (LDA2 < max(1,M2)){
-        return -12;
-    }
-    if (LDV < max(1,NQ)){
-        return -14;
-    }
-    if (LDT < max(1,IB)){
-        return -16;
-    }
-    if (LDWORK < max(1,NW)){
-        return -18;
-    }
-
-    /* Quick return */
-    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0))
-        return MAGMA_SUCCESS;
-
-    if (((side == MagmaLeft)  && (trans != MagmaNoTrans))
-        || ((side == MagmaRight) && (trans == MagmaNoTrans))) {
-        i1 = 0;
-        i3 = IB;
-    }
-    else {
-        i1 = ((K-1) / IB)*IB;
-        i3 = -IB;
-    }
-
-    for(i = i1; (i > -1) && (i < K); i += i3) {
-        kb = min(IB, K-i);
-
-        if (side == MagmaLeft) {
-            /*
-             * H or H' is applied to C(i:m,1:n)
-             */
-            mi = M1 - i;
-            ic = i;
-        }
-        else {
-            /*
-             * H or H' is applied to C(1:m,i:n)
-             */
-            ni = N1 - i;
-            jc = i;
-        }
-        /*
-         * Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb)
-         */
-        magma_zparfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
-                          mi, ni, M2, N2, kb, 0,
-                          A1 + LDA1*jc+ic, LDA1,
-                          A2, LDA2,
-                          V + LDV*i, LDV,
-                          T + LDT*i, LDT,
-                          WORK, LDWORK, WORKC, LDWORKC, stream );
-    }
-    return MAGMA_SUCCESS;
-}
-
 static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum side;
@@ -957,9 +280,10 @@ static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg)
     stream = starpu_cuda_get_local_stream();
     cublasSetKernelStream( stream );
 
-    magma_ztsmqr_gpu( side, trans, m1, n1, m2, n2, k, ib,
-                      A1, lda1, A2, lda2, V, ldv, T, ldt,
-                      W, ldwork, WC, ldworkc, stream );
+    CUDA_ztsmqr(
+            side, trans, m1, n1, m2, n2, k, ib,
+            A1, lda1, A2, lda2, V, ldv, T, ldt,
+            W, ldwork, WC, ldworkc, stream );
 
 #ifndef STARPU_CUDA_ASYNC
     cudaStreamSynchronize( stream );
diff --git a/runtime/starpu/codelets/codelet_ztsqrt.c b/runtime/starpu/codelets/codelet_ztsqrt.c
index 8fd3383be..8fd166263 100644
--- a/runtime/starpu/codelets/codelet_ztsqrt.c
+++ b/runtime/starpu/codelets/codelet_ztsqrt.c
@@ -160,183 +160,7 @@ static void cl_ztsqrt_cpu_func(void *descr[], void *cl_arg)
     CORE_ztsqrt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
 }
 
-
 #if defined(CHAMELEON_USE_MAGMA)
-
-magma_int_t
-magma_ztsqrt2_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
-                   magmaDoubleComplex *da1, magma_int_t ldda1,
-                   magmaDoubleComplex *da2, magma_int_t ldda2,
-                   magmaDoubleComplex *a2,  magma_int_t lda2,
-                   magmaDoubleComplex *dt,  magma_int_t lddt,
-                   magmaDoubleComplex *t,  magma_int_t ldt,
-                   magmaDoubleComplex *dd,
-                   magmaDoubleComplex *d,  magma_int_t ldd,
-                   magmaDoubleComplex *tau,
-                   magmaDoubleComplex *hwork,
-                   magmaDoubleComplex *dwork,
-                   CUstream stream)
-{
-#define da1_ref(a_1,a_2) ( da1+(a_2)*ldda1 + (a_1))
-#define da2_ref(a_1,a_2) ( da2+(a_2)*ldda2 + (a_1))
-#define a2_ref(a_1,a_2) ( a2+(a_2)*lda2 + (a_1))
-#define t_ref(a_1,a_2) ( t+(a_2)*ldt + (a_1))
-#define dt_ref(a_1,a_2) ( dt+(a_2)*lddt + (a_1))
-#define d_ref(a_1,a_2) ( d+(a_2)*ldd + (a_1))
-
-	int i, k, lddwork, old_i, old_ib, rows, cols;
-	int ib;
-	double _Complex one=1.;
-//	int lwkopt = n * nb;
-//	hwork[0] = *((magmaDoubleComplex*) &lwkopt);
-
-	if (m < 0) {
-		return -1;
-	} else if (n < 0) {
-		return -2;
-	} else if (ldda2 < max(1,m)) {
-		return -4;
-	}
-
-	k = min(m,n);
-	if (k == 0) {
-		hwork[0] = *((magmaDoubleComplex*) &one);
-		return MAGMA_SUCCESS;
-	}
-
-	lddwork= nb;
-
-	/* lower parts of little T must be zero: memset all to 0 for simplicity */
-	memset(t, 0, nb*nb*sizeof(magmaDoubleComplex));
-	cudaMemset(dt, 0, nb*n*sizeof(magmaDoubleComplex));
-
-	/* copy the first diag tile of A1 from device to host: da1 -> d */
-	cublasGetMatrix(nb, nb, sizeof(magmaDoubleComplex),
-                    da1_ref(0, 0), ldda1,
-                    d, ldd);
-//	cudaMemcpy( d, da1_ref(0,0),
-//	            nb*nb*sizeof(cuDoubleComplex),
-//	            cudaMemcpyDeviceToHost );
-
-	/* copy first panel of A2 from device to host: da2 -> a2 */
-//	cublasGetMatrix(m, nb, sizeof(magmaDoubleComplex),
-//                    da2_ref(0, 0), ldda2,
-//                    a2, lda2);
-	cudaMemcpy( a2, da2_ref(0, 0),
-	            m*nb*sizeof(cuDoubleComplex),
-	            cudaMemcpyDeviceToHost );
-
-	/* This is only blocked code for now */
-	for (i = 0; i < n; i += nb) {
-
-		ib = min(n-i, nb);
-		rows = m;
-
-		/* Send the next panel (diagonal block of A1 & block column of A2)
-		   to the CPU (in work_a1 and work_a2) and apply tsmqr update on the
-		   remaining non updated panels */
-		if (i>0) {
-
-			/* copy the diag tile of A1 from device to host: da1 -> d */
-			cublasGetMatrix(ib, ib, sizeof(magmaDoubleComplex),
-		                    da1_ref(i, i), ldda1,
-		                    d, ldd);
-//		    cudaMemcpy( d, da1_ref(i,i),
-//		        ib*ib*sizeof(cuDoubleComplex),
-//		        cudaMemcpyDeviceToHost );
-
-			/* copy panel of A2 from device to host: da2 -> a2 */
-			cublasGetMatrix(rows, ib, sizeof(magmaDoubleComplex),
-                            da2_ref(0, i), ldda2,
-                            a2, lda2);
-//            cudaMemcpy( a2, da2_ref(0,i),
-//                rows*ib*sizeof(cuDoubleComplex),
-//                cudaMemcpyDeviceToHost );
-
-			/* Apply H' to A(i:m,i+2*ib:n) from the left */
-			cols = n-old_i-2*old_ib;
-			if (cols > 0){
-				magma_ztsmqr_gpu( MagmaLeft, MagmaConjTrans,
-                                  old_ib, cols, rows, cols, old_ib, old_ib,
-								  da1_ref(old_i, old_i+2*old_ib), ldda1,
-								  da2_ref(0, old_i+2*old_ib), ldda2,
-								  da2_ref(0, old_i), ldda2,
-								  dt_ref(0, old_i), lddt,
-								  dwork, old_ib,
-								  dwork + old_ib * cols, rows,
-								  stream );
-			}
-
-		}
-
-		/* compute QR factorization of the panel of A2 rows x ib */
-		CORE_ztsqrt(rows, ib, ib,
-                    (double _Complex*) d, ldd,
-                    (double _Complex*) a2, lda2,
-                    (double _Complex*) t, ldt,
-                    (double _Complex*) tau,
-                    (double _Complex*) hwork);
-
-		/* Send the panel from A2 back to the GPU */
-		cublasSetMatrix(rows, ib, sizeof(magmaDoubleComplex),
-                        a2, lda2,
-                        da2_ref(0, i), ldda2);
-//        cudaMemcpy( da2_ref(0,i), a2,
-//            rows*ib*sizeof(cuDoubleComplex),
-//            cudaMemcpyHostToDevice );
-
-		/* Send the triangular factor T from hwork to the GPU */
-		cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex),
-                        t, ldt,
-                        dt_ref(0, i), lddt);
-//        cudaMemcpy( dt_ref(0,i), t,
-//            ib*ib*sizeof(cuDoubleComplex),
-//            cudaMemcpyHostToDevice );
-
-		/* get back the diag tile in A1 from host to device: d -> da1 */
-		cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex),
-                        d, ldd,
-                        da1_ref(i, i), ldda1);
-//        cudaMemcpy( da1_ref(i, i), d,
-//            ib*ib*sizeof(cuDoubleComplex),
-//            cudaMemcpyHostToDevice );
-
-		/* tsmqr update on one panel forward (look ahead 1) */
-		if (i + ib < n) {
-
-			if (i+2*ib < n){
-				cols = ib;
-			}
-			else{
-				cols = n-i-ib;
-			}
-
-			/* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
-		    magma_ztsmqr_gpu( MagmaLeft, MagmaConjTrans,
-                              ib, cols, rows, cols, ib, ib,
-                              da1_ref(i, i+ib), ldda1,
-                              da2_ref(0, i+ib), ldda2,
-                              da2_ref(0, i), ldda2,
-                              dt_ref(0, i), lddt,
-                              dwork, ib,
-                              dwork + ib * cols, rows,
-                              stream );
-			old_i = i;
-			old_ib = ib;
-		}
-	}
-
-#undef da1_ref
-#undef da2_ref
-#undef a2_ref
-#undef t_ref
-#undef dt_ref
-#undef d_ref
-
-	return MAGMA_SUCCESS;
-
-} /* magma_ztsqrt_gpu */
-
 static void cl_ztsqrt_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_starpu_ws_t *h_work;
@@ -366,18 +190,16 @@ static void cl_ztsqrt_cuda_func(void *descr[], void *cl_arg)
     h_D   = h_W   + ib*n;
 
     stream = starpu_cuda_get_local_stream();
-    magma_ztsqrt2_gpu( m, n, ib,
-                       d_A1, lda1, d_A2, lda2,
-                       h_A2, lda2,
-                       d_T, ldt, h_T, ib,
-                       d_D, h_D, ib, h_TAU,
-                       h_W, d_W, stream);
-
+    CUDA_ztsqrt(
+            m, n, ib,
+            d_A1, lda1, d_A2, lda2,
+            h_A2, lda2,
+            d_T, ldt, h_T, ib,
+            d_D, h_D, ib, h_TAU,
+            h_W, d_W, stream);
     cudaThreadSynchronize();
 }
-
 #endif
-
 /*
  * Codelet definition
  */
diff --git a/runtime/starpu/codelets/codelet_ztstrf.c b/runtime/starpu/codelets/codelet_ztstrf.c
index e76985b02..cc02ea0f1 100644
--- a/runtime/starpu/codelets/codelet_ztstrf.c
+++ b/runtime/starpu/codelets/codelet_ztstrf.c
@@ -247,13 +247,14 @@ static void cl_ztstrf_cuda_func(void *descr[], void *cl_arg)
     /* Initialize L to 0 */
     memset(hL, 0, ldl*nb*sizeof(cuDoubleComplex));
 
-    magma_ztstrf_gpu( MagmaColMajor, m, n, ib, nb,
-                      hU, ldu, dU, ldu,
-                      hA, lda, dA, lda,
-                      hL, ldl, dL, ldl,
-                      ipiv,
-                      hw, ldwork, dw, lda,
-                      &info );
+    CUDA_ztstrf(
+            MagmaColMajor, m, n, ib, nb,
+            hU, ldu, dU, ldu,
+            hA, lda, dA, lda,
+            hL, ldl, dL, ldl,
+            ipiv,
+            hw, ldwork, dw, lda,
+            &info );
 
     cudaThreadSynchronize();
 }
diff --git a/runtime/starpu/codelets/codelet_zunmlq.c b/runtime/starpu/codelets/codelet_zunmlq.c
index 7dbc938e2..c612cd15c 100644
--- a/runtime/starpu/codelets/codelet_zunmlq.c
+++ b/runtime/starpu/codelets/codelet_zunmlq.c
@@ -185,114 +185,6 @@ static void cl_zunmlq_cpu_func(void *descr[], void *cl_arg)
 }
 
 #if defined(CHAMELEON_USE_MAGMA)
-
-magma_int_t
-magma_zunmlqt_gpu( magma_side_t side, magma_trans_t trans,
-                   magma_int_t M, magma_int_t N, magma_int_t K, magma_int_t IB,
-                   const magmaDoubleComplex *A,    magma_int_t LDA,
-                   const magmaDoubleComplex *T,    magma_int_t LDT,
-                   magmaDoubleComplex *C,    magma_int_t LDC,
-                   magmaDoubleComplex *WORK, magma_int_t LDWORK )
-{
-    int i, kb;
-    int i1, i3;
-    int nq, nw;
-    int ic = 0;
-    int jc = 0;
-    int ni = N;
-    int mi = M;
-
-    /* Check input arguments */
-    if ((side != MagmaLeft) && (side != MagmaRight)) {
-        return -1;
-    }
-    /*
-     * NQ is the order of Q and NW is the minimum dimension of WORK
-     */
-    if (side == MagmaLeft) {
-        nq = M;
-        nw = N;
-    }
-    else {
-        nq = N;
-        nw = M;
-    }
-
-    if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) {
-        return -2;
-    }
-    if (M < 0) {
-        return -3;
-    }
-    if (N < 0) {
-        return -4;
-    }
-    if ((K < 0) || (K > nq)) {
-        return -5;
-    }
-    if ((IB < 0) || ( (IB == 0) && ((M > 0) && (N > 0)) )) {
-        return -6;
-    }
-    if ((LDA < max(1,K)) && (K > 0)) {
-        return -8;
-    }
-    if ((LDC < max(1,M)) && (M > 0)) {
-        return -12;
-    }
-    if ((LDWORK < max(1,nw)) && (nw > 0)) {
-        return -14;
-    }
-
-    /* Quick return */
-    if ((M == 0) || (N == 0) || (K == 0))
-        return MAGMA_SUCCESS;
-
-    if (((side == MagmaLeft) && (trans == MagmaNoTrans))
-        || ((side == MagmaRight) && (trans != MagmaNoTrans))) {
-        i1 = 0;
-        i3 = IB;
-    }
-    else {
-        i1 = ( ( K-1 ) / IB )*IB;
-        i3 = -IB;
-    }
-
-    if( trans == MorseNoTrans) {
-        trans = MorseConjTrans;
-    }
-    else {
-        trans = MorseNoTrans;
-    }
-
-    for(i = i1; (i >- 1) && (i < K); i+=i3 ) {
-        kb = min(IB, K-i);
-
-        if (side == MagmaLeft) {
-            /*
-             * H or H' is applied to C(i:m,1:n)
-             */
-            mi = M - i;
-            ic = i;
-        }
-        else {
-            /*
-             * H or H' is applied to C(1:m,i:n)
-             */
-            ni = N - i;
-            jc = i;
-        }
-
-        magma_zlarfb_gpu( side, trans, MagmaForward, MagmaRowwise,
-                          mi, ni, kb,
-                          A + LDA * i  + i,  LDA,
-                          T + LDT * i,       LDT,
-                          C + LDC * jc + ic, LDC,
-                          WORK, LDWORK);
-    }
-    return MAGMA_SUCCESS;
-}
-
-
 static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_starpu_ws_t *d_work;
@@ -314,8 +206,9 @@ static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg)
     C    = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
     WORK = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */
 
-    magma_zunmlqt_gpu( side, trans, m, n, k, ib,
-                       A, lda, T, ldt, C, ldc, WORK, ldwork );
+    CUDA_zunmlqt(
+            side, trans, m, n, k, ib,
+            A, lda, T, ldt, C, ldc, WORK, ldwork );
 
     cudaThreadSynchronize();
 }
diff --git a/runtime/starpu/codelets/codelet_zunmqr.c b/runtime/starpu/codelets/codelet_zunmqr.c
index dff4ca7f9..0074e5c38 100644
--- a/runtime/starpu/codelets/codelet_zunmqr.c
+++ b/runtime/starpu/codelets/codelet_zunmqr.c
@@ -209,108 +209,6 @@ static void cl_zunmqr_cpu_func(void *descr[], void *cl_arg)
 }
 
 #if defined(CHAMELEON_USE_MAGMA)
-
-magma_int_t
-magma_zunmqrt_gpu( magma_side_t side, magma_trans_t trans,
-                   magma_int_t M, magma_int_t N, magma_int_t K, magma_int_t IB,
-                   const magmaDoubleComplex *A,    magma_int_t LDA,
-                   const magmaDoubleComplex *T,    magma_int_t LDT,
-                   magmaDoubleComplex *C,    magma_int_t LDC,
-                   magmaDoubleComplex *WORK, magma_int_t LDWORK )
-{
-    int i, kb;
-    int i1, i3;
-    int nq, nw;
-    int ic = 0;
-    int jc = 0;
-    int ni = N;
-    int mi = M;
-
-    /* Check input arguments */
-    if ((side != MagmaLeft) && (side != MagmaRight)) {
-        return -1;
-    }
-    /*
-     * NQ is the order of Q and NW is the minimum dimension of WORK
-     */
-    if (side == MagmaLeft) {
-        nq = M;
-        nw = N;
-    }
-    else {
-        nq = N;
-        nw = M;
-    }
-
-    if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) {
-        return -2;
-    }
-    if (M < 0) {
-        return -3;
-    }
-    if (N < 0) {
-        return -4;
-    }
-    if ((K < 0) || (K > nq)) {
-        return -5;
-    }
-    if ((IB < 0) || ( (IB == 0) && ((M > 0) && (N > 0)) )) {
-        return -6;
-    }
-    if ((LDA < max(1,nq)) && (nq > 0)) {
-        return -8;
-    }
-    if ((LDC < max(1,M)) && (M > 0)) {
-        return -12;
-    }
-    if ((LDWORK < max(1,nw)) && (nw > 0)) {
-        return -14;
-    }
-
-    /* Quick return */
-    if ((M == 0) || (N == 0) || (K == 0))
-        return MAGMA_SUCCESS;
-
-    if (((side == MagmaLeft) && (trans != MagmaNoTrans))
-        || ((side == MagmaRight) && (trans == MagmaNoTrans))) {
-        i1 = 0;
-        i3 = IB;
-    }
-    else {
-        i1 = ( ( K-1 ) / IB )*IB;
-        i3 = -IB;
-    }
-
-    for(i = i1; (i >- 1) && (i < K); i+=i3 ) {
-        kb = min(IB, K-i);
-
-        if (side == MagmaLeft) {
-            /*
-             * H or H' is applied to C(i:m,1:n)
-             */
-            mi = M - i;
-            ic = i;
-        }
-        else {
-            /*
-             * H or H' is applied to C(1:m,i:n)
-             */
-            ni = N - i;
-            jc = i;
-        }
-
-        magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
-                          mi, ni, kb,
-                          A + LDA * i  + i,  LDA,
-                          T + LDT * i,       LDT,
-                          C + LDC * jc + ic, LDC,
-                          WORK, LDWORK);
-    }
-
-    return MAGMA_SUCCESS;
-}
-
-
 static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_starpu_ws_t *d_work;
@@ -332,8 +230,9 @@ static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg)
     C    = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
     WORK = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */
 
-    magma_zunmqrt_gpu( side, trans, m, n, k, ib,
-                       A, lda, T, ldt, C, ldc, WORK, ldwork );
+    CUDA_zunmqrt(
+            side, trans, m, n, k, ib,
+            A, lda, T, ldt, C, ldc, WORK, ldwork );
 
     cudaThreadSynchronize();
 }
diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt
index bb3c5b717..e8316ef5c 100644
--- a/testing/CMakeLists.txt
+++ b/testing/CMakeLists.txt
@@ -123,6 +123,11 @@ endif()
 
 if(NOT CHAMELEON_SIMULATION)
 
+    if(CHAMELEON_USE_CUDA OR CHAMELEON_USE_MAGMA)
+        list(APPEND libs_for_tests
+        cudablas
+        )
+    endif()
     if(CHAMELEON_USE_CUDA)
         list(APPEND libs_for_tests
         ${CUDA_LIBRARIES}
diff --git a/timing/CMakeLists.txt b/timing/CMakeLists.txt
index 9d8727434..c15b1e223 100644
--- a/timing/CMakeLists.txt
+++ b/timing/CMakeLists.txt
@@ -169,6 +169,11 @@ endif()
 
 if(NOT CHAMELEON_SIMULATION)
 
+    if(CHAMELEON_USE_CUDA OR CHAMELEON_USE_MAGMA)
+        list(APPEND libs_for_timings
+        cudablas
+        )
+    endif()
     if(CHAMELEON_USE_CUDA)
         list(APPEND libs_for_timings
         ${CUDA_LIBRARIES}
@@ -185,7 +190,7 @@ if(NOT CHAMELEON_SIMULATION)
     list(APPEND libs_for_timings
     coreblas
     ${LAPACKE_LIBRARIES}
-    ${CBLAS_LIBRARIES}    
+    ${CBLAS_LIBRARIES}
     ${LAPACK_SEQ_LIBRARIES}
     ${BLAS_SEQ_LIBRARIES}
     ${HWLOC_LIBRARIES}
-- 
GitLab