From 32de9c8d2443ffa5d778921a5f8f46a974a87e40 Mon Sep 17 00:00:00 2001
From: Florent Pruvost <florent.pruvost@inria.fr>
Date: Wed, 13 Apr 2016 15:46:18 +0000
Subject: [PATCH] fix bug when minMT=1 in pzgelqf

---
 compute/pzgelqf.c                        |   2 +-
 coreblas/compute/core_zgelqt.c           |  16 ++
 cudablas/compute/cuda_zgelqt.c           | 199 ++++++++++++-----------
 cudablas/compute/cuda_ztslqt.c           |  30 ++--
 runtime/starpu/codelets/codelet_ztslqt.c |   4 +-
 timing/CMakeLists.txt                    |   2 +
 timing/CTestLists.cmake                  |   1 +
 timing/time_zgelqf.c                     |  80 +++++++++
 timing/time_zgelqf_tile.c                |  81 +++++++++
 timing/time_zgeqrf_tile.c                |   9 +
 10 files changed, 308 insertions(+), 116 deletions(-)
 create mode 100644 timing/time_zgelqf.c
 create mode 100644 timing/time_zgelqf_tile.c

diff --git a/compute/pzgelqf.c b/compute/pzgelqf.c
index 4609398fb..f8969bc0b 100644
--- a/compute/pzgelqf.c
+++ b/compute/pzgelqf.c
@@ -104,7 +104,7 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T,
 
     /* necessary to avoid dependencies between tslqt and unmlq tasks regarding the diag tile */
     DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-    morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, (minMT-1)*A->mb, A->nb, 0, 0, (minMT-1)*A->mb, A->nb, A->p, A->q);
+    morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb, A->p, A->q);
 
     for (k = 0; k < min(A->mt, A->nt); k++) {
         tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
diff --git a/coreblas/compute/core_zgelqt.c b/coreblas/compute/core_zgelqt.c
index 8268be841..c7ca8c12d 100644
--- a/coreblas/compute/core_zgelqt.c
+++ b/coreblas/compute/core_zgelqt.c
@@ -156,6 +156,22 @@ int CORE_zgelqt(int M, int N, int IB,
                 WORK, M-i-sb);
         }
     }
+    // print the matrices
+//    int i1, i2;
+//    printf("\nV \n");
+//    for (i1=0; i1<M; i1++){
+//        for (i2=0; i2<N; i2++){
+//            printf("%f ", A[i1+i2*LDA]);
+//        }
+//        printf("\n");
+//    }
+//    printf("\nT \n");
+//    for (i1=0; i1<IB; i1++){
+//        for (i2=0; i2<N; i2++){
+//            printf("%f ", T[i1+i2*LDT]);
+//        }
+//        printf("\n");
+//    }
     return MORSE_SUCCESS;
 }
 
diff --git a/cudablas/compute/cuda_zgelqt.c b/cudablas/compute/cuda_zgelqt.c
index 17f916843..ff18cc171 100644
--- a/cudablas/compute/cuda_zgelqt.c
+++ b/cudablas/compute/cuda_zgelqt.c
@@ -43,7 +43,7 @@ int CUDA_zgelqt(
 #define dt_ref(a_1,a_2) ( dt+(a_2)*(lddt) + (a_1))
 #define t_ref(a_1,a_2)  ( t+(a_2)*(ldt) + (a_1))
 
-    int i, k, ib, lddwork, old_i, old_ib, rows, cols;
+    int i, k, ib, old_i, old_ib, rows, cols;
     double _Complex one=1.;
 
     if (m < 0) {
@@ -60,107 +60,110 @@ int CUDA_zgelqt(
     return MAGMA_SUCCESS;
     }
 
-    lddwork= m;
-
     /* lower parts of little T must be zero: memset to 0 for simplicity */
     memset(t_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex));
-    cudaMemset(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex));
-
-    /* copy first panel of A on the host */
-    cublasGetMatrix(min(m, nb), n, sizeof(magmaDoubleComplex),
-              da_ref(0, 0), ldda,
-              v, ldv);
-
-    /* Use blocked code initially */
-    for (i = 0; i < k; i += nb) {
-
-    ib = min(k-i, nb);
-    if (i+nb >= m) ib = min(m-i, nb);
-    cols = n-i;
-
-    if (i > 0){
-
-      /* copy panel of A from device to host */
-      cublasGetMatrix(ib, n, sizeof(magmaDoubleComplex),
-                      da_ref(i, 0), ldda,
-                      v, ldv);
-
-      /* Apply H' to A(i+2*ib:m, i:n) from the right */
-      rows = m-old_i-2*old_ib;
-      if (rows > 0){
-          magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
-                            rows, n-old_i, old_ib,
-                            da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt,
-                            da_ref(old_i+2*old_ib, old_i), ldda,
-                            dwork, lddwork);
-      }
-
-      /* copy the lower diag tile into d_A */
-      CUDA_zgemerge(MagmaRight, MagmaUnit, old_ib, old_ib,
-                    dd, ldd, da_ref(old_i, old_i), ldda, stream);
-
-    }
-
-    /* Form the triangular factor of the block reflector on the host
-    H = H'(i+ib-1) . . . H(i+1) H(i) */
-    CORE_zgelqt(ib, cols, ib,
-              (double _Complex*) v_ref(0,i), ldv,
-              (double _Complex*) t_ref(0,0), ldt,
-              (double _Complex*) tau+i,
-              (double _Complex*) hwork);
-
-    if ( i + ib < m ){
-      /* put 0s in the lower triangular part of a panel (and 1s on the
-       diagonal); copy the lower triangular in d */
-      CORE_zgesplit(MorseRight, MorseUnit, ib, min(cols,ib),
-                    (double _Complex*) v_ref(0,i), ldv,
-                    (double _Complex*) d, ldd);
-
-      /* copy from host to device a tile diag */
-      cublasSetMatrix( ib, min(cols,ib), sizeof(magmaDoubleComplex),
-                       d, ldd, dd, ldd );
-    }
-
-    /* Send the triangular factor T to the GPU */
-    cublasSetMatrix( ib, ib, sizeof(magmaDoubleComplex),
-                   t_ref(0,0), ldt, dt_ref(0,i), lddt );
-
-    /* A panel (with zeros in lower tri of its diag) is ready to be used
-    in input of zlarfb_gpu: we send the panel to the gpu */
-    cublasSetMatrix( ib, cols, sizeof(magmaDoubleComplex),
-                   v_ref(0,i), ldv, da_ref(i,i), ldda );
-
-    if (i + ib < m) {
-
-      if (i+2*ib < m){
-          rows = ib;
-      }
-      else{
-          rows = m-i-ib;
-      }
-      /* Apply H' to A(i+ib:i+2*ib, i:n) from the right */
-      magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
-                        rows, cols, ib, da_ref(i,i), ldda, dt_ref(0,i),
-                        lddt, da_ref(i+ib,i), ldda, dwork, lddwork);
-      cudaThreadSynchronize();
-      old_i = i;
-      old_ib = ib;
-      if (i+nb >= k){
-          /* Apply H' to A(i+2*ib:m, i:n) from the right */
-          rows = m-old_i-2*old_ib;
-          if (rows > 0){
-              magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
-                                rows, cols, old_ib,
-                                da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt,
-                                da_ref(old_i+2*old_ib, old_i), ldda,
-                                dwork, lddwork);
-          }
-          /* copy the upper diag tile into d_A */
-          CUDA_zgemerge(MagmaRight, MagmaUnit, old_ib, old_ib,
-                        dd, ldd, da_ref(old_i, old_i), ldda, stream);
-      }
+    cudaMemsetAsync(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex), stream);
+
+    if ( (nb > 1) && (nb < k) ) {
+        /* Use blocked code initially */
+        old_i = 0; old_ib = nb;
+        for (i = 0; i < k-nb; i += nb) {
+
+            ib = min(k-i, nb);
+            cols = n-i;
+            magma_zgetmatrix_async( ib, cols,
+                                    da_ref(i,i), ldda,
+                                    v_ref(0,i), ib, stream );
+
+            if (i>0){
+                /* Apply H' to A(i+2*ib:m, i:n) from the right */
+                rows = m-old_i-2*old_ib;
+                magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
+                                  rows, n-old_i, old_ib,
+                                  da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt,
+                                  da_ref(old_i+2*old_ib, old_i), ldda,
+                                  dwork, rows);
+
+                /* store the diagonal */
+                magma_zsetmatrix_async( old_ib, old_ib,
+                                        d,                    old_ib,
+                                        da_ref(old_i, old_i), ldda, stream );
+            }
+
+            magma_queue_sync( stream );
+            /* Form the triangular factor of the block reflector on the host
+            H = H'(i+ib-1) . . . H(i+1) H(i) */
+            CORE_zgelqt(ib, cols, ib,
+                        (double _Complex*) v_ref(0,i), ib,
+                        (double _Complex*) t_ref(0,0), ib,
+                        (double _Complex*) tau+i,
+                        (double _Complex*) hwork);
+
+            /* put 0s in the lower triangular part of a panel (and 1s on the
+              diagonal); copy the lower triangular in d */
+            CORE_zgesplit(MorseRight, MorseUnit, ib, min(ib,cols),
+                          (double _Complex*) v_ref(0,i), ib,
+                          (double _Complex*) d, ib);
+
+            /* send the custom panel to the GPU */
+            magma_zsetmatrix( ib, cols,
+                              v_ref(0, i), ib,
+                              da_ref(i, i), ldda );
+
+            if ( i + ib < n ){
+                /* Send the triangular factor T to the GPU */
+                magma_zsetmatrix( ib, ib,
+                                  t_ref(0, 0), ib,
+                                  dt_ref(0, i), lddt );
+
+                if (i+nb < k-nb) {
+                    /* Apply H' to A(i+ib:i+2*ib, i:n) from the right */
+                    magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
+                                      ib, cols, ib,
+                                      da_ref(i,   i), ldda, dt_ref(0,i), lddt,
+                                      da_ref(i+ib,i), ldda, dwork, ib);
+                }
+                else {
+                    rows = m-i-ib;
+                    magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
+                                      rows, cols, ib,
+                                      da_ref(i,   i), ldda, dt_ref(0,i), lddt,
+                                      da_ref(i+ib,i), ldda, dwork, rows);
+                    cudaThreadSynchronize();
+                    /* Fix the diagonal block */
+                    magma_zsetmatrix_async( ib, ib,
+                                            d,            ib,
+                                            da_ref(i, i), ldda,
+                                            stream );
+                }
+                old_i  = i;
+                old_ib = ib;
+            }
+        }
+    } else {
+        i = 0;
     }
 
+    /* Use unblocked code to factor the last or only block. */
+    if (i < k) {
+        ib   = m-i;
+        cols = n-i;
+        magma_zgetmatrix( ib, cols,
+                          da_ref(i,i), ldda,
+                          v_ref(0,i), ib );
+        CORE_zgelqt(ib, cols, ib,
+                    (double _Complex*) v_ref(0,i), ib,
+                    (double _Complex*) t_ref(0,0), ib,
+                    (double _Complex*) tau+i,
+                    (double _Complex*) hwork);
+        /* send the last factorized panel to the GPU */
+        magma_zsetmatrix( ib, cols,
+                          v_ref(0, i), ib,
+                          da_ref(i, i), ldda );
+        /* Send the triangular factor T to the GPU */
+        magma_zsetmatrix( ib, cols,
+                          t_ref(0, 0), ib,
+                          dt_ref(0, i), lddt );
     }
 
 #undef da_ref
diff --git a/cudablas/compute/cuda_ztslqt.c b/cudablas/compute/cuda_ztslqt.c
index aeba03dce..a129d69ea 100644
--- a/cudablas/compute/cuda_ztslqt.c
+++ b/cudablas/compute/cuda_ztslqt.c
@@ -24,7 +24,7 @@
  **/
 #include "cudablas/include/cudablas.h"
 
-#if defined(CHAMELEON_USE_MAGMA)
+#if defined(CHAMELEON_USE_MAGMA) && 0
 int CUDA_ztslqt(
         magma_int_t m, magma_int_t n, magma_int_t nb,
         magmaDoubleComplex *da1, magma_int_t ldda1,
@@ -70,16 +70,16 @@ int CUDA_ztslqt(
     memset(t, 0, nb*n*sizeof(magmaDoubleComplex));
     cudaMemset(dt, 0, nb*n*sizeof(magmaDoubleComplex));
 
-    k = min(m, nb); // m can be lower than IB
+    //k = min(m, nb); // m can be lower than IB
     /* copy the first diag tile of A1 from device to host: da1 -> d */
-    cublasGetMatrix(k, k, sizeof(magmaDoubleComplex),
+    cublasGetMatrix(nb, nb, sizeof(magmaDoubleComplex),
                     da1_ref(0, 0), ldda1,
-                    d, ldd);
+                    d, nb);
 
     /* copy first panel of A2 from device to host: da2 -> a2 */
-    cublasGetMatrix(k, n, sizeof(magmaDoubleComplex),
+    cublasGetMatrix(nb, n, sizeof(magmaDoubleComplex),
                     da2_ref(0, 0), ldda2,
-                    a2, lda2);
+                    a2, nb);
 
     /* This is only blocked code for now */
     for (i = 0; i < m; i += nb) {
@@ -95,12 +95,12 @@ int CUDA_ztslqt(
             /* copy the diag tile of A1 from device to host: da1 -> d */
             cublasGetMatrix(ib, ib, sizeof(magmaDoubleComplex),
                             da1_ref(i, i), ldda1,
-                            d, ldd);
+                            d, ib);
 
             /* copy panel of A2 from device to host: da2 -> a2 */
             cublasGetMatrix(ib, cols, sizeof(magmaDoubleComplex),
                             da2_ref(i, 0), ldda2,
-                            a2, lda2);
+                            a2, ib);
 
             /* Apply H' to A(i+2*ib:m,i:n) from the left */
             rows = m-old_i-2*old_ib;
@@ -121,25 +121,25 @@ int CUDA_ztslqt(
 
         /* compute LQ factorization of the panel of A2 ib x cols */
         CORE_ztslqt(ib, cols, ib,
-                    (double _Complex*) d, ldd,
-                    (double _Complex*) a2, lda2,
-                    (double _Complex*) t, ldt,
+                    (double _Complex*) d, ib,
+                    (double _Complex*) a2, ib,
+                    (double _Complex*) t, ib,
                     (double _Complex*) tau,
                     (double _Complex*) hwork);
 
         /* Send the panel from A2 back to the GPU */
         cublasSetMatrix(ib, cols, sizeof(magmaDoubleComplex),
-                        a2, lda2,
+                        a2, ib,
                         da2_ref(i, 0), ldda2);
 
         /* Send the triangular factor T from hwork to the GPU */
-        cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex),
-                        t, ldt,
+        cublasSetMatrix(ib, cols, sizeof(magmaDoubleComplex),
+                        t, ib,
                         dt_ref(0, i), lddt);
 
         /* get back the diag tile in A1 from host to device: d -> da1 */
         cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex),
-                        d, ldd,
+                        d, ib,
                         da1_ref(i, i), ldda1);
 
         /* tsmlq update on one panel forward (look ahead 1) */
diff --git a/runtime/starpu/codelets/codelet_ztslqt.c b/runtime/starpu/codelets/codelet_ztslqt.c
index 264d50c51..4cf727d50 100644
--- a/runtime/starpu/codelets/codelet_ztslqt.c
+++ b/runtime/starpu/codelets/codelet_ztslqt.c
@@ -170,7 +170,7 @@ static void cl_ztslqt_cpu_func(void *descr[], void *cl_arg)
     CORE_ztslqt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
 }
 
-#if defined(CHAMELEON_USE_MAGMA)
+#if defined(CHAMELEON_USE_MAGMA) && 0
 static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_starpu_ws_t *h_work;
@@ -216,7 +216,7 @@ static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg)
 /*
  * Codelet definition
  */
-#if defined(CHAMELEON_USE_MAGMA) || defined(CHAMELEON_SIMULATION)
+#if (defined(CHAMELEON_USE_MAGMA) && 0) || defined(CHAMELEON_SIMULATION)
 CODELETS(ztslqt, 4, cl_ztslqt_cpu_func, cl_ztslqt_cuda_func, 0)
 #else
 CODELETS_CPU(ztslqt, 4, cl_ztslqt_cpu_func)
diff --git a/timing/CMakeLists.txt b/timing/CMakeLists.txt
index 1183d0348..b098bda36 100644
--- a/timing/CMakeLists.txt
+++ b/timing/CMakeLists.txt
@@ -83,6 +83,8 @@ set(ZSRC
     time_zgels_tile.c
     time_zgeqrf.c
     time_zgeqrf_tile.c
+    time_zgelqf.c
+    time_zgelqf_tile.c
     time_zgeqrs_tile.c
     time_zgetrf_incpiv.c
     time_zgetrf_incpiv_tile.c
diff --git a/timing/CTestLists.cmake b/timing/CTestLists.cmake
index dc2093c52..56dc2d7f8 100644
--- a/timing/CTestLists.cmake
+++ b/timing/CTestLists.cmake
@@ -23,6 +23,7 @@ set(TESTLIST
     getrf_incpiv
     getrf_nopiv
     geqrf
+    gelqf
     posv
     potrf
     potri
diff --git a/timing/time_zgelqf.c b/timing/time_zgelqf.c
new file mode 100644
index 000000000..bf891e5df
--- /dev/null
+++ b/timing/time_zgelqf.c
@@ -0,0 +1,80 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2014 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @precisions normal z -> c d s
+ *
+ **/
+#define _TYPE  MORSE_Complex64_t
+#define _PREC  double
+#define _LAMCH LAPACKE_dlamch_work
+
+#define _NAME  "MORSE_zgelqf"
+/* See Lawn 41 page 120 */
+#define _FMULS FMULS_GELQF(M, N)
+#define _FADDS FADDS_GELQF(M, N)
+
+#include "./timing.c"
+#include "timing_zauxiliary.h"
+
+static int
+RunTest(int *iparam, double *dparam, morse_time_t *t_)
+{
+    MORSE_desc_t *T;
+    PASTE_CODE_IPARAM_LOCALS( iparam );
+
+    if ( M != N && check ) {
+        fprintf(stderr, "Check cannot be perfomed with M != N\n");
+        check = 0;
+    }
+
+    /* Allocate Data */
+    PASTE_CODE_ALLOCATE_MATRIX( A, 1, MORSE_Complex64_t, LDA, N );
+
+    /* Initialize Data */
+    MORSE_zplrnt(M, N, A, LDA, 3456);
+
+    /* Allocate Workspace */
+    MORSE_Alloc_Workspace_zgels(M, N, &T, P, Q);
+    memset(T->mat, 0, (T->llm*T->lln)*sizeof(MorseComplexDouble));
+
+    /* Save AT in lapack layout for check */
+    PASTE_CODE_ALLOCATE_COPY( Acpy, check, MORSE_Complex64_t, A, LDA, N );
+
+    START_TIMING();
+    MORSE_zgelqf( M, N, A, LDA, T );
+    STOP_TIMING();
+
+    /* Check the solution */
+    if ( check )
+    {
+        PASTE_CODE_ALLOCATE_MATRIX( X, 1, MORSE_Complex64_t, LDB, NRHS );
+        MORSE_zplrnt( N, NRHS, X, LDB, 5673 );
+        PASTE_CODE_ALLOCATE_COPY( B, 1, MORSE_Complex64_t, X, LDB, NRHS );
+
+        MORSE_zgelqs(M, N, NRHS, A, LDA, T, X, LDB);
+
+        dparam[IPARAM_RES] = z_check_solution(M, N, NRHS, Acpy, LDA, B, X, LDB,
+                                              &(dparam[IPARAM_ANORM]),
+                                              &(dparam[IPARAM_BNORM]),
+                                              &(dparam[IPARAM_XNORM]));
+
+        free( Acpy );
+        free( B );
+        free( X );
+      }
+
+    /* Free Workspace */
+    MORSE_Dealloc_Workspace( &T );
+    free( A );
+
+    return 0;
+}
diff --git a/timing/time_zgelqf_tile.c b/timing/time_zgelqf_tile.c
new file mode 100644
index 000000000..72efa19e1
--- /dev/null
+++ b/timing/time_zgelqf_tile.c
@@ -0,0 +1,81 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2014 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @precisions normal z -> c d s
+ *
+ **/
+#define _TYPE  MORSE_Complex64_t
+#define _PREC  double
+#define _LAMCH LAPACKE_dlamch_work
+
+#define _NAME  "MORSE_zgelqf_Tile"
+/* See Lawn 41 page 120 */
+#define _FMULS FMULS_GELQF( M, N )
+#define _FADDS FADDS_GELQF( M, N )
+
+#include "./timing.c"
+
+static int
+RunTest(int *iparam, double *dparam, morse_time_t *t_)
+{
+    MORSE_desc_t *descT;
+    PASTE_CODE_IPARAM_LOCALS( iparam );
+
+    /* Allocate Data */
+    PASTE_CODE_ALLOCATE_MATRIX_TILE( descA,  1, MORSE_Complex64_t, MorseComplexDouble, LDA, M, N );
+    PASTE_CODE_ALLOCATE_MATRIX_TILE( descX,  ( check && M == N ), MORSE_Complex64_t, MorseComplexDouble, LDB, M, NRHS );
+    PASTE_CODE_ALLOCATE_MATRIX_TILE( descAC, ( check && M == N ), MORSE_Complex64_t, MorseComplexDouble, LDA, M, N    );
+    PASTE_CODE_ALLOCATE_MATRIX_TILE( descB,  ( check && M == N ), MORSE_Complex64_t, MorseComplexDouble, LDB, M, NRHS );
+
+    MORSE_zplrnt_Tile( descA, 5373 );
+
+    /* Save A for check */
+    if (check == 1 && M == N){
+        MORSE_zlacpy_Tile(MorseUpperLower, descA, descAC);
+    }
+
+    /* Allocate Workspace */
+    MORSE_Alloc_Workspace_zgels_Tile(M, N, &descT, P, Q);
+    memset(descT->mat, 0, (descT->llm*descT->lln)*sizeof(MorseComplexDouble));
+
+    /* MORSE ZGEQRF */
+    START_TIMING();
+    MORSE_zgelqf_Tile( descA, descT );
+    STOP_TIMING();
+
+    /* Check the solution */
+    if ( check && M == N )
+    {
+        /* Initialize and save B */
+        MORSE_zplrnt_Tile( descX, 2264 );
+        MORSE_zlacpy_Tile(MorseUpperLower, descX, descB);
+
+        /* Compute the solution */
+        MORSE_zgelqs_Tile( descA, descT, descX );
+
+        /* Check solution */
+        dparam[IPARAM_ANORM] = MORSE_zlange_Tile(MorseInfNorm, descAC);
+        dparam[IPARAM_BNORM] = MORSE_zlange_Tile(MorseInfNorm, descB);
+        dparam[IPARAM_XNORM] = MORSE_zlange_Tile(MorseInfNorm, descX);
+        MORSE_zgemm_Tile( MorseNoTrans, MorseNoTrans, 1.0, descAC, descX, -1.0, descB );
+        dparam[IPARAM_RES] = MORSE_zlange_Tile(MorseInfNorm, descB);
+        PASTE_CODE_FREE_MATRIX( descX  )
+        PASTE_CODE_FREE_MATRIX( descAC )
+        PASTE_CODE_FREE_MATRIX( descB  )
+    }
+
+    /* Free data */
+    MORSE_Dealloc_Workspace(&descT);
+    PASTE_CODE_FREE_MATRIX( descA )
+
+    return 0;
+}
diff --git a/timing/time_zgeqrf_tile.c b/timing/time_zgeqrf_tile.c
index 9980e56af..af1241da6 100644
--- a/timing/time_zgeqrf_tile.c
+++ b/timing/time_zgeqrf_tile.c
@@ -23,6 +23,7 @@
 #define _FADDS FADDS_GEQRF( M, N )
 
 #include "./timing.c"
+#include <starpu.h>
 
 static int
 RunTest(int *iparam, double *dparam, morse_time_t *t_)
@@ -36,6 +37,14 @@ RunTest(int *iparam, double *dparam, morse_time_t *t_)
     PASTE_CODE_ALLOCATE_MATRIX_TILE( descAC, ( check && M == N ), MORSE_Complex64_t, MorseComplexDouble, LDA, M, N    );
     PASTE_CODE_ALLOCATE_MATRIX_TILE( descB,  ( check && M == N ), MORSE_Complex64_t, MorseComplexDouble, LDB, M, NRHS );
 
+
+//    RUNTIME_zlocality_onerestrict( MORSE_GEQRT, STARPU_CUDA );
+//    RUNTIME_zlocality_onerestrict( MORSE_GEQRT, STARPU_CPU );
+//    RUNTIME_zlocality_onerestrict( MORSE_UNMQR, STARPU_CUDA );
+//    RUNTIME_zlocality_onerestrict( MORSE_TSQRT, STARPU_CPU );
+//    RUNTIME_zlocality_onerestrict( MORSE_TSMQR, STARPU_CUDA );
+//    RUNTIME_zlocality_allrestrict( STARPU_CUDA );
+
     MORSE_zplrnt_Tile( descA, 5373 );
 
     /* Save A for check */
-- 
GitLab