From f676d5243c6f732f8da9cf5b4d55a827bb2509a9 Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Fri, 8 Feb 2019 10:49:26 +0100
Subject: [PATCH] Starpu/alloc on the fly

---
 compute/pzlange.c                           | 45 ++++++++-------
 compute/pzlansy.c                           | 23 ++++----
 compute/pzunmlq_param.c                     |  6 ++
 compute/pzunmqr_param.c                     |  6 ++
 compute/zlaset.c                            |  2 +-
 control/compute_z.h                         |  7 ++-
 control/descriptor.c                        | 34 ++++++-----
 control/workspace.c                         |  6 +-
 coreblas/compute/core_zgelqt.c              |  2 +-
 include/chameleon/tasks_z.h                 |  8 +++
 runtime/openmp/codelets/codelet_zgelqt.c    |  7 ++-
 runtime/openmp/codelets/codelet_zgeqrt.c    |  7 ++-
 runtime/openmp/codelets/codelet_ztplqt.c    |  6 +-
 runtime/openmp/codelets/codelet_ztpqrt.c    |  6 +-
 runtime/parsec/codelets/codelet_zgelqt.c    |  1 +
 runtime/parsec/codelets/codelet_zgeqrt.c    |  1 +
 runtime/parsec/codelets/codelet_ztplqt.c    |  1 +
 runtime/parsec/codelets/codelet_ztpqrt.c    |  1 +
 runtime/quark/codelets/codelet_zgelqt.c     |  1 +
 runtime/quark/codelets/codelet_zgeqrt.c     |  1 +
 runtime/quark/codelets/codelet_ztplqt.c     |  1 +
 runtime/quark/codelets/codelet_ztpqrt.c     |  1 +
 runtime/starpu/codelets/codelet_zgelqt.c    | 61 ++++++++++----------
 runtime/starpu/codelets/codelet_zgemm.c     |  2 +-
 runtime/starpu/codelets/codelet_zgeqrt.c    | 62 ++++++++++-----------
 runtime/starpu/codelets/codelet_zlange.c    |  8 +--
 runtime/starpu/codelets/codelet_ztplqt.c    |  1 +
 runtime/starpu/codelets/codelet_ztpqrt.c    |  1 +
 runtime/starpu/control/runtime_descriptor.c |  2 +-
 runtime/starpu/control/runtime_options.c    |  6 +-
 testing/testing_zgels.c                     |  1 -
 testing/testing_zgels_hqr.c                 |  2 -
 testing/testing_zgels_systolic.c            |  2 -
 timing/time_zgelqf.c                        |  1 -
 timing/time_zgelqf_tile.c                   |  1 -
 timing/time_zgels.c                         |  9 ++-
 timing/time_zgels_tile.c                    |  3 +-
 timing/time_zgeqrf.c                        |  1 -
 timing/time_zgeqrf_hqr.c                    |  2 -
 timing/time_zgeqrf_hqr_tile.c               |  2 -
 timing/time_zgeqrf_tile.c                   |  1 -
 timing/time_zgeqrs_tile.c                   |  1 -
 42 files changed, 189 insertions(+), 153 deletions(-)

diff --git a/compute/pzlange.c b/compute/pzlange.c
index 2bb14d7a2..bb7d6b176 100644
--- a/compute/pzlange.c
+++ b/compute/pzlange.c
@@ -72,11 +72,10 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
             }
 
             if ( m >= P ) {
-                INSERT_TASK_dgeadd(
-                    options,
-                    ChamNoTrans, 1, tempnn, A->nb,
-                    1.0, W( Wcol, m,   n ), 1,
-                    1.0, W( Wcol, m%P, n ), 1 );
+                INSERT_TASK_daxpy(
+                    options, tempnn, 1.,
+                    W( Wcol, m,   n ), 1,
+                    W( Wcol, m%P, n ), 1 );
             }
         }
 
@@ -85,11 +84,10 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
          *  For each i, W(i, n) = reduce( W(0..P-1, n) )
          */
         for(m = 1; m < P; m++) {
-            INSERT_TASK_dgeadd(
-                options,
-                ChamNoTrans, 1, tempnn, A->nb,
-                1.0, W( Wcol, m, n ), 1,
-                1.0, W( Wcol, 0, n ), 1 );
+            INSERT_TASK_daxpy(
+                options, tempnn, 1.,
+                W( Wcol, m, n ), 1,
+                W( Wcol, 0, n ), 1 );
         }
 
         INSERT_TASK_dlange(
@@ -165,11 +163,10 @@ chameleon_pzlange_inf( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
             }
 
             if ( n >= Q ) {
-                INSERT_TASK_dgeadd(
-                    options,
-                    ChamNoTrans, tempmm, 1, A->mb,
-                    1.0, W( Wcol, m, n  ), tempmm,
-                    1.0, W( Wcol, m, n%Q), tempmm );
+                INSERT_TASK_daxpy(
+                    options, tempmm, 1.,
+                    W( Wcol, m, n   ), 1,
+                    W( Wcol, m, n%Q ), 1 );
             }
         }
 
@@ -178,11 +175,10 @@ chameleon_pzlange_inf( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
          *  For each j, W(m, j) = reduce( Wcol(m, 0..Q-1) )
          */
         for(n = 1; n < Q; n++) {
-            INSERT_TASK_dgeadd(
-                options,
-                ChamNoTrans, tempmm, 1, A->mb,
-                1.0, W( Wcol, m, n), tempmm,
-                1.0, W( Wcol, m, 0), tempmm );
+            INSERT_TASK_daxpy(
+                options, tempmm, 1.,
+                W( Wcol, m, n ), 1,
+                W( Wcol, m, 0 ), 1 );
         }
 
         INSERT_TASK_dlange(
@@ -407,11 +403,14 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia
     case ChamOneNorm:
         RUNTIME_options_ws_alloc( &options, 1, 0 );
 
-        chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 1, A->nb, A->nb,
+        chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, 1, A->nb, A->nb,
                              workmt, worknt * A->nb, 0, 0, workmt, worknt * A->nb, A->p, A->q,
                              NULL, NULL, NULL );
         wcol_init = 1;
 
+        /*
+         * Use the global allocator for Welt, otherwise flush may free the data before the result is read.
+         */
         chameleon_desc_init( &Welt, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 1, 1, 1,
                              A->p, worknt, 0, 0, A->p, worknt, A->p, A->q,
                              NULL, NULL, NULL );
@@ -424,7 +423,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia
     case ChamInfNorm:
         RUNTIME_options_ws_alloc( &options, A->mb, 0 );
 
-        chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, A->mb, 1, A->mb,
+        chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, A->mb, 1, A->mb,
                              workmt * A->mb, worknt, 0, 0, workmt * A->mb, worknt, A->p, A->q,
                              NULL, NULL, NULL );
         wcol_init = 1;
@@ -522,7 +521,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia
     }
     CHAMELEON_Desc_Flush( &Welt, sequence );
     CHAMELEON_Desc_Flush( A, sequence );
-    RUNTIME_sequence_wait(chamctxt, sequence);
+    RUNTIME_sequence_wait( chamctxt, sequence );
 
     *result = *((double *)Welt.get_blkaddr( &Welt, A->myrank / A->q, A->myrank % A->q ));
 
diff --git a/compute/pzlansy.c b/compute/pzlansy.c
index 5763df96d..6a9f56fb1 100644
--- a/compute/pzlansy.c
+++ b/compute/pzlansy.c
@@ -81,11 +81,10 @@ chameleon_pzlansy_inf( cham_uplo_t uplo, CHAM_desc_t *A,
         int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb;
 
         for(n = Q; n < NT; n++) {
-            INSERT_TASK_dgeadd(
-                options,
-                ChamNoTrans, tempmm, 1, A->nb,
-                1.0, W( Wcol, m, n  ), tempmm,
-                1.0, W( Wcol, m, n%Q), tempmm );
+            INSERT_TASK_daxpy(
+                options, tempmm, 1.,
+                W( Wcol, m, n   ), 1,
+                W( Wcol, m, n%Q ), 1 );
         }
 
         /**
@@ -93,11 +92,10 @@ chameleon_pzlansy_inf( cham_uplo_t uplo, CHAM_desc_t *A,
          *  For each j, W(m, j) = reduce( Wcol(m, 0..Q-1) )
          */
         for(n = 1; n < Q; n++) {
-            INSERT_TASK_dgeadd(
-                options,
-                ChamNoTrans, tempmm, 1, A->mb,
-                1.0, W( Wcol, m, n), tempmm,
-                1.0, W( Wcol, m, 0), tempmm );
+            INSERT_TASK_daxpy(
+                options, tempmm, 1.,
+                W( Wcol, m, n ), 1,
+                W( Wcol, m, 0 ), 1 );
         }
 
         INSERT_TASK_dlange(
@@ -334,11 +332,14 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra
     case ChamInfNorm:
         RUNTIME_options_ws_alloc( &options, 1, 0 );
 
-        chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, A->mb, 1, A->mb,
+        chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, A->mb, 1, A->mb,
                              workmt * A->mb, worknt, 0, 0, workmt * A->mb, worknt, A->p, A->q,
                              NULL, NULL, NULL );
         wcol_init = 1;
 
+        /*
+         * Use the global allocator for Welt, otherwise flush may free the data before the result is read.
+         */
         chameleon_desc_init( &Welt, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 1, 1, 1,
                              workmt, A->q, 0, 0, workmt, A->q, A->p, A->q,
                              NULL, NULL, NULL );
diff --git a/compute/pzunmlq_param.c b/compute/pzunmlq_param.c
index 48dbc13ea..02b740b3b 100644
--- a/compute/pzunmlq_param.c
+++ b/compute/pzunmlq_param.c
@@ -466,6 +466,12 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
                     RUNTIME_data_flush( sequence, T(k, n) );
                 }
 
+                /* Restore the original location of the tiles */
+                for (m = 0; m < B->mt; m++) {
+                    RUNTIME_data_migrate( sequence, B( m, k ),
+                                          B->get_rankof( B, m, k ) );
+                }
+
                 RUNTIME_iteration_pop(chamctxt);
             }
         }
diff --git a/compute/pzunmqr_param.c b/compute/pzunmqr_param.c
index 772bfdf48..a11c5f247 100644
--- a/compute/pzunmqr_param.c
+++ b/compute/pzunmqr_param.c
@@ -467,6 +467,12 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
                     RUNTIME_data_flush( sequence, T(n, k) );
                 }
 
+                /* Restore the original location of the tiles */
+                for (m = 0; m < B->mt; m++) {
+                    RUNTIME_data_migrate( sequence, B(m, k),
+                                          B->get_rankof( B, m, k ) );
+                }
+
                 RUNTIME_iteration_pop(chamctxt);
             }
         }
diff --git a/compute/zlaset.c b/compute/zlaset.c
index 2b03272b7..7001e66a2 100644
--- a/compute/zlaset.c
+++ b/compute/zlaset.c
@@ -266,7 +266,7 @@ int CHAMELEON_zlaset_Tile_Async( cham_uplo_t uplo,
         return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE);
     }
     /* Check input arguments */
-    if (A->nb != A->mb) {
+    if ( (alpha != beta) && (A->nb != A->mb) ) {
         chameleon_error("CHAMELEON_zlaset_Tile_Async", "only square tiles supported");
         return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE);
     }
diff --git a/control/compute_z.h b/control/compute_z.h
index 3229f1389..3bacf06a1 100644
--- a/control/compute_z.h
+++ b/control/compute_z.h
@@ -134,7 +134,7 @@ void chameleon_pzungqr_param( int genD, int K, const libhqr_tree_t *qrtree,
 static inline int
 chameleon_zdesc_alloc_diag( CHAM_desc_t *descA, int nb, int m, int n, int p, int q ) {
     int diag_m = chameleon_min( m, n );
-    return chameleon_desc_init( descA, CHAMELEON_MAT_ALLOC_GLOBAL,
+    return chameleon_desc_init( descA, CHAMELEON_MAT_ALLOC_TILE,
                                 ChamComplexDouble, nb, nb, nb*nb,
                                 diag_m, nb, 0, 0, diag_m, nb, p, q,
                                 chameleon_getaddr_diag,
@@ -145,7 +145,7 @@ chameleon_zdesc_alloc_diag( CHAM_desc_t *descA, int nb, int m, int n, int p, int
 #define chameleon_zdesc_alloc( descA, mb, nb, lm, ln, i, j, m, n, free) \
     {                                                                   \
         int rc;                                                         \
-        rc = chameleon_desc_init( &(descA), CHAMELEON_MAT_ALLOC_GLOBAL, \
+        rc = chameleon_desc_init( &(descA), CHAMELEON_MAT_ALLOC_TILE, \
                                   ChamComplexDouble, (mb), (nb), ((mb)*(nb)), \
                                   (m), (n), (i), (j), (m), (n), 1, 1,   \
                                   NULL, NULL, NULL );                   \
@@ -174,7 +174,7 @@ chameleon_zlap2tile( CHAM_context_t *chamctxt,
 
     if ( CHAMELEON_TRANSLATION == ChamOutOfPlace ) {
         /* Initialize the tile descriptor */
-        chameleon_desc_init( descAt, CHAMELEON_MAT_ALLOC_GLOBAL, ChamComplexDouble, mb, nb, (mb)*(nb),
+        chameleon_desc_init( descAt, CHAMELEON_MAT_ALLOC_TILE, ChamComplexDouble, mb, nb, (mb)*(nb),
                              lm, ln, 0, 0, m, n, 1, 1,
                              chameleon_getaddr_ccrb, chameleon_getblkldd_ccrb, NULL );
 
@@ -235,6 +235,7 @@ chameleon_ztile2lap( CHAM_context_t *chamctxt, CHAM_desc_t *descAl, CHAM_desc_t
 static inline void
 chameleon_ztile2lap_cleanup( CHAM_context_t *chamctxt, CHAM_desc_t *descAl, CHAM_desc_t *descAt )
 {
+    (void)chamctxt;
     chameleon_desc_destroy( descAl );
     chameleon_desc_destroy( descAt );
 }
diff --git a/control/descriptor.c b/control/descriptor.c
index c27fe5749..06e52cdec 100644
--- a/control/descriptor.c
+++ b/control/descriptor.c
@@ -226,26 +226,32 @@ int chameleon_desc_init( CHAM_desc_t *desc, void *mat,
     /* The matrix is alocated tile by tile with out of core */
     desc->ooc = 0;
 
-    // Matrix address
-    if ( mat == CHAMELEON_MAT_ALLOC_GLOBAL ) {
-        rc = chameleon_desc_mat_alloc( desc );
+    switch ( (intptr_t)mat ) {
+    case (intptr_t)CHAMELEON_MAT_ALLOC_TILE:
+        if ( chamctxt->scheduler == RUNTIME_SCHED_STARPU ) {
+            /* Let's use the allocation on the fly as in OOC */
+            desc->get_blkaddr = chameleon_getaddr_null;
+            desc->mat = NULL;
+            break;
+        }
+        /* Fall through - otherwise we switch back to the full allocation */
 
-        desc->alloc_mat = 1;
-        desc->use_mat   = 1;
-    }
-    else if ( mat == CHAMELEON_MAT_ALLOC_TILE ) {
-        //chameleon_error( "chameleon_desc_init", "CHAMELEON_MAT_ALLOC_TILE is not available yet" );
-        //desc->mat = NULL;
+    case (intptr_t)CHAMELEON_MAT_ALLOC_GLOBAL:
         rc = chameleon_desc_mat_alloc( desc );
+        desc->alloc_mat = 1;
         desc->use_mat   = 1;
+        break;
 
-        desc->alloc_mat = 1;
-    }
-    else if ( mat == CHAMELEON_MAT_OOC ) {
+    case (intptr_t)CHAMELEON_MAT_OOC:
+        if ( chamctxt->scheduler != RUNTIME_SCHED_STARPU ) {
+            chameleon_error("CHAMELEON_Desc_Create", "CHAMELEON Out-of-Core descriptors are supported only with StarPU");
+            return CHAMELEON_ERR_NOT_SUPPORTED;
+        }
         desc->mat = NULL;
         desc->ooc = 1;
-    }
-    else {
+        break;
+
+    default:
         /* memory of the matrix is handled by users */
         desc->mat     = mat;
         desc->use_mat = 1;
diff --git a/control/workspace.c b/control/workspace.c
index e743e33db..8039447fb 100644
--- a/control/workspace.c
+++ b/control/workspace.c
@@ -74,7 +74,8 @@ int chameleon_alloc_ibnb_tile(int M, int N, cham_tasktype_t func, int type, CHAM
     lm = IB * MT;
     ln = NB * NT;
 
-    return CHAMELEON_Desc_Create( desc, NULL, type, IB, NB, IB*NB, lm, ln, 0, 0, lm, ln, p, q );
+    return CHAMELEON_Desc_Create( desc, CHAMELEON_MAT_ALLOC_TILE, type, IB, NB, IB*NB,
+                                  lm, ln, 0, 0, lm, ln, p, q );
 }
 
 /**
@@ -119,7 +120,8 @@ int chameleon_alloc_ipiv(int M, int N, cham_tasktype_t func, int type, CHAM_desc
     /* TODO: Fix the distribution for IPIV */
     *IPIV = (int*)malloc( size );
 
-    return CHAMELEON_Desc_Create( desc, NULL, type, IB, NB, IB*NB, lm, ln, 0, 0, lm, ln, p, q );
+    return CHAMELEON_Desc_Create( desc, CHAMELEON_MAT_ALLOC_TILE, type, IB, NB, IB*NB,
+                                  lm, ln, 0, 0, lm, ln, p, q );
 }
 
 /**
diff --git a/coreblas/compute/core_zgelqt.c b/coreblas/compute/core_zgelqt.c
index cb9f67b49..7a2a74ca0 100644
--- a/coreblas/compute/core_zgelqt.c
+++ b/coreblas/compute/core_zgelqt.c
@@ -67,7 +67,7 @@
  *         The leading dimension of the array A.  LDA >= max(1,M).
  *
  * @param[out] T
- *         The IB-by-N triangular factor T of the block reflector.
+ *         The IB-by-M triangular factor T of the block reflector.
  *         T is upper triangular by block (economic storage);
  *         The rest of the array is not referenced.
  *
diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h
index ca0ae0e21..4fa07c2b4 100644
--- a/include/chameleon/tasks_z.h
+++ b/include/chameleon/tasks_z.h
@@ -483,6 +483,8 @@ INSERT_TASK_ztsmlq( const RUNTIME_option_t *options,
                     const CHAM_desc_t *V, int Vm, int Vn, int ldv,
                     const CHAM_desc_t *T, int Tm, int Tn, int ldt )
 {
+    (void)m1;
+    (void)n1;
     return INSERT_TASK_ztpmlqt( options, side, trans, m2, n2, k, 0, ib, nb,
                                 V, Vm, Vn, ldv, T, Tm, Tn, ldt,
                                 A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 );
@@ -497,6 +499,8 @@ INSERT_TASK_ztsmqr( const RUNTIME_option_t *options,
                     const CHAM_desc_t *V, int Vm, int Vn, int ldv,
                     const CHAM_desc_t *T, int Tm, int Tn, int ldt )
 {
+    (void)m1;
+    (void)n1;
     return INSERT_TASK_ztpmqrt( options, side, trans, m2, n2, k, 0, ib, nb,
                                 V, Vm, Vn, ldv, T, Tm, Tn, ldt,
                                 A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 );
@@ -511,6 +515,8 @@ INSERT_TASK_zttmlq( const RUNTIME_option_t *options,
                     const CHAM_desc_t *V, int Vm, int Vn, int ldv,
                     const CHAM_desc_t *T, int Tm, int Tn, int ldt )
 {
+    (void)m1;
+    (void)n1;
     return INSERT_TASK_ztpmlqt( options, side, trans, m2, n2, k, n2, ib, nb,
                                 V, Vm, Vn, ldv, T, Tm, Tn, ldt,
                                 A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 );
@@ -525,6 +531,8 @@ INSERT_TASK_zttmqr( const RUNTIME_option_t *options,
                     const CHAM_desc_t *V, int Vm, int Vn, int ldv,
                     const CHAM_desc_t *T, int Tm, int Tn, int ldt )
 {
+    (void)m1;
+    (void)n1;
     return INSERT_TASK_ztpmqrt( options, side, trans, m2, n2, k, m2, ib, nb,
                                 V, Vm, Vn, ldv, T, Tm, Tn, ldt,
                                 A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 );
diff --git a/runtime/openmp/codelets/codelet_zgelqt.c b/runtime/openmp/codelets/codelet_zgelqt.c
index 71a9bddce..3341a8f01 100644
--- a/runtime/openmp/codelets/codelet_zgelqt.c
+++ b/runtime/openmp/codelets/codelet_zgelqt.c
@@ -98,10 +98,13 @@ void INSERT_TASK_zgelqt(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     int ws_size = options->ws_wsize;
-#pragma omp task firstprivate(ws_size, m, n, ib, ptrA, lda, ptrT, ldt) depend(inout:ptrA[0]) depend(inout:ptrT[0])
+
+#pragma omp task firstprivate(ws_size, m, n, ib, ptrA, lda, ptrT, ldt) depend(inout:ptrA[0]) depend(out:ptrT[0])
     {
       CHAMELEON_Complex64_t TAU[ws_size];
       CHAMELEON_Complex64_t *work = TAU + chameleon_max( m, n );
-      CORE_zgelqt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work);
+
+      CORE_zlaset( ChamUpperLower, ib, m, 0., 0., ptrT, ldt );
+      CORE_zgelqt( m, n, ib, ptrA, lda, ptrT, ldt, TAU, work );
     }
 }
diff --git a/runtime/openmp/codelets/codelet_zgeqrt.c b/runtime/openmp/codelets/codelet_zgeqrt.c
index a09763773..6428375b2 100644
--- a/runtime/openmp/codelets/codelet_zgeqrt.c
+++ b/runtime/openmp/codelets/codelet_zgeqrt.c
@@ -99,10 +99,13 @@ void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     int ws_size = options->ws_wsize;
-#pragma omp task firstprivate(ws_size, m, n, ib, ptrA, lda, ptrT, ldt) depend(inout:ptrA[0]) depend(inout:ptrT[0])
+
+#pragma omp task firstprivate(ws_size, m, n, ib, ptrA, lda, ptrT, ldt) depend(inout:ptrA[0]) depend(out:ptrT[0])
     {
       CHAMELEON_Complex64_t TAU[ws_size];
       CHAMELEON_Complex64_t *work = TAU + chameleon_max(m, n);
-      CORE_zgeqrt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work);
+
+      CORE_zlaset( ChamUpperLower, ib, n, 0., 0., ptrT, ldt );
+      CORE_zgeqrt( m, n, ib, ptrA, lda, ptrT, ldt, TAU, work );
     }
 }
diff --git a/runtime/openmp/codelets/codelet_ztplqt.c b/runtime/openmp/codelets/codelet_ztplqt.c
index 1acb66066..4bb4f16f0 100644
--- a/runtime/openmp/codelets/codelet_ztplqt.c
+++ b/runtime/openmp/codelets/codelet_ztplqt.c
@@ -31,9 +31,13 @@ INSERT_TASK_ztplqt( const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     int ws_size = options->ws_wsize;
-#pragma omp task firstprivate(ws_size, M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt) depend(inout:ptrA[0], ptrB[0], ptrT[0])
+
+#pragma omp task firstprivate(ws_size, M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt) depend(inout:ptrA[0], ptrB[0]) depend(out:ptrT[0])
     {
       CHAMELEON_Complex64_t work[ws_size];
+
+      CORE_zlaset( ChamUpperLower, ib, M, 0., 0., ptrT, ldt);
+
       CORE_ztplqt( M, N, L, ib,
                    ptrA, lda, ptrB, ldb, ptrT, ldt, work );
     }
diff --git a/runtime/openmp/codelets/codelet_ztpqrt.c b/runtime/openmp/codelets/codelet_ztpqrt.c
index 17917cc7b..7381f6ebd 100644
--- a/runtime/openmp/codelets/codelet_ztpqrt.c
+++ b/runtime/openmp/codelets/codelet_ztpqrt.c
@@ -30,9 +30,13 @@ INSERT_TASK_ztpqrt( const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     int ws_size = options->ws_wsize;
-#pragma omp task firstprivate(ws_size, M, N, L, ib, ptrT, ldt, ptrA, lda, ptrB, ldb) depend(in:ptrT[0]) depend(inout:ptrA[0], ptrB[0])
+
+#pragma omp task firstprivate(ws_size, M, N, L, ib, ptrT, ldt, ptrA, lda, ptrB, ldb) depend(inout:ptrA[0], ptrB[0]) depend(out:ptrT[0])
     {
       CHAMELEON_Complex64_t tmp[ws_size];
+
+      CORE_zlaset( ChamUpperLower, ib, N, 0., 0., ptrT, ldt);
+
       CORE_ztpqrt( M, N, L, ib,
           ptrA, lda, ptrB, ldb, ptrT, ldt, tmp );
     }
diff --git a/runtime/parsec/codelets/codelet_zgelqt.c b/runtime/parsec/codelets/codelet_zgelqt.c
index 6e159eddc..4ef5b5b7a 100644
--- a/runtime/parsec/codelets/codelet_zgelqt.c
+++ b/runtime/parsec/codelets/codelet_zgelqt.c
@@ -98,6 +98,7 @@ CORE_zgelqt_parsec( parsec_execution_stream_t *context,
     parsec_dtd_unpack_args(
         this_task, &m, &n, &ib, &A, &lda, &T, &ldt, &TAU, &WORK );
 
+    CORE_zlaset( ChamUpperLower, ib, m, 0., 0., T, ldt );
     CORE_zgelqt( m, n, ib, A, lda, T, ldt, TAU, WORK );
 
     (void)context;
diff --git a/runtime/parsec/codelets/codelet_zgeqrt.c b/runtime/parsec/codelets/codelet_zgeqrt.c
index d4e9cc529..53ac8ac04 100644
--- a/runtime/parsec/codelets/codelet_zgeqrt.c
+++ b/runtime/parsec/codelets/codelet_zgeqrt.c
@@ -99,6 +99,7 @@ CORE_zgeqrt_parsec ( parsec_execution_stream_t *context,
     parsec_dtd_unpack_args(
         this_task, &m, &n, &ib, &A, &lda, &T, &ldt, &TAU, &WORK );
 
+    CORE_zlaset( ChamUpperLower, ib, n, 0., 0., T, ldt );
     CORE_zgeqrt( m, n, ib, A, lda, T, ldt, TAU, WORK );
 
     (void)context;
diff --git a/runtime/parsec/codelets/codelet_ztplqt.c b/runtime/parsec/codelets/codelet_ztplqt.c
index 3da524a42..96a220925 100644
--- a/runtime/parsec/codelets/codelet_ztplqt.c
+++ b/runtime/parsec/codelets/codelet_ztplqt.c
@@ -40,6 +40,7 @@ CORE_ztplqt_parsec( parsec_execution_stream_t *context,
     parsec_dtd_unpack_args(
         this_task, &M, &N, &L, &ib, &A, &lda, &B, &ldb, &T, &ldt, &WORK );
 
+    CORE_zlaset( ChamUpperLower, ib, M, 0., 0., T, ldt );
     CORE_ztplqt( M, N, L, ib,
                  A, lda, B, ldb, T, ldt, WORK );
 
diff --git a/runtime/parsec/codelets/codelet_ztpqrt.c b/runtime/parsec/codelets/codelet_ztpqrt.c
index ace7a3bf9..f2308aa5b 100644
--- a/runtime/parsec/codelets/codelet_ztpqrt.c
+++ b/runtime/parsec/codelets/codelet_ztpqrt.c
@@ -40,6 +40,7 @@ CORE_ztpqrt_parsec( parsec_execution_stream_t *context,
     parsec_dtd_unpack_args(
         this_task, &M, &N, &L, &ib, &A, &lda, &B, &ldb, &T, &ldt, &WORK );
 
+    CORE_zlaset( ChamUpperLower, ib, N, 0., 0., T, ldt );
     CORE_ztpqrt( M, N, L, ib,
                  A, lda, B, ldb, T, ldt, WORK );
 
diff --git a/runtime/quark/codelets/codelet_zgelqt.c b/runtime/quark/codelets/codelet_zgelqt.c
index 7b1e5a47d..240773c98 100644
--- a/runtime/quark/codelets/codelet_zgelqt.c
+++ b/runtime/quark/codelets/codelet_zgelqt.c
@@ -40,6 +40,7 @@ void CORE_zgelqt_quark(Quark *quark)
     CHAMELEON_Complex64_t *WORK;
 
     quark_unpack_args_9(quark, m, n, ib, A, lda, T, ldt, TAU, WORK);
+    CORE_zlaset( ChamUpperLower, ib, m, 0., 0., T, ldt );
     CORE_zgelqt(m, n, ib, A, lda, T, ldt, TAU, WORK);
 }
 
diff --git a/runtime/quark/codelets/codelet_zgeqrt.c b/runtime/quark/codelets/codelet_zgeqrt.c
index 010a24653..09ed24eef 100644
--- a/runtime/quark/codelets/codelet_zgeqrt.c
+++ b/runtime/quark/codelets/codelet_zgeqrt.c
@@ -40,6 +40,7 @@ void CORE_zgeqrt_quark(Quark *quark)
     CHAMELEON_Complex64_t *WORK;
 
     quark_unpack_args_9(quark, m, n, ib, A, lda, T, ldt, TAU, WORK);
+    CORE_zlaset( ChamUpperLower, ib, n, 0., 0., T, ldt );
     CORE_zgeqrt(m, n, ib, A, lda, T, ldt, TAU, WORK);
 }
 
diff --git a/runtime/quark/codelets/codelet_ztplqt.c b/runtime/quark/codelets/codelet_ztplqt.c
index f0e51b375..98b153433 100644
--- a/runtime/quark/codelets/codelet_ztplqt.c
+++ b/runtime/quark/codelets/codelet_ztplqt.c
@@ -39,6 +39,7 @@ CORE_ztplqt_quark( Quark *quark )
     quark_unpack_args_11( quark, M, N, L, ib,
                           A, lda, B, ldb, T, ldt, WORK );
 
+    CORE_zlaset( ChamUpperLower, ib, M, 0., 0., T, ldt ); /* LQ: clear ib-by-M, as in the OpenMP/PaRSEC ztplqt codelets */
     CORE_ztplqt( M, N, L, ib,
                  A, lda, B, ldb, T, ldt, WORK );
 }
diff --git a/runtime/quark/codelets/codelet_ztpqrt.c b/runtime/quark/codelets/codelet_ztpqrt.c
index 24ce98e12..b508e548c 100644
--- a/runtime/quark/codelets/codelet_ztpqrt.c
+++ b/runtime/quark/codelets/codelet_ztpqrt.c
@@ -39,6 +39,7 @@ CORE_ztpqrt_quark( Quark *quark )
     quark_unpack_args_11( quark, M, N, L, ib,
                           A, lda, B, ldb, T, ldt, WORK );
 
+    CORE_zlaset( ChamUpperLower, ib, N, 0., 0., T, ldt );
     CORE_ztpqrt( M, N, L, ib,
                  A, lda, B, ldb, T, ldt, WORK );
 }
diff --git a/runtime/starpu/codelets/codelet_zgelqt.c b/runtime/starpu/codelets/codelet_zgelqt.c
index 68d435d03..8ffad6e1a 100644
--- a/runtime/starpu/codelets/codelet_zgelqt.c
+++ b/runtime/starpu/codelets/codelet_zgelqt.c
@@ -26,6 +26,36 @@
 #include "chameleon_starpu.h"
 #include "runtime_codelet_z.h"
 
+#if !defined(CHAMELEON_SIMULATION) /* the CPU kernel below is compiled out of simulation builds */
+static void cl_zgelqt_cpu_func(void *descr[], void *cl_arg)
+{ /* CPU implementation registered by CODELETS_CPU(zgelqt, ...): unpacks its arguments and runs CORE_zgelqt */
+    CHAMELEON_starpu_ws_t *h_work; /* unpacked from cl_arg below, not otherwise used in this function */
+    int m;
+    int n;
+    int ib;
+    CHAMELEON_Complex64_t *A;
+    int lda;
+    CHAMELEON_Complex64_t *T;
+    int ldt;
+    CHAMELEON_Complex64_t *TAU, *WORK;
+
+    A   = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
+    T   = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
+    TAU = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); /* max(m,n) + ib*n */
+
+    starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work);
+
+    WORK = TAU + chameleon_max( m, n ); /* WORK starts max(m,n) entries into the buffer that TAU points to */
+    CORE_zlaset( ChamUpperLower, ib, m, 0., 0., T, ldt ); /* set ib-by-m of T with alpha = beta = 0; presumably needed because T may be freshly allocated - TODO confirm */
+    CORE_zgelqt(m, n, ib, A, lda, T, ldt, TAU, WORK);
+}
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
+/*
+ * Codelet definition
+ */
+CODELETS_CPU(zgelqt, 3, cl_zgelqt_cpu_func)
+
 /**
  *
  * @ingroup INSERT_TASK_Complex64_t
@@ -87,7 +117,6 @@
  *          \retval <0 if -i, the i-th argument had an illegal value
  *
  */
-
 void INSERT_TASK_zgelqt(const RUNTIME_option_t *options,
                        int m, int n, int ib, int nb,
                        const CHAM_desc_t *A, int Am, int An, int lda,
@@ -123,33 +152,3 @@ void INSERT_TASK_zgelqt(const RUNTIME_option_t *options,
 #endif
         0);
 }
-
-
-#if !defined(CHAMELEON_SIMULATION)
-static void cl_zgelqt_cpu_func(void *descr[], void *cl_arg)
-{
-    CHAMELEON_starpu_ws_t *h_work;
-    int m;
-    int n;
-    int ib;
-    CHAMELEON_Complex64_t *A;
-    int lda;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *TAU, *WORK;
-
-    A   = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
-    T   = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
-    TAU = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); /* max(m,n) + ib*n */
-
-    starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work);
-
-    WORK = TAU + chameleon_max( m, n );
-    CORE_zgelqt(m, n, ib, A, lda, T, ldt, TAU, WORK);
-}
-#endif /* !defined(CHAMELEON_SIMULATION) */
-
-/*
- * Codelet definition
- */
-CODELETS_CPU(zgelqt, 3, cl_zgelqt_cpu_func)
diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c
index 0719010b6..205da5e35 100644
--- a/runtime/starpu/codelets/codelet_zgemm.c
+++ b/runtime/starpu/codelets/codelet_zgemm.c
@@ -35,7 +35,7 @@ void INSERT_TASK_zgemm(const RUNTIME_option_t *options,
                       cham_trans_t transA, cham_trans_t transB,
                       int m, int n, int k, int nb,
                       CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
-                                               const CHAM_desc_t *B, int Bm, int Bn, int ldb,
+                                                   const CHAM_desc_t *B, int Bm, int Bn, int ldb,
                       CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn, int ldc)
 {
     (void)nb;
diff --git a/runtime/starpu/codelets/codelet_zgeqrt.c b/runtime/starpu/codelets/codelet_zgeqrt.c
index eaa242637..bee5168f9 100644
--- a/runtime/starpu/codelets/codelet_zgeqrt.c
+++ b/runtime/starpu/codelets/codelet_zgeqrt.c
@@ -26,6 +26,37 @@
 #include "chameleon_starpu.h"
 #include "runtime_codelet_z.h"
 
+#if !defined(CHAMELEON_SIMULATION) /* the CPU kernel below is compiled out of simulation builds */
+static void cl_zgeqrt_cpu_func(void *descr[], void *cl_arg)
+{ /* CPU implementation registered by CODELETS_CPU(zgeqrt, ...): unpacks its arguments and runs CORE_zgeqrt */
+    CHAMELEON_starpu_ws_t *h_work; /* unpacked from cl_arg below, not otherwise used in this function */
+    int m;
+    int n;
+    int ib;
+    CHAMELEON_Complex64_t *A;
+    int lda;
+    CHAMELEON_Complex64_t *T;
+    int ldt;
+    CHAMELEON_Complex64_t *TAU, *WORK;
+
+    A   = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
+    T   = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
+    TAU = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); /* max(m,n) + n * ib */
+
+    starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work);
+
+    WORK = TAU + chameleon_max( m, n ); /* WORK starts max(m,n) entries into the buffer that TAU points to */
+
+    CORE_zlaset( ChamUpperLower, ib, n, 0., 0., T, ldt ); /* set ib-by-n of T with alpha = beta = 0; presumably needed because T may be freshly allocated - TODO confirm */
+    CORE_zgeqrt(m, n, ib, A, lda, T, ldt, TAU, WORK);
+}
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
+/*
+ * Codelet definition
+ */
+CODELETS_CPU(zgeqrt, 3, cl_zgeqrt_cpu_func)
+
 /**
  *
  * @ingroup INSERT_TASK_Complex64_t
@@ -88,7 +119,6 @@
  *          \retval <0 if -i, the i-th argument had an illegal value
  *
  */
-
 void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options,
                        int m, int n, int ib, int nb,
                        const CHAM_desc_t *A, int Am, int An, int lda,
@@ -124,33 +154,3 @@ void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options,
 #endif
         0);
 }
-
-
-#if !defined(CHAMELEON_SIMULATION)
-static void cl_zgeqrt_cpu_func(void *descr[], void *cl_arg)
-{
-    CHAMELEON_starpu_ws_t *h_work;
-    int m;
-    int n;
-    int ib;
-    CHAMELEON_Complex64_t *A;
-    int lda;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *TAU, *WORK;
-
-    A   = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
-    T   = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
-    TAU = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); /* max(m,n) + n * ib */
-
-    starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work);
-
-    WORK = TAU + chameleon_max( m, n );
-    CORE_zgeqrt(m, n, ib, A, lda, T, ldt, TAU, WORK);
-}
-#endif /* !defined(CHAMELEON_SIMULATION) */
-
-/*
- * Codelet definition
- */
-CODELETS_CPU(zgeqrt, 3, cl_zgeqrt_cpu_func)
diff --git a/runtime/starpu/codelets/codelet_zlange.c b/runtime/starpu/codelets/codelet_zlange.c
index f689d82bc..9ab611908 100644
--- a/runtime/starpu/codelets/codelet_zlange.c
+++ b/runtime/starpu/codelets/codelet_zlange.c
@@ -24,10 +24,10 @@
 #include "chameleon_starpu.h"
 #include "runtime_codelet_z.h"
 
-void INSERT_TASK_zlange(const RUNTIME_option_t *options,
-                       cham_normtype_t norm, int M, int N, int NB,
-                       const CHAM_desc_t *A, int Am, int An, int LDA,
-                       const CHAM_desc_t *B, int Bm, int Bn)
+void INSERT_TASK_zlange( const RUNTIME_option_t *options,
+                         cham_normtype_t norm, int M, int N, int NB,
+                         const CHAM_desc_t *A, int Am, int An, int LDA,
+                         const CHAM_desc_t *B, int Bm, int Bn )
 {
     (void)NB;
     struct starpu_codelet *codelet = &cl_zlange;
diff --git a/runtime/starpu/codelets/codelet_ztplqt.c b/runtime/starpu/codelets/codelet_ztplqt.c
index c2f771e69..44615d5c3 100644
--- a/runtime/starpu/codelets/codelet_ztplqt.c
+++ b/runtime/starpu/codelets/codelet_ztplqt.c
@@ -43,6 +43,7 @@ static void cl_ztplqt_cpu_func(void *descr[], void *cl_arg)
     starpu_codelet_unpack_args( cl_arg, &M, &N, &L, &ib,
                                 &lda, &ldb, &ldt );
 
+    CORE_zlaset( ChamUpperLower, ib, M, 0., 0., T, ldt );
     CORE_ztplqt( M, N, L, ib,
                  A, lda, B, ldb, T, ldt, WORK );
 }
diff --git a/runtime/starpu/codelets/codelet_ztpqrt.c b/runtime/starpu/codelets/codelet_ztpqrt.c
index bfddf9d4b..6fbd0afe6 100644
--- a/runtime/starpu/codelets/codelet_ztpqrt.c
+++ b/runtime/starpu/codelets/codelet_ztpqrt.c
@@ -43,6 +43,7 @@ static void cl_ztpqrt_cpu_func(void *descr[], void *cl_arg)
     starpu_codelet_unpack_args( cl_arg, &M, &N, &L, &ib,
                                 &lda, &ldb, &ldt );
 
+    CORE_zlaset( ChamUpperLower, ib, N, 0., 0., T, ldt );
     CORE_ztpqrt( M, N, L, ib,
                  A, lda, B, ldb, T, ldt, WORK );
 }
diff --git a/runtime/starpu/control/runtime_descriptor.c b/runtime/starpu/control/runtime_descriptor.c
index 92b63ce46..c8ffd2e6b 100644
--- a/runtime/starpu/control/runtime_descriptor.c
+++ b/runtime/starpu/control/runtime_descriptor.c
@@ -238,7 +238,7 @@ void RUNTIME_desc_destroy( CHAM_desc_t *desc )
             for (m = 0; m < lmt; m++)
             {
                 if (*handle != NULL) {
-                    starpu_data_unregister(*handle);
+                    starpu_data_unregister_submit(*handle);
                 }
                 handle++;
             }
diff --git a/runtime/starpu/control/runtime_options.c b/runtime/starpu/control/runtime_options.c
index a7a308326..8c833bd18 100644
--- a/runtime/starpu/control/runtime_options.c
+++ b/runtime/starpu/control/runtime_options.c
@@ -49,9 +49,9 @@ int RUNTIME_options_ws_alloc( RUNTIME_option_t *options, size_t worker_size, siz
     int ret = 0;
     if ( worker_size > 0 ) {
         options->ws_wsize = worker_size;
-        starpu_vector_data_register((starpu_data_handle_t*)(&(options->ws_worker)),
-                                    -1, (uintptr_t)NULL,
-                                    worker_size, sizeof(char));
+        starpu_matrix_data_register( (starpu_data_handle_t*)(&(options->ws_worker)),
+                                     -1, (uintptr_t)NULL,
+                                     worker_size, worker_size, 1, sizeof(char));
     }
     if ( host_size > 0 ) {
         options->ws_hsize = host_size;
diff --git a/testing/testing_zgels.c b/testing/testing_zgels.c
index 9abcde8cf..6316ab1f5 100644
--- a/testing/testing_zgels.c
+++ b/testing/testing_zgels.c
@@ -103,7 +103,6 @@ int testing_zgels(int argc, char **argv)
     }
 
     CHAMELEON_Alloc_Workspace_zgels(M, N, &T, 1, 1);
-    memset(T->mat, 0, (T->llm*T->lln)*sizeof(CHAMELEON_Complex64_t));
     eps = LAPACKE_dlamch_work('e');
 
     /*----------------------------------------------------------
diff --git a/testing/testing_zgels_hqr.c b/testing/testing_zgels_hqr.c
index 67101b034..91b6d78d2 100644
--- a/testing/testing_zgels_hqr.c
+++ b/testing/testing_zgels_hqr.c
@@ -99,8 +99,6 @@ int testing_zgels_hqr(int argc, char **argv)
 
     CHAMELEON_Alloc_Workspace_zgels(M, N, &TS, 1, 1);
     CHAMELEON_Alloc_Workspace_zgels(M, N, &TT, 1, 1);
-    memset(TS->mat, 0, (TS->llm*TS->lln)*sizeof(CHAMELEON_Complex64_t));
-    memset(TT->mat, 0, (TT->llm*TT->lln)*sizeof(CHAMELEON_Complex64_t));
 
     eps = LAPACKE_dlamch_work( 'e' );
 
diff --git a/testing/testing_zgels_systolic.c b/testing/testing_zgels_systolic.c
index 53176ceec..7862ee0fd 100644
--- a/testing/testing_zgels_systolic.c
+++ b/testing/testing_zgels_systolic.c
@@ -93,8 +93,6 @@ int testing_zgels_systolic(int argc, char **argv)
 
     CHAMELEON_Alloc_Workspace_zgels(M, N, &TS, 1, 1);
     CHAMELEON_Alloc_Workspace_zgels(M, N, &TT, 1, 1);
-    memset(TS->mat, 0, (TS->llm*TS->lln)*sizeof(CHAMELEON_Complex64_t));
-    memset(TT->mat, 0, (TT->llm*TT->lln)*sizeof(CHAMELEON_Complex64_t));
 
     eps = LAPACKE_dlamch_work( 'e' );
 
diff --git a/timing/time_zgelqf.c b/timing/time_zgelqf.c
index e2c709b70..45c69f046 100644
--- a/timing/time_zgelqf.c
+++ b/timing/time_zgelqf.c
@@ -44,7 +44,6 @@ RunTest(int *iparam, double *dparam, chameleon_time_t *t_)
 
     /* Allocate Workspace */
     CHAMELEON_Alloc_Workspace_zgels(M, N, &T, P, Q);
-    memset(T->mat, 0, (T->llm*T->lln)*sizeof(ChamComplexDouble));
 
     /* Save AT in lapack layout for check */
     PASTE_CODE_ALLOCATE_COPY( Acpy, check, CHAMELEON_Complex64_t, A, LDA, N );
diff --git a/timing/time_zgelqf_tile.c b/timing/time_zgelqf_tile.c
index f79ee5a85..bc4723baf 100644
--- a/timing/time_zgelqf_tile.c
+++ b/timing/time_zgelqf_tile.c
@@ -45,7 +45,6 @@ RunTest(int *iparam, double *dparam, chameleon_time_t *t_)
 
     /* Allocate Workspace */
     CHAMELEON_Alloc_Workspace_zgels_Tile(M, N, &descT, P, Q);
-    memset(descT->mat, 0, (descT->llm*descT->lln)*sizeof(ChamComplexDouble));
 
     /* CHAMELEON ZGEQRF */
     START_TIMING();
diff --git a/timing/time_zgels.c b/timing/time_zgels.c
index 77bbbe667..30a3ad5e1 100644
--- a/timing/time_zgels.c
+++ b/timing/time_zgels.c
@@ -26,7 +26,7 @@
 #include "timing_zauxiliary.h"
 
 static int
-RunTest(int *iparam, double *dparam, chameleon_time_t *t_) 
+RunTest(int *iparam, double *dparam, chameleon_time_t *t_)
 {
     CHAM_desc_t *T;
     PASTE_CODE_IPARAM_LOCALS( iparam );
@@ -47,7 +47,6 @@ RunTest(int *iparam, double *dparam, chameleon_time_t *t_)
     CHAMELEON_zplrnt( M, NRHS, x, LDB, 5673 );
 
     CHAMELEON_Alloc_Workspace_zgels(M, N, &T, P, Q);
-    memset(T->mat, 0, (T->llm*T->lln)*sizeof(ChamComplexDouble));
 
     /* Save A and b  */
     if (check) {
@@ -58,13 +57,13 @@ RunTest(int *iparam, double *dparam, chameleon_time_t *t_)
     START_TIMING();
     CHAMELEON_zgels( ChamNoTrans, M, N, NRHS, A, LDA, T, x, LDB );
     STOP_TIMING();
-    
+
     /* Check the solution */
     if (check)
     {
         dparam[IPARAM_RES] = z_check_solution(M, N, NRHS, Acpy, LDA, b, x, LDB,
-                                              &(dparam[IPARAM_ANORM]), 
-                                              &(dparam[IPARAM_BNORM]), 
+                                              &(dparam[IPARAM_ANORM]),
+                                              &(dparam[IPARAM_BNORM]),
                                               &(dparam[IPARAM_XNORM]));
         free(Acpy); free(b);
     }
diff --git a/timing/time_zgels_tile.c b/timing/time_zgels_tile.c
index 6e0d300fa..0d628287b 100644
--- a/timing/time_zgels_tile.c
+++ b/timing/time_zgels_tile.c
@@ -25,7 +25,7 @@
 #include "./timing.c"
 
 static int
-RunTest(int *iparam, double *dparam, chameleon_time_t *t_) 
+RunTest(int *iparam, double *dparam, chameleon_time_t *t_)
 {
     CHAM_desc_t *descT;
     PASTE_CODE_IPARAM_LOCALS( iparam );
@@ -46,7 +46,6 @@ RunTest(int *iparam, double *dparam, chameleon_time_t *t_)
 
     /* Allocate Workspace */
     CHAMELEON_Alloc_Workspace_zgels_Tile(M, N, &descT, P, Q);
-    memset(descT->mat, 0, (descT->llm*descT->lln)*sizeof(ChamComplexDouble));
 
     /* Save A and B for check */
     if (check == 1){
diff --git a/timing/time_zgeqrf.c b/timing/time_zgeqrf.c
index 70353b2ca..89e3534e4 100644
--- a/timing/time_zgeqrf.c
+++ b/timing/time_zgeqrf.c
@@ -44,7 +44,6 @@ RunTest(int *iparam, double *dparam, chameleon_time_t *t_)
 
     /* Allocate Workspace */
     CHAMELEON_Alloc_Workspace_zgels(M, N, &T, P, Q);
-    memset(T->mat, 0, (T->llm*T->lln)*sizeof(ChamComplexDouble));
 
     /* Save AT in lapack layout for check */
     PASTE_CODE_ALLOCATE_COPY( Acpy, check, CHAMELEON_Complex64_t, A, LDA, N );
diff --git a/timing/time_zgeqrf_hqr.c b/timing/time_zgeqrf_hqr.c
index 725597fa0..6b4f60459 100644
--- a/timing/time_zgeqrf_hqr.c
+++ b/timing/time_zgeqrf_hqr.c
@@ -51,9 +51,7 @@ RunTest(int *iparam, double *dparam, chameleon_time_t *t_)
 
     /* Allocate Workspace */
     CHAMELEON_Alloc_Workspace_zgels(M, N, &TS, P, Q);
-    memset(TS->mat, 0, (TS->llm*TS->lln)*sizeof(ChamComplexDouble));
     CHAMELEON_Alloc_Workspace_zgels(M, N, &TT, P, Q);
-    memset(TT->mat, 0, (TT->llm*TT->lln)*sizeof(ChamComplexDouble));
 
     /* Save AT in lapack layout for check */
     PASTE_CODE_ALLOCATE_COPY( Acpy, check, CHAMELEON_Complex64_t, A, LDA, N );
diff --git a/timing/time_zgeqrf_hqr_tile.c b/timing/time_zgeqrf_hqr_tile.c
index 2b30953e0..3af4530fd 100644
--- a/timing/time_zgeqrf_hqr_tile.c
+++ b/timing/time_zgeqrf_hqr_tile.c
@@ -58,9 +58,7 @@ RunTest(int *iparam, double *dparam, chameleon_time_t *t_)
 
     /* Allocate Workspace */
     CHAMELEON_Alloc_Workspace_zgels(M, N, &TS, P, Q);
-    memset(TS->mat, 0, (TS->llm*TS->lln)*sizeof(ChamComplexDouble));
     CHAMELEON_Alloc_Workspace_zgels(M, N, &TT, P, Q);
-    memset(TT->mat, 0, (TT->llm*TT->lln)*sizeof(ChamComplexDouble));
 
     /* Initialize matrix */
     matrix.mt = TS->mt;
diff --git a/timing/time_zgeqrf_tile.c b/timing/time_zgeqrf_tile.c
index b35782a69..dc257558b 100644
--- a/timing/time_zgeqrf_tile.c
+++ b/timing/time_zgeqrf_tile.c
@@ -45,7 +45,6 @@ RunTest(int *iparam, double *dparam, chameleon_time_t *t_)
 
     /* Allocate Workspace */
     CHAMELEON_Alloc_Workspace_zgels_Tile(M, N, &descT, P, Q);
-    memset(descT->mat, 0, (descT->llm*descT->lln)*sizeof(ChamComplexDouble));
 
     /* CHAMELEON ZGEQRF */
     START_TIMING();
diff --git a/timing/time_zgeqrs_tile.c b/timing/time_zgeqrs_tile.c
index 3018a74b2..78b008c1a 100644
--- a/timing/time_zgeqrs_tile.c
+++ b/timing/time_zgeqrs_tile.c
@@ -48,7 +48,6 @@ RunTest(int *iparam, double *dparam, chameleon_time_t *t_)
 
     /* Allocate Workspace */
     CHAMELEON_Alloc_Workspace_zgels_Tile(M, N, &descT, P, Q);
-    memset(descT->mat, 0, (descT->llm*descT->lln)*sizeof(ChamComplexDouble));
 
     /* CHAMELEON ZGEQRF */
     CHAMELEON_zgeqrf_Tile( descA, descT );
-- 
GitLab