diff --git a/CMakeLists.txt b/CMakeLists.txt
index 770b404d86abd6731d769cfbfb0b75adb59f6aff..8dbc5f4b3fda1685968154087aa656d7ea21f88d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -208,6 +208,11 @@ if (CHAMELEON_RUNTIME_SYNC)
     message("-- ${BoldGreen}CHAMELEON_RUNTIME_SYNC is set to ON, turn it OFF to avoid synchronisation in the tasks submission${ColourReset}")
 endif()
 
+option(CHAMELEON_KERNELS_TRACE "Enable kernel traces to debug the task execution order" OFF)
+if (CHAMELEON_KERNELS_TRACE)
+    message("-- ${BoldGreen}CHAMELEON_KERNELS_TRACE is set to ON, turn it OFF to get better performance${ColourReset}")
+endif()
+
 # Options to enable/disable doc, examples, and testings
 # -----------------------------------------------------
 option(CHAMELEON_ENABLE_DOC      "Enable documentation build"  OFF)
diff --git a/cmake_modules/gitlab-ci-initial-cache.cmake b/cmake_modules/gitlab-ci-initial-cache.cmake
index 3f2d2bd71bdec0bfb64208e1d09f510713ca7b5c..87cf140a8cf31dfc1fec203440e99057f20f3796 100644
--- a/cmake_modules/gitlab-ci-initial-cache.cmake
+++ b/cmake_modules/gitlab-ci-initial-cache.cmake
@@ -2,6 +2,8 @@ set(BUILD_SHARED_LIBS "ON" CACHE BOOL "")
 set(CMAKE_INSTALL_PREFIX "$ENV{PWD}/install" CACHE PATH "")
 set(CMAKE_VERBOSE_MAKEFILE "ON" CACHE BOOL "")
 
+set(CMAKE_C_FLAGS "-Werror")
+
 option(MORSE_ENABLE_WARNING  "Enable warning messages"        ON)
 option(MORSE_ENABLE_COVERAGE "Enable flags for coverage test" ON)
 
diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt
index db91b1d3b5cfed66eafc42fdcc0e16049fbaea4d..d4b97797a312ddfb6ca28c31ebde693afaba14cd 100644
--- a/compute/CMakeLists.txt
+++ b/compute/CMakeLists.txt
@@ -36,6 +36,7 @@ set(CHAMELEON_CONTROL
     ../control/context.c
     ../control/control.c
     ../control/descriptor.c
+    ../control/descriptor_rec.c
     ../control/workspace.c
     ../control/tile.c
     ../control/chameleon_f77.c
diff --git a/control/descriptor.c b/control/descriptor.c
index b26b7c7f2143effaa6357f25639e433bb65c97d2..5af45af7fa0ed24516a3d4835c20a385fe320de9 100644
--- a/control/descriptor.c
+++ b/control/descriptor.c
@@ -26,7 +26,9 @@
  * @brief Group descriptor routines exposed to users
  *
  */
+#define _GNU_SOURCE 1
 #include <stdlib.h>
+#include <stdio.h>
 #include <assert.h>
 #include <string.h>
 #include "control/common.h"
@@ -70,6 +72,17 @@ int chameleon_desc_mat_free( CHAM_desc_t *desc )
     }
 
     if ( desc->tiles ) {
+#if defined(CHAMELEON_KERNELS_TRACE)
+        CHAM_tile_t *tile = desc->tiles;
+        int ii, jj;
+        for( jj=0; jj<desc->lnt; jj++ ) {
+            for( ii=0; ii<desc->lmt; ii++, tile++ ) {
+                if ( tile->name ) {
+                    free( tile->name );
+                }
+            }
+        }
+#endif
         free( desc->tiles );
     }
     return CHAMELEON_SUCCESS;
@@ -91,6 +104,9 @@ void chameleon_desc_init_tiles( CHAM_desc_t *desc )
             tile->n   = jj == desc->lnt-1 ? desc->ln - jj * desc->nb : desc->nb;
             tile->mat = (rank == desc->myrank) ? desc->get_blkaddr( desc, ii, jj ) : NULL;
             tile->ld  = desc->get_blkldd( desc, ii );
+#if defined(CHAMELEON_KERNELS_TRACE)
+            if ( asprintf( &(tile->name), "%s(%d,%d)", desc->name, ii, jj) < 0 ) { tile->name = NULL; } /* the pointer is indeterminate on failure; NULL keeps the free() in chameleon_desc_mat_free safe */
+#endif
         }
     }
 }
@@ -194,38 +210,45 @@ int chameleon_desc_init( CHAM_desc_t *desc, void *mat,
 
     memset( desc, 0, sizeof(CHAM_desc_t) );
 
+    assert( i == 0 );
+    assert( j == 0 );
+    assert( bsiz == (mb * nb) );
+
     chamctxt = chameleon_context_self();
     if (chamctxt == NULL) {
         chameleon_error("CHAMELEON_Desc_Create", "CHAMELEON not initialized");
         return CHAMELEON_ERR_NOT_INITIALIZED;
     }
 
-    // If one of the function get_* is NULL, we switch back to the default, like in chameleon_desc_init()
+    /* If one of the function get_* is NULL, we switch back to the default */
     desc->get_blktile = chameleon_desc_gettile;
     desc->get_blkaddr = get_blkaddr ? get_blkaddr : chameleon_getaddr_ccrb;
     desc->get_blkldd  = get_blkldd  ? get_blkldd  : chameleon_getblkldd_ccrb;
     desc->get_rankof  = get_rankof  ? get_rankof  : chameleon_getrankof_2d;
-    // Matrix properties
+
+    /* Matrix properties */
     desc->dtyp = dtyp;
-    // Should be given as parameter to follow get_blkaddr (unused)
-    desc->styp = ChamCCRB;
+    /* Should be given as parameter to follow get_blkaddr (unused) */
+    desc->styp = (get_blkaddr == chameleon_getaddr_cm ) ? ChamCM : ChamCCRB;
     desc->mb   = mb;
     desc->nb   = nb;
-    desc->bsiz = bsiz;
-    // Large matrix parameters
-    desc->lm = lm;
-    desc->ln = ln;
-    // Large matrix derived parameters
-    desc->lmt = (lm%mb==0) ? (lm/mb) : (lm/mb+1);
-    desc->lnt = (ln%nb==0) ? (ln/nb) : (ln/nb+1);
-    // Submatrix parameters
-    desc->i = i;
-    desc->j = j;
+    desc->bsiz = mb * nb;
+
+    /* Matrix parameters */
+    desc->i = 0;
+    desc->j = 0;
     desc->m = m;
     desc->n = n;
-    // Submatrix derived parameters
-    desc->mt = (m == 0) ? 0 : (i+m-1)/mb - i/mb + 1;
-    desc->nt = (n == 0) ? 0 : (j+n-1)/nb - j/nb + 1;
+
+    /* Matrix stride parameters */
+    desc->lm = m;
+    desc->ln = n;
+
+    /* Matrix derived parameters */
+    desc->mt  = chameleon_ceil( m, mb );
+    desc->nt  = chameleon_ceil( n, nb );
+    desc->lmt = desc->mt;
+    desc->lnt = desc->nt;
 
     desc->id = nbdesc;
     nbdesc++;
@@ -233,14 +256,20 @@ int chameleon_desc_init( CHAM_desc_t *desc, void *mat,
 
     desc->myrank = RUNTIME_comm_rank( chamctxt );
 
-    // Grid size
+    /* Grid size */
     desc->p = p;
     desc->q = q;
 
-    // Local dimensions in tiles
+    /* Local dimensions in tiles */
     if ( desc->myrank < (p*q) ) {
-        desc->llmt = (desc->lmt + p - 1) / p;
-        desc->llnt = (desc->lnt + q - 1) / q;
+        int gmt, gnt;
+
+        /* Compute the fictitious full number of tiles to derive the local leading dimension */
+        gmt = chameleon_ceil( lm, mb );
+        gnt = chameleon_ceil( ln, nb );
+
+        desc->llmt = chameleon_ceil( gmt, p );
+        desc->llnt = chameleon_ceil( gnt, q );
 
         // Local dimensions
         if ( ((desc->lmt-1) % p) == (desc->myrank / q) ) {
@@ -255,8 +284,8 @@ int chameleon_desc_init( CHAM_desc_t *desc, void *mat,
             desc->lln  =  desc->llnt * nb;
         }
 
-        desc->llm1 = (desc->llm/mb);
-        desc->lln1 = (desc->lln/nb);
+        desc->llm1 = desc->llm / mb;
+        desc->lln1 = desc->lln / nb;
     } else {
         desc->llmt = 0;
         desc->llnt = 0;
@@ -326,13 +355,13 @@ CHAM_desc_t* chameleon_desc_submatrix( CHAM_desc_t *descA, int i, int j, int m,
     CHAM_desc_t *descB = malloc(sizeof(CHAM_desc_t));
     int mb, nb;
 
-    if ( (descA->i + i + m) > descA->lm ) {
+    if ( (descA->i + i + m) > descA->m ) {
         chameleon_error("chameleon_desc_submatrix", "The number of rows (i+m) of the submatrix doesn't fit in the parent matrix");
-        assert((descA->i + i + m) > descA->lm);
+        assert((descA->i + i + m) > descA->m);
     }
-    if ( (descA->j + j + n) > descA->ln ) {
+    if ( (descA->j + j + n) > descA->n ) {
         chameleon_error("chameleon_desc_submatrix", "The number of rows (j+n) of the submatrix doesn't fit in the parent matrix");
-        assert((descA->j + j + n) > descA->ln);
+        assert((descA->j + j + n) > descA->n);
     }
 
     memcpy( descB, descA, sizeof(CHAM_desc_t) );
@@ -825,9 +854,11 @@ CHAM_desc_t *CHAMELEON_Desc_CopyOnZero( const CHAM_desc_t *descin, void *mat )
  * @retval CHAMELEON_SUCCESS successful exit
  *
  */
-int CHAMELEON_Desc_Destroy(CHAM_desc_t **desc)
+int CHAMELEON_Desc_Destroy(CHAM_desc_t **descptr)
 {
     CHAM_context_t *chamctxt;
+    CHAM_desc_t *desc;
+    int m, n;
 
     chamctxt = chameleon_context_self();
     if (chamctxt == NULL) {
@@ -835,14 +866,34 @@ int CHAMELEON_Desc_Destroy(CHAM_desc_t **desc)
         return CHAMELEON_ERR_NOT_INITIALIZED;
     }
 
-    if (*desc == NULL) {
+    if ((descptr == NULL) || (*descptr == NULL)) {
         chameleon_error("CHAMELEON_Desc_Destroy", "attempting to destroy a NULL descriptor");
         return CHAMELEON_ERR_UNALLOCATED;
     }
 
-    chameleon_desc_destroy( *desc );
-    free(*desc);
-    *desc = NULL;
+    desc = *descptr;
+    for ( n=0; n<desc->nt; n++ ) {
+        for ( m=0; m<desc->mt; m++ ) {
+            CHAM_tile_t *tile;
+
+            tile = desc->get_blktile( desc, m, n );
+
+            if ( tile->format == CHAMELEON_TILE_DESC ) {
+                CHAM_desc_t *tiledesc = tile->mat;
+
+                /* Recursive names are allocated internally, we need to free them */
+                if ( tiledesc->name ) {
+                    free( (void*)(tiledesc->name) );
+                }
+                CHAMELEON_Desc_Destroy( &tiledesc );
+                assert( tiledesc == NULL );
+            }
+        }
+    }
+
+    chameleon_desc_destroy( desc );
+    free(desc);
+    *descptr = NULL;
     return CHAMELEON_SUCCESS;
 }
 
@@ -940,3 +991,48 @@ void CHAMELEON_user_tag_size(int user_tag_width, int user_tag_sep) {
     RUNTIME_comm_set_tag_sizes( user_tag_width, user_tag_sep );
     return;
 }
+
+/* Print one line per tile of desc, indented by shift columns, recursing into CHAMELEON_TILE_DESC tiles */
+static void
+chameleon_desc_print( const CHAM_desc_t *desc, int shift )
+{
+    const char *name = desc->name ? desc->name : "(null)"; /* passing NULL to %s is undefined */
+    intptr_t    base = (intptr_t)desc->mat;
+    int m, n;
+
+    for ( n=0; n<desc->nt; n++ ) {
+        for ( m=0; m<desc->mt; m++ ) {
+            const CHAM_tile_t *tile     = desc->get_blktile( desc, m, n );
+            const CHAM_desc_t *tiledesc = tile->mat; /* only dereferenced for CHAMELEON_TILE_DESC tiles */
+            intptr_t ptr;
+
+            ptr = ( tile->format == CHAMELEON_TILE_DESC ) ? (intptr_t)(tiledesc->mat) : (intptr_t)(tile->mat);
+
+            /* intptr_t is not long on every platform: cast so the argument matches the conversion */
+            fprintf( stdout, "%*s%s(%3d,%3d): %d * %d / ld = %d / offset= %lld\n",
+                     shift, " ", name, m, n, tile->m, tile->n, tile->ld, (long long)(ptr - base) );
+
+            if ( tile->format == CHAMELEON_TILE_DESC ) {
+                chameleon_desc_print( tiledesc, shift+2 );
+            }
+        }
+    }
+}
+
+/**
+ *****************************************************************************
+ *
+ * @ingroup Descriptor
+ *
+ *  @brief Print the tile structure of a descriptor to stdout, for debugging
+ *
+ ******************************************************************************
+ *
+ * @param[in] desc
+ *          The descriptor whose tiles are printed, with an initial indent of 2
+ */
+void
+CHAMELEON_Desc_Print( const CHAM_desc_t *desc )
+{
+    chameleon_desc_print( desc, 2 );
+}
diff --git a/control/descriptor_rec.c b/control/descriptor_rec.c
new file mode 100644
index 0000000000000000000000000000000000000000..6820b46ed2495d18125eca2ab0f05239665326d8
--- /dev/null
+++ b/control/descriptor_rec.c
@@ -0,0 +1,102 @@
+/**
+ *
+ * @file descriptor_rec.c
+ *
+ * @copyright 2009-2014 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon descriptors routines
+ *
+ * @version 1.0.0
+ * @author Mathieu Faverge
+ * @author Gwenole Lucas
+ * @date 2020-03-03
+ *
+ */
+#define _GNU_SOURCE 1
+#include "control/common.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include "chameleon/runtime.h"
+
+/* Create the descriptor of one layer, then one sub-descriptor per tile while (mb, nb) pairs remain */
+static int
+chameleon_recdesc_create( const char *name, CHAM_desc_t **descptr, void *mat, cham_flttype_t dtyp,
+                          int *mb, int *nb, int lm, int ln, int m, int n, int p, int q,
+                          blkaddr_fct_t get_blkaddr, blkldd_fct_t get_blkldd, blkrankof_fct_t get_rankof )
+{
+    CHAM_desc_t *desc;
+    int rc;
+
+    /* Let's make sure we have at least one couple (mb, nb) defined */
+    assert( (mb[0] > 0) && (nb[0] > 0) );
+
+    /* Create the current layer descriptor (p and q are not forwarded: 1, 1 is passed instead) */
+    rc = CHAMELEON_Desc_Create_User( descptr, mat, dtyp, mb[0], nb[0], mb[0] * nb[0],
+                                     lm, ln, 0, 0, m, n, 1, 1,
+                                     get_blkaddr, get_blkldd, get_rankof );
+    if ( rc != CHAMELEON_SUCCESS ) {
+        return rc; /* do not dereference *descptr when the creation failed */
+    }
+    desc = *descptr;
+    desc->name = name;
+
+    /* Move to the next tile size to recurse; a non-positive size ends the recursion */
+    mb++;
+    nb++;
+    if ( (mb[0] <= 0) || (nb[0] <= 0) ) {
+        return CHAMELEON_SUCCESS;
+    }
+
+    /* m and n are reused below as tile indices */
+    for ( n=0; n<desc->nt; n++ ) {
+        for ( m=0; m<desc->mt; m++ ) {
+            CHAM_desc_t *tiledesc = NULL;
+            CHAM_tile_t *tile;
+            int tempmm, tempnn;
+            char *subname;
+
+            tile = desc->get_blktile( desc, m, n );
+            tempmm = m == desc->mt-1 ? desc->m - m * desc->mb : desc->mb;
+            tempnn = n == desc->nt-1 ? desc->n - n * desc->nb : desc->nb;
+            if ( asprintf( &subname, "%s[%d,%d]", name, m, n ) < 0 ) {
+                return CHAMELEON_ERR_OUT_OF_RESOURCES;
+            }
+
+            /* Capture the status of the sub-tree: without it the check below could never fail */
+            rc = chameleon_recdesc_create( subname, &tiledesc, tile->mat,
+                                           desc->dtyp, mb, nb,
+                                           tile->ld, tempnn, /* Abuse as ln is not used */
+                                           tempmm, tempnn,
+                                           1, 1,             /* can recurse only on local data */
+                                           chameleon_getaddr_cm, chameleon_getblkldd_cm, NULL);
+            if ( rc != CHAMELEON_SUCCESS ) {
+                return rc;
+            }
+            tile->format = CHAMELEON_TILE_DESC;
+            tile->mat = tiledesc;
+        }
+    }
+    return CHAMELEON_SUCCESS;
+}
+
+int
+CHAMELEON_Recursive_Desc_Create( CHAM_desc_t **descptr, void *mat, cham_flttype_t dtyp,
+                                 int *mb, int *nb, int lm, int ln, int m, int n, int p, int q,
+                                 blkaddr_fct_t get_blkaddr, blkldd_fct_t get_blkldd, blkrankof_fct_t get_rankof )
+{
+    /*
+     * The first layer must be allocated, otherwise we will give uninitialized
+     * pointers to the lower layers
+     */
+    assert( (mat != CHAMELEON_MAT_ALLOC_TILE) &&
+            (mat != CHAMELEON_MAT_OOC) );
+    /* mb and nb hold one tile size per layer; the first non-positive entry ends the recursion */
+    return chameleon_recdesc_create( "A", descptr, mat, dtyp,
+                                     mb, nb, lm, ln, m, n, p, q,
+                                     get_blkaddr, get_blkldd, get_rankof );
+}
diff --git a/coreblas/compute/core_ztile.c b/coreblas/compute/core_ztile.c
index 3eb9cffc00d6da2e2e156280be218449791a5883..6c28b72d322c84a18b14cfbd387328d9c48652c7 100644
--- a/coreblas/compute/core_ztile.c
+++ b/coreblas/compute/core_ztile.c
@@ -28,6 +28,7 @@ TCORE_dlag2z( cham_uplo_t uplo, int M, int N,
               const CHAM_tile_t *A,
               CHAM_tile_t       *B )
 {
+    coreblas_kernel_trace( A, B );
     assert( A->format & CHAMELEON_TILE_FULLRANK );
     assert( B->format & CHAMELEON_TILE_FULLRANK );
     CORE_dlag2z( uplo, M, N, A->mat, A->ld, B->mat, B->ld );
@@ -42,7 +43,8 @@ TCORE_dzasum( cham_store_t       storev,
               const CHAM_tile_t *A,
               double *           work )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_dzasum( storev, uplo, M, N, CHAM_tile_get_ptr( A ), A->ld, work );
 }
 
@@ -54,8 +56,9 @@ TCORE_zaxpy( int                   M,
              CHAM_tile_t *         B,
              int                   incB )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
-    assert( B->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A, B );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zaxpy( M, alpha, CHAM_tile_get_ptr( A ), incA, CHAM_tile_get_ptr( B ), incB );
 }
 
@@ -68,6 +71,7 @@ TCORE_zgeadd( cham_trans_t          trans,
               CHAMELEON_Complex64_t beta,
               CHAM_tile_t *         B )
 {
+    coreblas_kernel_trace( A, B );
     if ( (A->format & CHAMELEON_TILE_DESC) &&
          (B->format & CHAMELEON_TILE_DESC) )
     {
@@ -88,8 +92,9 @@ TCORE_zgelqt( int                    M,
               CHAMELEON_Complex64_t *TAU,
               CHAMELEON_Complex64_t *WORK )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
-    assert( T->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A, T );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zgelqt( M, N, IB, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( T ), T->ld, TAU, WORK );
 }
 
@@ -99,6 +104,7 @@ TCORE_zgemv( cham_trans_t trans, int M, int N,
                                           const CHAM_tile_t *x, int incX,
              CHAMELEON_Complex64_t beta,        CHAM_tile_t *y, int incY )
 {
+    coreblas_kernel_trace( A, x, y );
     assert( A->format & CHAMELEON_TILE_FULLRANK );
     assert( x->format & CHAMELEON_TILE_FULLRANK );
     assert( y->format & CHAMELEON_TILE_FULLRANK );
@@ -118,9 +124,10 @@ TCORE_zgemm( cham_trans_t          transA,
              CHAMELEON_Complex64_t beta,
              CHAM_tile_t *         C )
 {
-    if ( ( A->format & CHAMELEON_TILE_FULLRANK ) &&
-         ( B->format & CHAMELEON_TILE_FULLRANK ) &&
-         ( C->format & CHAMELEON_TILE_FULLRANK ) )
+    coreblas_kernel_trace( A, B, C );
+    if ( ( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ) &&
+         ( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ) &&
+         ( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ) )
     {
         CORE_zgemm( transA, transB, M, N, K, alpha,
                     CHAM_tile_get_ptr( A ), A->ld,
@@ -164,32 +171,37 @@ TCORE_zgeqrt( int                    M,
               CHAMELEON_Complex64_t *TAU,
               CHAMELEON_Complex64_t *WORK )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
-    assert( T->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A, T );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zgeqrt( M, N, IB, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( T ), T->ld, TAU, WORK );
 }
 
 int
 TCORE_zgessm( int M, int N, int K, int IB, const int *IPIV, const CHAM_tile_t *L, CHAM_tile_t *A )
 {
-    assert( L->format & CHAMELEON_TILE_FULLRANK );
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( L, A );
+    assert( L->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zgessm( M, N, K, IB, IPIV, CHAM_tile_get_ptr( L ), L->ld, CHAM_tile_get_ptr( A ), A->ld );
 }
 
 int
 TCORE_zgessq( cham_store_t storev, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *sclssq )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
-    assert( sclssq->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A, sclssq );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( sclssq->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zgessq( storev, M, N, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( sclssq ) );
 }
 
 int
 TCORE_zgetrf( int M, int N, CHAM_tile_t *A, int *IPIV, int *INFO )
 {
+    coreblas_kernel_trace( A );
+
     int rc = -1;
-    if ( A->format & CHAMELEON_TILE_FULLRANK ) {
+    if ( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ) {
         rc = CORE_zgetrf( M, N, CHAM_tile_get_ptr( A ), A->ld, IPIV, INFO );
     }
 #if defined( CHAMELEON_USE_HMAT )
@@ -207,7 +219,9 @@ TCORE_zgetrf( int M, int N, CHAM_tile_t *A, int *IPIV, int *INFO )
 int
 TCORE_zgetrf_incpiv( int M, int N, int IB, CHAM_tile_t *A, int *IPIV, int *INFO )
 {
-    if ( A->format & CHAMELEON_TILE_FULLRANK ) {
+    coreblas_kernel_trace( A );
+
+    if ( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ) {
         return CORE_zgetrf_incpiv( M, N, IB, CHAM_tile_get_ptr( A ), A->ld, IPIV, INFO );
     }
 #if defined( CHAMELEON_USE_HMAT )
@@ -224,10 +238,12 @@ TCORE_zgetrf_incpiv( int M, int N, int IB, CHAM_tile_t *A, int *IPIV, int *INFO
 int
 TCORE_zgetrf_nopiv( int M, int N, int IB, CHAM_tile_t *A, int *INFO )
 {
+    coreblas_kernel_trace( A );
+
     int rc = -1;
     *INFO  = 0;
 
-    if ( A->format & CHAMELEON_TILE_FULLRANK ) {
+    if ( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ) {
         rc = CORE_zgetrf_nopiv( M, N, IB, CHAM_tile_get_ptr( A ), A->ld, INFO );
     }
 #if defined( CHAMELEON_USE_HMAT )
@@ -245,8 +261,9 @@ TCORE_zgetrf_nopiv( int M, int N, int IB, CHAM_tile_t *A, int *INFO )
 void
 TCORE_zhe2ge( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *B )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
-    assert( B->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A, B );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zhe2ge( uplo, M, N, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld );
 }
 
@@ -262,9 +279,10 @@ TCORE_zhemm( cham_side_t           side,
              CHAMELEON_Complex64_t beta,
              CHAM_tile_t *         C )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
-    assert( B->format & CHAMELEON_TILE_FULLRANK );
-    assert( C->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A, B, C ); /* same tracing hook as the sibling TCORE_zsymm wrapper */
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zhemm( side, uplo, M, N, alpha, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld, beta, CHAM_tile_get_ptr( C ), C->ld );
 }
 
@@ -278,8 +295,9 @@ TCORE_zherk( cham_uplo_t        uplo,
              double             beta,
              CHAM_tile_t *      C )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
-    assert( C->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A, C );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zherk( uplo, trans, N, K, alpha, CHAM_tile_get_ptr( A ), A->ld, beta, CHAM_tile_get_ptr( C ), C->ld );
 }
 
@@ -294,9 +312,10 @@ TCORE_zher2k( cham_uplo_t           uplo,
               double                beta,
               CHAM_tile_t *         C )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
-    assert( B->format & CHAMELEON_TILE_FULLRANK );
-    assert( C->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A, B, C );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zher2k( uplo, trans, N, K, alpha, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld, beta, CHAM_tile_get_ptr( C ), C->ld );
 }
 #endif
@@ -313,9 +332,10 @@ TCORE_zherfb( cham_uplo_t            uplo,
               CHAMELEON_Complex64_t *WORK,
               int                    ldwork )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
-    assert( T->format & CHAMELEON_TILE_FULLRANK );
-    assert( C->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A, T, C );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zherfb(
         uplo, N, K, IB, NB, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( T ), T->ld, CHAM_tile_get_ptr( C ), C->ld, WORK, ldwork );
 }
@@ -328,8 +348,9 @@ TCORE_zhessq( cham_store_t       storev,
               const CHAM_tile_t *A,
               CHAM_tile_t *      sclssq )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
-    assert( sclssq->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A, sclssq );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( sclssq->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zhessq( storev, uplo, N, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( sclssq ) );
 }
 #endif
@@ -353,7 +374,8 @@ TCORE_zlange( cham_normtype_t    norm,
               double *           work,
               double *           normA )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zlange( norm, M, N, CHAM_tile_get_ptr( A ), A->ld, work, normA );
 }
 
@@ -366,7 +388,8 @@ TCORE_zlanhe( cham_normtype_t    norm,
               double *           work,
               double *           normA )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zlanhe( norm, uplo, N, CHAM_tile_get_ptr( A ), A->ld, work, normA );
 }
 #endif
@@ -379,7 +402,8 @@ TCORE_zlansy( cham_normtype_t    norm,
               double *           work,
               double *           normA )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zlansy( norm, uplo, N, CHAM_tile_get_ptr( A ), A->ld, work, normA );
 }
 
@@ -393,14 +417,16 @@ TCORE_zlantr( cham_normtype_t    norm,
               double *           work,
               double *           normA )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zlantr( norm, uplo, diag, M, N, CHAM_tile_get_ptr( A ), A->ld, work, normA );
 }
 
 int
 TCORE_zlascal( cham_uplo_t uplo, int m, int n, CHAMELEON_Complex64_t alpha, CHAM_tile_t *A )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zlascal( uplo, m, n, alpha, CHAM_tile_get_ptr( A ), A->ld );
 }
 
@@ -412,14 +438,16 @@ TCORE_zlaset( cham_uplo_t           uplo,
               CHAMELEON_Complex64_t beta,
               CHAM_tile_t *         A )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zlaset( uplo, n1, n2, alpha, beta, CHAM_tile_get_ptr( A ), A->ld );
 }
 
 void
 TCORE_zlaset2( cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, CHAM_tile_t *A )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zlaset2( uplo, n1, n2, alpha, CHAM_tile_get_ptr( A ), A->ld );
 }
 
@@ -431,15 +459,17 @@ TCORE_zlatro( cham_uplo_t        uplo,
               const CHAM_tile_t *A,
               CHAM_tile_t *      B )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
-    assert( B->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A, B );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zlatro( uplo, trans, M, N, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld );
 }
 
 void
 TCORE_zlauum( cham_uplo_t uplo, int N, CHAM_tile_t *A )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zlauum( uplo, N, CHAM_tile_get_ptr( A ), A->ld );
 }
 
@@ -448,14 +478,15 @@ void
 TCORE_zplghe( double                 bump,
               int                    m,
               int                    n,
-              CHAM_tile_t *          tileA,
+              CHAM_tile_t *          A,
               int                    bigM,
               int                    m0,
               int                    n0,
               unsigned long long int seed )
 {
-    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
-    CORE_zplghe( bump, m, n, CHAM_tile_get_ptr( tileA ), tileA->ld, bigM, m0, n0, seed );
+    coreblas_kernel_trace( A ); /* same tracing hook as TCORE_zplgsy and TCORE_zplrnt */
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    CORE_zplghe( bump, m, n, CHAM_tile_get_ptr( A ), A->ld, bigM, m0, n0, seed );
 }
 #endif
 
@@ -463,33 +493,36 @@ void
 TCORE_zplgsy( CHAMELEON_Complex64_t  bump,
               int                    m,
               int                    n,
-              CHAM_tile_t *          tileA,
+              CHAM_tile_t *          A,
               int                    bigM,
               int                    m0,
               int                    n0,
               unsigned long long int seed )
 {
-    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
-    CORE_zplgsy( bump, m, n, CHAM_tile_get_ptr( tileA ), tileA->ld, bigM, m0, n0, seed );
+    coreblas_kernel_trace( A );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    CORE_zplgsy( bump, m, n, CHAM_tile_get_ptr( A ), A->ld, bigM, m0, n0, seed );
 }
 
 void
 TCORE_zplrnt( int                    m,
               int                    n,
-              CHAM_tile_t *          tileA,
+              CHAM_tile_t *          A,
               int                    bigM,
               int                    m0,
               int                    n0,
               unsigned long long int seed )
 {
-    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
-    CORE_zplrnt( m, n, CHAM_tile_get_ptr( tileA ), tileA->ld, bigM, m0, n0, seed );
+    coreblas_kernel_trace( A );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    CORE_zplrnt( m, n, CHAM_tile_get_ptr( A ), A->ld, bigM, m0, n0, seed );
 }
 
 void
 TCORE_zpotrf( cham_uplo_t uplo, int n, CHAM_tile_t *A, int *INFO )
 {
-    if ( A->format & CHAMELEON_TILE_FULLRANK ) {
+    coreblas_kernel_trace( A );
+    if ( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ) {
         CORE_zpotrf( uplo, n, CHAM_tile_get_ptr( A ), A->ld, INFO );
     }
 #if defined( CHAMELEON_USE_HMAT )
@@ -517,10 +550,11 @@ TCORE_zssssm( int                M1,
               const CHAM_tile_t *L2,
               const int *        IPIV )
 {
-    assert( A1->format & CHAMELEON_TILE_FULLRANK );
-    assert( A2->format & CHAMELEON_TILE_FULLRANK );
-    assert( L1->format & CHAMELEON_TILE_FULLRANK );
-    assert( L2->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A1, A2, L1, L2 );
+    assert( A1->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( A2->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( L1->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( L2->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zssssm( M1,
                         N1,
                         M2,
@@ -549,9 +583,10 @@ TCORE_zsymm( cham_side_t           side,
              CHAMELEON_Complex64_t beta,
              CHAM_tile_t *         C )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
-    assert( B->format & CHAMELEON_TILE_FULLRANK );
-    assert( C->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A, B, C );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zsymm( side, uplo, M, N, alpha, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld, beta, CHAM_tile_get_ptr( C ), C->ld );
 }
 
@@ -565,8 +600,9 @@ TCORE_zsyrk( cham_uplo_t           uplo,
              CHAMELEON_Complex64_t beta,
              CHAM_tile_t *         C )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
-    assert( C->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A, C );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zsyrk( uplo, trans, N, K, alpha, CHAM_tile_get_ptr( A ), A->ld, beta, CHAM_tile_get_ptr( C ), C->ld );
 }
 
@@ -581,9 +617,10 @@ TCORE_zsyr2k( cham_uplo_t           uplo,
               CHAMELEON_Complex64_t beta,
               CHAM_tile_t *         C )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
-    assert( B->format & CHAMELEON_TILE_FULLRANK );
-    assert( C->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A, B, C );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zsyr2k( uplo, trans, N, K, alpha, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld, beta, CHAM_tile_get_ptr( C ), C->ld );
 }
 
@@ -594,8 +631,9 @@ TCORE_zsyssq( cham_store_t       storev,
               const CHAM_tile_t *A,
               CHAM_tile_t *      sclssq )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
-    assert( sclssq->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A, sclssq );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( sclssq->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zsyssq( storev, uplo, N, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( sclssq ) );
 }
 
@@ -603,7 +641,8 @@ TCORE_zsyssq( cham_store_t       storev,
 int
 TCORE_zsytf2_nopiv( cham_uplo_t uplo, int n, CHAM_tile_t *A )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zsytf2_nopiv( uplo, n, CHAM_tile_get_ptr( A ), A->ld );
 }
 #endif
@@ -618,9 +657,10 @@ TCORE_ztplqt( int                    M,
               CHAM_tile_t *          T,
               CHAMELEON_Complex64_t *WORK )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
-    assert( B->format & CHAMELEON_TILE_FULLRANK );
-    assert( T->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A, B, T );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_ztplqt( M, N, L, IB, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld, CHAM_tile_get_ptr( T ), T->ld, WORK );
 }
 
@@ -638,10 +678,11 @@ TCORE_ztpmlqt( cham_side_t            side,
                CHAM_tile_t *          B,
                CHAMELEON_Complex64_t *WORK )
 {
-    assert( V->format & CHAMELEON_TILE_FULLRANK );
-    assert( T->format & CHAMELEON_TILE_FULLRANK );
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
-    assert( B->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( V, T, A, B );
+    assert( V->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_ztpmlqt( side,
                          trans,
                          M,
@@ -674,10 +715,11 @@ TCORE_ztpmqrt( cham_side_t            side,
                CHAM_tile_t *          B,
                CHAMELEON_Complex64_t *WORK )
 {
-    assert( V->format & CHAMELEON_TILE_FULLRANK );
-    assert( T->format & CHAMELEON_TILE_FULLRANK );
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
-    assert( B->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( V, T, A, B );
+    assert( V->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_ztpmqrt( side,
                          trans,
                          M,
@@ -706,9 +748,10 @@ TCORE_ztpqrt( int                    M,
               CHAM_tile_t *          T,
               CHAMELEON_Complex64_t *WORK )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
-    assert( B->format & CHAMELEON_TILE_FULLRANK );
-    assert( T->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A, B, T );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_ztpqrt( M, N, L, IB, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld, CHAM_tile_get_ptr( T ), T->ld, WORK );
 }
 
@@ -722,6 +765,7 @@ TCORE_ztradd( cham_uplo_t           uplo,
               CHAMELEON_Complex64_t beta,
               CHAM_tile_t *         B )
 {
+    coreblas_kernel_trace( A, B );
     if (( A->format & CHAMELEON_TILE_DESC ) &&
         ( B->format & CHAMELEON_TILE_DESC ) )
     {
@@ -739,7 +783,8 @@ TCORE_ztrasm( cham_store_t       storev,
               const CHAM_tile_t *A,
               double *           work )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_ztrasm( storev, uplo, diag, M, N, CHAM_tile_get_ptr( A ), A->ld, work );
 }
 
@@ -754,8 +799,9 @@ TCORE_ztrmm( cham_side_t           side,
              const CHAM_tile_t *   A,
              CHAM_tile_t *         B )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
-    assert( B->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A, B );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_ztrmm( side, uplo, transA, diag, M, N, alpha, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld );
 }
 
@@ -770,13 +816,16 @@ TCORE_ztrsm( cham_side_t           side,
              const CHAM_tile_t *   A,
              CHAM_tile_t *         B )
 {
-    if ( ( A->format & CHAMELEON_TILE_FULLRANK ) &&
-         ( B->format & CHAMELEON_TILE_FULLRANK ) )
+    coreblas_kernel_trace( A, B );
+
+    if ( ( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ) &&
+         ( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ) )
     {
         CORE_ztrsm( side, uplo, transA, diag, M, N, alpha, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld );
     }
 #if defined( CHAMELEON_USE_HMAT )
     else if ( A->format & CHAMELEON_TILE_HMAT ) {
+        assert( !(B->format & CHAMELEON_TILE_DESC) );
         hmat_ztrsm( chameleon_lapack_const( side ),
                     chameleon_lapack_const( uplo ),
                     chameleon_lapack_const( transA ),
@@ -799,8 +848,9 @@ TCORE_ztrssq( cham_uplo_t        uplo,
               const CHAM_tile_t *A,
               CHAM_tile_t *      sclssq )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
-    assert( sclssq->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A, sclssq );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( sclssq->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     double *W = CHAM_tile_get_ptr( sclssq );
     return CORE_ztrssq( uplo, diag, M, N, CHAM_tile_get_ptr( A ), A->ld, W, W + 1 );
 }
@@ -808,7 +858,8 @@ TCORE_ztrssq( cham_uplo_t        uplo,
 void
 TCORE_ztrtri( cham_uplo_t uplo, cham_diag_t diag, int N, CHAM_tile_t *A, int *info )
 {
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A ); /* trace this kernel as the other TCORE_* wrappers do */
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_ztrtri( uplo, diag, N, CHAM_tile_get_ptr( A ), A->ld, info );
 }
 
@@ -828,10 +878,11 @@ TCORE_ztsmlq_hetra1( cham_side_t            side,
                      CHAMELEON_Complex64_t *WORK,
                      int                    ldwork )
 {
-    assert( A1->format & CHAMELEON_TILE_FULLRANK );
-    assert( A2->format & CHAMELEON_TILE_FULLRANK );
-    assert( V->format & CHAMELEON_TILE_FULLRANK );
-    assert( T->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A1, A2, V, T );
+    assert( A1->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( A2->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( V->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_ztsmlq_hetra1( side,
                                trans,
                                m1,
@@ -868,10 +919,11 @@ TCORE_ztsmqr_hetra1( cham_side_t            side,
                      CHAMELEON_Complex64_t *WORK,
                      int                    ldwork )
 {
-    assert( A1->format & CHAMELEON_TILE_FULLRANK );
-    assert( A2->format & CHAMELEON_TILE_FULLRANK );
-    assert( V->format & CHAMELEON_TILE_FULLRANK );
-    assert( T->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( A1, A2, V, T );
+    assert( A1->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( A2->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( V->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_ztsmqr_hetra1( side,
                                trans,
                                m1,
@@ -905,9 +957,10 @@ TCORE_ztstrf( int                    M,
               int                    LDWORK,
               int *                  INFO )
 {
-    assert( U->format & CHAMELEON_TILE_FULLRANK );
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
-    assert( L->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( U, A, L );
+    assert( U->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( L->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_ztstrf(
         M, N, IB, NB, CHAM_tile_get_ptr( U ), U->ld, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( L ), L->ld, IPIV, WORK, LDWORK, INFO );
 }
@@ -925,9 +978,10 @@ TCORE_zunmlq( cham_side_t            side,
               CHAMELEON_Complex64_t *WORK,
               int                    LDWORK )
 {
-    assert( V->format & CHAMELEON_TILE_FULLRANK );
-    assert( T->format & CHAMELEON_TILE_FULLRANK );
-    assert( C->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( V, T, C );
+    assert( V->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zunmlq(
         side, trans, M, N, K, IB, CHAM_tile_get_ptr( V ), V->ld, CHAM_tile_get_ptr( T ), T->ld, CHAM_tile_get_ptr( C ), C->ld, WORK, LDWORK );
 }
@@ -945,9 +999,10 @@ TCORE_zunmqr( cham_side_t            side,
               CHAMELEON_Complex64_t *WORK,
               int                    LDWORK )
 {
-    assert( V->format & CHAMELEON_TILE_FULLRANK );
-    assert( T->format & CHAMELEON_TILE_FULLRANK );
-    assert( C->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( V, T, C );
+    assert( V->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zunmqr(
         side, trans, M, N, K, IB, CHAM_tile_get_ptr( V ), V->ld, CHAM_tile_get_ptr( T ), T->ld, CHAM_tile_get_ptr( C ), C->ld, WORK, LDWORK );
 }
@@ -963,10 +1018,11 @@ TCORE_zgram( cham_uplo_t        uplo,
              const CHAM_tile_t *D,
              CHAM_tile_t *      A )
 {
-    assert( Di->format & CHAMELEON_TILE_FULLRANK );
-    assert( Dj->format & CHAMELEON_TILE_FULLRANK );
-    assert( D->format & CHAMELEON_TILE_FULLRANK );
-    assert( A->format & CHAMELEON_TILE_FULLRANK );
+    coreblas_kernel_trace( Di, Dj, D, A );
+    assert( Di->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( Dj->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( D->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zgram(
         uplo, M, N, Mt, Nt, CHAM_tile_get_ptr( Di ), Di->ld, CHAM_tile_get_ptr( Dj ), Dj->ld, CHAM_tile_get_ptr( D ), CHAM_tile_get_ptr( A ), A->ld );
 }
diff --git a/coreblas/compute/global.c b/coreblas/compute/global.c
index b370d72f8ab7b9dc604d8b478493c5f425aa3c72..7ce1142c0743561f725484f3cd5a91770e6243d3 100644
--- a/coreblas/compute/global.c
+++ b/coreblas/compute/global.c
@@ -20,8 +20,51 @@
  * @date 2020-03-03
  *
  */
+#include "coreblas.h"
+#include <stdarg.h>
+#include <stdlib.h>
+
+int _coreblas_silent = 0;
 static int coreblas_gemm3m_enabled = 0;
 
+/* Constructor hook: CHAMELEON_COREBLAS_SILENT set to anything but "0" silences traces */
+__attribute__((unused)) __attribute__((constructor)) static void
+__coreblas_lib_init()
+{
+    const char *env = getenv( "CHAMELEON_COREBLAS_SILENT" );
+    if ( (env == NULL) || (strcmp( env, "0" ) == 0) ) { return; }
+    _coreblas_silent = 1;
+}
+
+#if defined(CHAMELEON_KERNELS_TRACE)
+/* Print "[coreblas] Execute <func>(<names>)" on stderr, unless _coreblas_silent is
+ * set. The variadic arguments are const CHAM_tile_t* and must end with NULL. */
+void __coreblas_kernel_trace( const char *func, ... )
+{
+    char               output[1024] = "";
+    const int          len   = (int)sizeof( output );
+    int                first = 1;
+    int                size;
+    va_list            ap;
+    const CHAM_tile_t *tile;
+
+    if ( _coreblas_silent ) {
+        return;
+    }
+    size = snprintf( output, len, "[coreblas] Execute %s(", func );
+    va_start( ap, func );
+    /* snprintf returns the untruncated length: stop as soon as the buffer is full */
+    while ( (size >= 0) && (size < len) &&
+            ((tile = va_arg( ap, const CHAM_tile_t * )) != NULL) ) {
+        size += snprintf( output + size, len - size, "%s%s", first ? "" : ", ", tile->name );
+        first = 0; /* names after the first one are separated by ", " */
+    }
+    va_end( ap );
+    fprintf( stderr, "%s)\n", output );
+    fflush( stderr );
+}
+#endif
+
 void
 set_coreblas_gemm3m_enabled( int v ) {
     coreblas_gemm3m_enabled = v;
diff --git a/coreblas/include/coreblas.h b/coreblas/include/coreblas.h
index e203d81b2c812aff5499513dd141f1aa5f51b623..771f7856e69abe062fe5c4b3e627a90b109f3b49 100644
--- a/coreblas/include/coreblas.h
+++ b/coreblas/include/coreblas.h
@@ -87,6 +87,21 @@ extern char *chameleon_lapack_constants[];
 void set_coreblas_gemm3m_enabled( int v );
 int  get_coreblas_gemm3m_enabled( void );
 
+
+#if defined(CHAMELEON_KERNELS_TRACE)
+
+void __coreblas_kernel_trace( const char *func, ... );
+#define coreblas_kernel_trace( ... )                                    \
+    do {                                                                \
+        __coreblas_kernel_trace( __chameleon_func__, ##__VA_ARGS__, NULL );  \
+    } while(0)
+
+#else
+
+#define coreblas_kernel_trace( ... ) do {} while(0)
+
+#endif
+
 END_C_DECLS
 
 #endif /* _coreblas_h_ */
diff --git a/include/chameleon.h b/include/chameleon.h
index 4a7a72bbda201e4e94389972a69255393021e1ce..49e9e4d06cf945c8c636a8717d5f44db5336dcea 100644
--- a/include/chameleon.h
+++ b/include/chameleon.h
@@ -140,6 +140,7 @@ int CHAMELEON_Desc_Acquire( const CHAM_desc_t *desc );
 int CHAMELEON_Desc_Release( const CHAM_desc_t *desc );
 int CHAMELEON_Desc_Flush  ( const CHAM_desc_t        *desc,
                             const RUNTIME_sequence_t *sequence );
+void CHAMELEON_Desc_Print( const CHAM_desc_t *desc );
 
 /* Workspaces */
 int CHAMELEON_Dealloc_Workspace (CHAM_desc_t **desc);
@@ -163,6 +164,10 @@ int CHAMELEON_Request_Create  (RUNTIME_request_t **request);
 int CHAMELEON_Request_Destroy (RUNTIME_request_t *request);
 int CHAMELEON_Request_Set     (RUNTIME_request_t *request, int param, int value);
 
+int  CHAMELEON_Recursive_Desc_Create( CHAM_desc_t **descptr, void *mat, cham_flttype_t dtyp,
+                                      int *mb, int *nb, int lm, int ln, int m, int n, int p, int q,
+                                      blkaddr_fct_t get_blkaddr, blkldd_fct_t get_blkldd, blkrankof_fct_t get_rankof );
+
 /**
  *
  * @ingroup Control
diff --git a/include/chameleon/config.h.in b/include/chameleon/config.h.in
index 15073d1c4604947b704f0f3cf5ff55f15074012a..dc878d47b68f1a0e514ca9b9bf6f9c95de0e1ece 100644
--- a/include/chameleon/config.h.in
+++ b/include/chameleon/config.h.in
@@ -34,6 +34,8 @@
 
 /* Debug scheduling engine */
 #cmakedefine CHAMELEON_RUNTIME_SYNC
+/* Debug coreblas execution order if not provided by the runtime */
+#cmakedefine CHAMELEON_KERNELS_TRACE
 
 /* Communication engine */
 #cmakedefine CHAMELEON_USE_MPI
@@ -86,4 +88,10 @@
 #define END_C_DECLS   /* empty */
 #endif
 
+#ifdef _MSC_VER
+#define __chameleon_func__ __FUNCTION__
+#else
+#define __chameleon_func__ __func__
+#endif
+
 #endif /* CHAMELEON_CONFIG_H_HAS_BEEN_INCLUDED */
diff --git a/include/chameleon/struct.h b/include/chameleon/struct.h
index e1d424838324598f69fa1a72fe4ec5908d26617c..a43e46d699092a3308f7cfb8896fddc39e8457f2 100644
--- a/include/chameleon/struct.h
+++ b/include/chameleon/struct.h
@@ -35,6 +35,9 @@ BEGIN_C_DECLS
 #define CHAMELEON_TILE_HMAT     (1 << 2)
 
 typedef struct chameleon_tile_s {
+#if defined(CHAMELEON_KERNELS_TRACE)
+    char  *name;
+#endif
     int8_t format;
     int    m, n, ld;
     void  *mat;
@@ -67,6 +70,7 @@ typedef int          (*blkrankof_fct_t)( const CHAM_desc_t*, int, int );
 typedef CHAM_tile_t* (*blktile_fct_t)  ( const CHAM_desc_t*, int, int );
 
 struct chameleon_desc_s {
+    const char *name;
     // function to get chameleon tiles address
     blktile_fct_t  get_blktile;
     // function to get chameleon tiles address
diff --git a/include/chameleon/types.h b/include/chameleon/types.h
index 76f5bae73e656635de0b31081cec3db3ab5ba7b7..0a38ebb9a7a97e7eb7f6d8294826a990a592f1fe 100644
--- a/include/chameleon/types.h
+++ b/include/chameleon/types.h
@@ -126,6 +126,10 @@ static inline int chameleon_min( int a, int b ) {
     if ( a < b ) return a; else return b;
 }
 
+static inline int chameleon_ceil( int a, int b ) { /* integer division a / b, rounded up */
+    return ( a + b - 1 ) / b; /* exact ceiling when a >= 0 and b > 0 */
+}
+
 typedef double cham_fixdbl_t;
 
 END_C_DECLS
diff --git a/runtime/openmp/control/runtime_options.c b/runtime/openmp/control/runtime_options.c
index 66e6cc44a1d51bb61b1c4b60829c0ced06e3f678..5e5a2e5289825fde459a212e2ef54357eda94a85 100644
--- a/runtime/openmp/control/runtime_options.c
+++ b/runtime/openmp/control/runtime_options.c
@@ -25,15 +25,15 @@
 void RUNTIME_options_init( RUNTIME_option_t *option, CHAM_context_t *chamctxt,
                            RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
 {
-    option->sequence   = sequence;
-    option->request    = request;
-    option->profiling  = CHAMELEON_PROFILING == CHAMELEON_TRUE;
-    option->parallel   = CHAMELEON_PARALLEL == CHAMELEON_TRUE;
-    option->priority   = RUNTIME_PRIORITY_MIN;
-    option->ws_wsize   = 0;
-    option->ws_hsize   = 0;
-    option->ws_worker  = NULL;
-    option->ws_host    = NULL;
+    option->sequence  = sequence;
+    option->request   = request;
+    option->profiling = CHAMELEON_PROFILING == CHAMELEON_TRUE;
+    option->parallel  = CHAMELEON_PARALLEL == CHAMELEON_TRUE;
+    option->priority  = RUNTIME_PRIORITY_MIN;
+    option->ws_wsize  = 0;
+    option->ws_hsize  = 0;
+    option->ws_worker = NULL;
+    option->ws_host   = NULL;
     return;
 }
 
diff --git a/runtime/starpu/codelets/codelet_zlauum.c b/runtime/starpu/codelets/codelet_zlauum.c
index c377736025b31907b9be901f6a4209c49f1e6a63..740fdc0fba5bbdcde1ee0258821013d59836ed4b 100644
--- a/runtime/starpu/codelets/codelet_zlauum.c
+++ b/runtime/starpu/codelets/codelet_zlauum.c
@@ -39,7 +39,6 @@ cl_zlauum_cpu_func(void *descr[], void *cl_arg)
 {
     struct cl_zlauum_args_s clargs;
     CHAM_tile_t *tileA;
-    int info = 0;
 
     tileA = cti_interface_get(descr[0]);
 
diff --git a/runtime/starpu/control/runtime_descriptor.c b/runtime/starpu/control/runtime_descriptor.c
index 2ed7183a7b627f2b3e78a5cc6bf641991bd57d80..9b802391aeb26b55c0968f7ebe7b5e281e67aa8f 100644
--- a/runtime/starpu/control/runtime_descriptor.c
+++ b/runtime/starpu/control/runtime_descriptor.c
@@ -241,8 +241,8 @@ void RUNTIME_desc_destroy( CHAM_desc_t *desc )
         for (n = 0; n < lnt; n++) {
             for (m = 0; m < lmt; m++)
             {
-                if (*handle != NULL) {
-                    starpu_data_unregister(*handle);
+                if ( *handle != NULL ) {
+                    starpu_data_unregister_submit(*handle);
                     *handle = NULL;
                 }
                 handle++;
@@ -384,14 +384,27 @@ void RUNTIME_data_flush( const RUNTIME_sequence_t *sequence,
 {
     int64_t mm = m + (A->i / A->mb);
     int64_t nn = n + (A->j / A->nb);
-
+    int64_t shift = ((int64_t)A->lmt) * nn + mm;
     starpu_data_handle_t *handle = A->schedopt;
-    handle += ((int64_t)A->lmt) * nn + mm;
+    CHAM_tile_t          *tile   = A->tiles;
+    handle += shift;
+    tile   += shift;
 
     if (*handle == NULL) {
         return;
     }
 
+    /*
+     * TODO: revisit later; a better test would be to check whether the
+     * handle has actually been partitioned.
+     *
+     * Right now, we cannot flush a partitioned handle (it would have to be
+     * unpartitioned first), so we flush only the children.
+     */
+    if ( tile->format & CHAMELEON_TILE_DESC ) {
+        CHAMELEON_Desc_Flush( tile->mat, sequence );
+    }
+
 #if defined(CHAMELEON_USE_MPI)
     starpu_mpi_cache_flush( MPI_COMM_WORLD, *handle );
 #endif
diff --git a/runtime/starpu/interface/cham_tile_interface.c b/runtime/starpu/interface/cham_tile_interface.c
index a15e0b78117814f8e2a8ee886f3c40668e914bb3..92d2d42036d0a525e76649d8319cccfe904771b0 100644
--- a/runtime/starpu/interface/cham_tile_interface.c
+++ b/runtime/starpu/interface/cham_tile_interface.c
@@ -19,7 +19,6 @@
 #include "chameleon_starpu.h"
 #if defined(CHAMELEON_USE_HMAT)
 #include "coreblas/hmat.h"
-#endif
 
 static inline void
 cti_hmat_destroy( starpu_cham_tile_interface_t *cham_tile_interface )
@@ -47,7 +46,7 @@ static inline size_t
 cti_get_hmat_required_size( starpu_cham_tile_interface_t *cham_tile_interface )
 {
     size_t size = 0;
-#if defined(CHAMELEON_USE_HMAT)
+
     if ( (cham_tile_interface->tile.format & CHAMELEON_TILE_HMAT) &&
          (cham_tile_interface->tile.mat != NULL ) )
     {
@@ -68,10 +67,15 @@ cti_get_hmat_required_size( starpu_cham_tile_interface_t *cham_tile_interface )
             STARPU_ASSERT_MSG( 0, "cti_get_hmat_required_size(cham_tile_interface): unknown flttype\n" );
         }
     }
-#endif
-    (void)cham_tile_interface;
+
     return size;
 }
+#else
+static inline size_t
+cti_get_hmat_required_size( starpu_cham_tile_interface_t *cham_tile_interface  __attribute__((unused)) ) {
+    return 0;
+}
+#endif
 
 static inline CHAM_tile_t *
 cti_handle_get( starpu_data_handle_t handle )
@@ -287,9 +291,16 @@ cti_display( starpu_data_handle_t handle, FILE *f )
     starpu_cham_tile_interface_t *cham_tile_interface = (starpu_cham_tile_interface_t *)
         starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
+#if defined(CHAMELEON_KERNELS_TRACE)
+    fprintf( f, "%s{.m=%u,.n=%u}",
+             cham_tile_interface->tile.name,
+             cham_tile_interface->tile.m,
+             cham_tile_interface->tile.n );
+#else
     fprintf( f, "%u\t%u\t",
              cham_tile_interface->tile.m,
              cham_tile_interface->tile.n );
+#endif
 }
 
 static int
@@ -454,7 +465,6 @@ cti_peek_data( starpu_data_handle_t handle, unsigned node, void *ptr, size_t cou
         starpu_data_get_interface_on_node(handle, node);
 
     char *tmp = ptr;
-    size_t size;
 
 #if defined(CHAMELEON_USE_MPI_DATATYPES)
     /*
@@ -467,6 +477,7 @@ cti_peek_data( starpu_data_handle_t handle, unsigned node, void *ptr, size_t cou
 #else
     {
         CHAM_tile_t dsttile;
+        size_t size;
 
         /* Extract the size of the information to unpack */
         memcpy( &size, tmp, sizeof(size_t) );
diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt
index ee99eb5983a823174d044869d947d45e4b0180b7..1079e455cd9f8f17ecb33d4e5a3317efdb8c50b3 100644
--- a/testing/CMakeLists.txt
+++ b/testing/CMakeLists.txt
@@ -45,6 +45,7 @@ set(CHAMELEON_SOURCES_TARGETS "${CHAMELEON_SOURCES_TARGETS};testing_include" CAC
 set(ZSRC
   chameleon_ztesting.c
   testing_zcheck.c
+  testing_zprint.c
   ##################
   # LAPACK
   ##################
diff --git a/testing/CTestLists.cmake b/testing/CTestLists.cmake
index 007d8013938501d3e90f079eef6bb60d4740d3e2..9ca61431a485b6eb9251d208ddf5d2581b549f53 100644
--- a/testing/CTestLists.cmake
+++ b/testing/CTestLists.cmake
@@ -26,7 +26,7 @@ if (NOT CHAMELEON_SIMULATION)
     #
     # Create the list of test based on precision and runtime
     #
-    set( TESTS lacpy lange lantr lansy plrnk )
+    set( TESTS print lacpy lange lantr lansy plrnk )
     if ( ${prec} STREQUAL c OR ${prec} STREQUAL z )
       set( TESTS ${TESTS} lanhe )
     endif()
diff --git a/testing/chameleon_ztesting.c b/testing/chameleon_ztesting.c
index 9384b7627f1573da57550c3f3bcdb0ce17a37e27..6aa508d5ac73892d9c51a15336b01238af2e70a3 100644
--- a/testing/chameleon_ztesting.c
+++ b/testing/chameleon_ztesting.c
@@ -64,6 +64,10 @@ static parameter_t parameters[] = {
     { "nb", "Tile size nb",       'b', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 3, TestValInt, {0}, NULL, pread_int, sprint_int },
     { "ib", "Inner tile size ib", 'i', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 2, TestValInt, {0}, NULL, pread_int, sprint_int },
 
+    { "l1", "Size of the first level of recursion",  '1', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 3, TestValInt, {0}, NULL, pread_int, sprint_int },
+    { "l2", "Size of the second level of recursion", '2', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 3, TestValInt, {0}, NULL, pread_int, sprint_int },
+    { "l3", "Size of the third level of recursion",  '3', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 3, TestValInt, {0}, NULL, pread_int, sprint_int },
+
     { "lda", "Leading dimension of the matrix A", 'A', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 5, TestValInt, {0}, NULL, pread_int, sprint_int },
     { "ldb", "Leading dimension of the matrix B", 'B', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 5, TestValInt, {0}, NULL, pread_int, sprint_int },
     { "ldc", "Leading dimension of the matrix C", 'C', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 5, TestValInt, {0}, NULL, pread_int, sprint_int },
diff --git a/testing/input/print.in b/testing/input/print.in
new file mode 100644
index 0000000000000000000000000000000000000000..64a60716cc7d056315c315480026a2b041fd4e29
--- /dev/null
+++ b/testing/input/print.in
@@ -0,0 +1,18 @@
+# You can enumerate each parameter's values as an explicit list separated by commas or by a range start:end[:step]
+# Not given parameters will receive default values
+
+# PRINT
+# nb: Tile size
+# ib: Inner tile size
+# l1: Size of the first level of recursion
+# l2, l3: Sizes of the second and third levels of recursion
+# M: Number of rows of matrix A
+# N: Number of columns of matrix A
+# LDA: Leading dimension of matrix A
+
+op = print
+nb = 16, 17
+ib = 8
+m = 13:45:16
+n = 15:52:16
+lda = 65
diff --git a/testing/testing_zprint.c b/testing/testing_zprint.c
new file mode 100644
index 0000000000000000000000000000000000000000..1db0422641d87723b0bd15dc75619e79f3cf1b53
--- /dev/null
+++ b/testing/testing_zprint.c
@@ -0,0 +1,132 @@
+/**
+ *
+ * @file testing_zprint.c
+ *
+ * @copyright 2019-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zprint testing
+ *
+ * @version 1.1.0
+ * @author Lucas Barros de Assis
+ * @author Mathieu Faverge
+ * @date 2020-11-19
+ * @precisions normal z -> c d s
+ *
+ */
+#include <chameleon.h>
+#include "testings.h"
+#include "testing_zcheck.h"
+#include <chameleon/flops.h>
+
+/**
+ * @brief Internal function to return the address of block (m,n), with
+ *        m,n = block indices, for a matrix addressed with a single leading
+ *        dimension (A->llm).
+ *
+ * The block indices are first shifted by the origin of the submatrix
+ * (A->i / A->mb, A->j / A->nb). When CHAMELEON_USE_MPI is defined, the block
+ * is asserted to belong to the calling rank and the shifted indices are then
+ * divided by the process grid dimensions A->p and A->q.
+ *
+ * @param[in] A  Descriptor of the matrix.
+ * @param[in] m  Row index of the block.
+ * @param[in] n  Column index of the block.
+ *
+ * @return A->mat advanced by (llm * nb * nn + mb * mm) elements, the element
+ *         size being given by CHAMELEON_Element_Size( A->dtyp ).
+ */
+inline static void *chameleon_getaddr_cm(const CHAM_desc_t *A, int m, int n)
+{
+    size_t mm = m + A->i / A->mb;
+    size_t nn = n + A->j / A->nb;
+    size_t eltsize = CHAMELEON_Element_Size(A->dtyp);
+    size_t offset = 0;
+
+#if defined(CHAMELEON_USE_MPI)
+    assert( A->myrank == A->get_rankof( A, mm, nn) );
+    mm = mm / A->p;
+    nn = nn / A->q;
+#endif
+
+    /*
+     * Widen each factor before multiplying: casting the product
+     * (A->llm * A->nb) afterwards lets the multiplication overflow in the
+     * narrower type first (assumes llm and nb are int - TODO confirm).
+     */
+    offset = (size_t)(A->llm) * (size_t)(A->nb) * nn + (size_t)(A->mb) * mm;
+    return (void*)((intptr_t)A->mat + (offset*eltsize) );
+}
+
+/**
+ *  Internal function to return the leading dimension of the blocks of row m:
+ *  always A->llm, whatever the value of m.
+ */
+inline static int chameleon_getblkldd_cm(const CHAM_desc_t *A, int m)
+{
+    const int ld = A->llm;
+
+    (void)m; /* The row index does not influence the result */
+    return ld;
+}
+
+/**
+ * @brief Print the descriptor produced by a creation call, then destroy it.
+ *
+ * @param[in]     rc       Return code of the descriptor creation call.
+ * @param[in,out] descptr  Descriptor pointer handed to the creation call.
+ *
+ * @return 0 when rc is CHAMELEON_SUCCESS (the descriptor is printed and
+ *         destroyed here), 1 otherwise (the descriptor is left untouched).
+ */
+static int
+zprint_show_desc( int rc, CHAM_desc_t **descptr )
+{
+    if ( rc != CHAMELEON_SUCCESS ) {
+        /* Do not print or destroy a descriptor that was never created */
+        fprintf( stderr, "testing_zprint: descriptor creation failed (rc=%d)\n", rc );
+        return 1;
+    }
+
+    CHAMELEON_Desc_Print( *descptr );
+    CHAMELEON_Desc_Destroy( descptr );
+    return 0;
+}
+
+/**
+ * @brief Create, print and destroy the same matrix descriptor with four
+ *        different creation calls (tile, user-defined column-major
+ *        addressing, and recursive with and without the user-defined
+ *        addressing functions).
+ *
+ * @param[in,out] args   Run arguments; nb, N, M, LDA, l1, l2 and l3 are read
+ *                       from it, "time" and "gflops" are added to it.
+ * @param[in]     check  Unused by this test.
+ *
+ * @return The number of descriptor creations that failed (0 on success).
+ */
+int
+testing_zprint( run_arg_list_t *args, int check )
+{
+    int          hres  = 0;
+    int          rc;
+    CHAM_desc_t *descA = NULL;
+
+    /* Reads arguments */
+    intptr_t    mtxfmt = parameters_getvalue_int( "mtxfmt" );
+    int         nb     = run_arg_get_int( args, "nb", 320 );
+    int         P      = parameters_getvalue_int( "P" );
+    int         N      = run_arg_get_int( args, "N", 1000 );
+    int         M      = run_arg_get_int( args, "M", N );
+    int         LDA    = run_arg_get_int( args, "LDA", M );
+    int         l1     = run_arg_get_int( args, "l1", nb / 2 );
+    int         l2     = run_arg_get_int( args, "l2", l1 / 3 );
+    int         l3     = run_arg_get_int( args, "l3", l2 / 2 );
+    int         Q      = parameters_compute_q( P );
+
+    /*
+     * Sizes handed to the recursive descriptor creation, outermost first.
+     * NOTE(review): the trailing 0 presumably terminates the list - confirm
+     * against CHAMELEON_Recursive_Desc_Create.
+     */
+    int list_nb[] = { nb, l1, l2, l3, 0 };
+
+    (void)check;
+
+    CHAMELEON_Set( CHAMELEON_TILE_SIZE, nb );
+
+    fprintf( stdout, "--- Tile layout ---\n" );
+    rc = CHAMELEON_Desc_Create(
+        &descA, (void*)(-mtxfmt), ChamComplexDouble, nb, nb, nb * nb, LDA, N, 0, 0, M, N, P, Q );
+    hres += zprint_show_desc( rc, &descA );
+
+    fprintf( stdout, "--- Lapacke layout ---\n" );
+    rc = CHAMELEON_Desc_Create_User(
+        &descA, (void*)(-mtxfmt), ChamComplexDouble, nb, nb, nb * nb, LDA, N, 0, 0, M, N, P, Q,
+        chameleon_getaddr_cm, chameleon_getblkldd_cm, NULL );
+    hres += zprint_show_desc( rc, &descA );
+
+    fprintf( stdout, "--- Recursive layout (Tile)---\n" );
+    rc = CHAMELEON_Recursive_Desc_Create(
+        &descA, CHAMELEON_MAT_ALLOC_GLOBAL, ChamComplexDouble,
+        list_nb, list_nb, LDA, N, M, N, P, Q,
+        NULL, NULL, NULL );
+    hres += zprint_show_desc( rc, &descA );
+
+    fprintf( stdout, "--- Recursive layout (Lapack) ---\n" );
+    rc = CHAMELEON_Recursive_Desc_Create(
+        &descA, CHAMELEON_MAT_ALLOC_GLOBAL, ChamComplexDouble,
+        list_nb, list_nb, LDA, N, M, N, P, Q,
+        chameleon_getaddr_cm, chameleon_getblkldd_cm, NULL );
+    hres += zprint_show_desc( rc, &descA );
+
+    /* Nothing is timed in this test: fixed values are recorded instead */
+    run_arg_add_fixdbl( args, "time", 1. );
+    run_arg_add_fixdbl( args, "gflops", 1. );
+
+    return hres;
+}
+
+/* Test descriptor, filled in and registered by testing_zprint_init() */
+testing_t test_zprint;
+
+/* NULL-terminated list of the parameter names attached to the test */
+const char *zprint_params[] = {
+    "mtxfmt", "nb", "l1", "l2", "l3", "m", "n", "lda",
+    NULL
+};
+
+/* NULL-terminated list of the output names attached to the test (empty) */
+const char *zprint_output[] = {
+    NULL
+};
+
+/* NULL-terminated list of the check output names attached to the test */
+const char *zprint_outchk[] = {
+    "RETURN",
+    NULL
+};
+
+/**
+ * @brief Testing registration function
+ *
+ * Fills the test_zprint descriptor and hands it to testing_register().
+ * Declared with the constructor attribute so that it is called
+ * automatically, without any explicit call from the testing driver.
+ */
+void testing_zprint_init( void ) __attribute__( ( constructor ) );
+void
+testing_zprint_init( void )
+{
+    testing_t *test = &test_zprint;
+
+    /* Execution part of the descriptor */
+    test->fptr   = testing_zprint;
+    test->next   = NULL;
+
+    /* Description part of the descriptor */
+    test->name   = "zprint";
+    test->helper = "Print descriptors";
+    test->params = zprint_params;
+    test->output = zprint_output;
+    test->outchk = zprint_outchk;
+
+    testing_register( test );
+}