From 8b915c8058f56dcd4470888445fc39b00d91d38f Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Wed, 12 May 2021 17:45:07 +0200
Subject: [PATCH] coreblas/trace: add CHAMELEON_KERNELS_TRACE option to name
 the tiles and print information about the executed kernels

---
 CMakeLists.txt                |  5 +++
 control/descriptor.c          | 15 +++++++
 coreblas/compute/core_ztile.c | 73 ++++++++++++++++++++++++++++++-----
 coreblas/compute/global.c     | 33 ++++++++++++++++
 coreblas/include/coreblas.h   | 15 +++++++
 include/chameleon/config.h.in |  8 ++++
 include/chameleon/struct.h    |  3 ++
 7 files changed, 143 insertions(+), 9 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 770b404d8..8dbc5f4b3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -208,6 +208,11 @@ if (CHAMELEON_RUNTIME_SYNC)
     message("-- ${BoldGreen}CHAMELEON_RUNTIME_SYNC is set to ON, turn it OFF to avoid synchronisation in the tasks submission${ColourReset}")
 endif()
 
+option(CHAMELEON_KERNELS_TRACE "Enable kernel traces to debug the task execution order" OFF)
+if (CHAMELEON_KERNELS_TRACE) # debugging aid: remind the user that it is enabled
+    message("-- ${BoldGreen}CHAMELEON_KERNELS_TRACE is set to ON, turn it OFF to get better performance${ColourReset}")
+endif()
+
 # Options to enable/disable doc, examples, and testings
 # -----------------------------------------------------
 option(CHAMELEON_ENABLE_DOC      "Enable documentation build"  OFF)
diff --git a/control/descriptor.c b/control/descriptor.c
index 8d65d4cf6..5af45af7f 100644
--- a/control/descriptor.c
+++ b/control/descriptor.c
@@ -26,6 +26,7 @@
  * @brief Group descriptor routines exposed to users
  *
  */
+#define _GNU_SOURCE 1
 #include <stdlib.h>
 #include <stdio.h>
 #include <assert.h>
@@ -71,6 +72,17 @@ int chameleon_desc_mat_free( CHAM_desc_t *desc )
     }
 
     if ( desc->tiles ) {
+#if defined(CHAMELEON_KERNELS_TRACE)
+        CHAM_tile_t *tile = desc->tiles;
+        int ii, jj;
+        for( jj=0; jj<desc->lnt; jj++ ) {
+            for( ii=0; ii<desc->lmt; ii++, tile++ ) {
+                if ( tile->name ) {
+                    free( tile->name );
+                }
+            }
+        }
+#endif
         free( desc->tiles );
     }
     return CHAMELEON_SUCCESS;
@@ -92,6 +104,9 @@ void chameleon_desc_init_tiles( CHAM_desc_t *desc )
             tile->n   = jj == desc->lnt-1 ? desc->ln - jj * desc->nb : desc->nb;
             tile->mat = (rank == desc->myrank) ? desc->get_blkaddr( desc, ii, jj ) : NULL;
             tile->ld  = desc->get_blkldd( desc, ii );
+#if defined(CHAMELEON_KERNELS_TRACE)
+            if ( asprintf( &(tile->name), "%s(%d,%d)", desc->name, ii, jj ) < 0 ) { tile->name = NULL; } /* name is undefined on failure */
+#endif
         }
     }
 }
diff --git a/coreblas/compute/core_ztile.c b/coreblas/compute/core_ztile.c
index 20236eb4c..6c28b72d3 100644
--- a/coreblas/compute/core_ztile.c
+++ b/coreblas/compute/core_ztile.c
@@ -28,6 +28,7 @@ TCORE_dlag2z( cham_uplo_t uplo, int M, int N,
               const CHAM_tile_t *A,
               CHAM_tile_t       *B )
 {
+    coreblas_kernel_trace( A, B );
     assert( A->format & CHAMELEON_TILE_FULLRANK );
     assert( B->format & CHAMELEON_TILE_FULLRANK );
     CORE_dlag2z( uplo, M, N, A->mat, A->ld, B->mat, B->ld );
@@ -42,6 +43,7 @@ TCORE_dzasum( cham_store_t       storev,
               const CHAM_tile_t *A,
               double *           work )
 {
+    coreblas_kernel_trace( A );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_dzasum( storev, uplo, M, N, CHAM_tile_get_ptr( A ), A->ld, work );
 }
@@ -54,6 +56,7 @@ TCORE_zaxpy( int                   M,
              CHAM_tile_t *         B,
              int                   incB )
 {
+    coreblas_kernel_trace( A, B );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zaxpy( M, alpha, CHAM_tile_get_ptr( A ), incA, CHAM_tile_get_ptr( B ), incB );
@@ -68,6 +71,7 @@ TCORE_zgeadd( cham_trans_t          trans,
               CHAMELEON_Complex64_t beta,
               CHAM_tile_t *         B )
 {
+    coreblas_kernel_trace( A, B );
     if ( (A->format & CHAMELEON_TILE_DESC) &&
          (B->format & CHAMELEON_TILE_DESC) )
     {
@@ -88,6 +92,7 @@ TCORE_zgelqt( int                    M,
               CHAMELEON_Complex64_t *TAU,
               CHAMELEON_Complex64_t *WORK )
 {
+    coreblas_kernel_trace( A, T );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zgelqt( M, N, IB, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( T ), T->ld, TAU, WORK );
@@ -99,6 +104,7 @@ TCORE_zgemv( cham_trans_t trans, int M, int N,
                                           const CHAM_tile_t *x, int incX,
              CHAMELEON_Complex64_t beta,        CHAM_tile_t *y, int incY )
 {
+    coreblas_kernel_trace( A, x, y );
     assert( A->format & CHAMELEON_TILE_FULLRANK );
     assert( x->format & CHAMELEON_TILE_FULLRANK );
     assert( y->format & CHAMELEON_TILE_FULLRANK );
@@ -118,6 +124,7 @@ TCORE_zgemm( cham_trans_t          transA,
              CHAMELEON_Complex64_t beta,
              CHAM_tile_t *         C )
 {
+    coreblas_kernel_trace( A, B, C );
     if ( ( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ) &&
          ( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ) &&
          ( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ) )
@@ -164,6 +171,7 @@ TCORE_zgeqrt( int                    M,
               CHAMELEON_Complex64_t *TAU,
               CHAMELEON_Complex64_t *WORK )
 {
+    coreblas_kernel_trace( A, T );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zgeqrt( M, N, IB, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( T ), T->ld, TAU, WORK );
@@ -172,6 +180,7 @@ TCORE_zgeqrt( int                    M,
 int
 TCORE_zgessm( int M, int N, int K, int IB, const int *IPIV, const CHAM_tile_t *L, CHAM_tile_t *A )
 {
+    coreblas_kernel_trace( L, A );
     assert( L->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zgessm( M, N, K, IB, IPIV, CHAM_tile_get_ptr( L ), L->ld, CHAM_tile_get_ptr( A ), A->ld );
@@ -180,6 +189,7 @@ TCORE_zgessm( int M, int N, int K, int IB, const int *IPIV, const CHAM_tile_t *L
 int
 TCORE_zgessq( cham_store_t storev, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *sclssq )
 {
+    coreblas_kernel_trace( A, sclssq );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( sclssq->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zgessq( storev, M, N, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( sclssq ) );
@@ -188,6 +198,8 @@ TCORE_zgessq( cham_store_t storev, int M, int N, const CHAM_tile_t *A, CHAM_tile
 int
 TCORE_zgetrf( int M, int N, CHAM_tile_t *A, int *IPIV, int *INFO )
 {
+    coreblas_kernel_trace( A );
+
     int rc = -1;
     if ( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ) {
         rc = CORE_zgetrf( M, N, CHAM_tile_get_ptr( A ), A->ld, IPIV, INFO );
@@ -207,6 +219,8 @@ TCORE_zgetrf( int M, int N, CHAM_tile_t *A, int *IPIV, int *INFO )
 int
 TCORE_zgetrf_incpiv( int M, int N, int IB, CHAM_tile_t *A, int *IPIV, int *INFO )
 {
+    coreblas_kernel_trace( A );
+
     if ( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ) {
         return CORE_zgetrf_incpiv( M, N, IB, CHAM_tile_get_ptr( A ), A->ld, IPIV, INFO );
     }
@@ -224,6 +238,8 @@ TCORE_zgetrf_incpiv( int M, int N, int IB, CHAM_tile_t *A, int *IPIV, int *INFO
 int
 TCORE_zgetrf_nopiv( int M, int N, int IB, CHAM_tile_t *A, int *INFO )
 {
+    coreblas_kernel_trace( A );
+
     int rc = -1;
     *INFO  = 0;
 
@@ -245,6 +261,7 @@ TCORE_zgetrf_nopiv( int M, int N, int IB, CHAM_tile_t *A, int *INFO )
 void
 TCORE_zhe2ge( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *B )
 {
+    coreblas_kernel_trace( A, B );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zhe2ge( uplo, M, N, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld );
@@ -278,6 +295,7 @@ TCORE_zherk( cham_uplo_t        uplo,
              double             beta,
              CHAM_tile_t *      C )
 {
+    coreblas_kernel_trace( A, C );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zherk( uplo, trans, N, K, alpha, CHAM_tile_get_ptr( A ), A->ld, beta, CHAM_tile_get_ptr( C ), C->ld );
@@ -294,6 +312,7 @@ TCORE_zher2k( cham_uplo_t           uplo,
               double                beta,
               CHAM_tile_t *         C )
 {
+    coreblas_kernel_trace( A, B, C );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
@@ -313,6 +332,7 @@ TCORE_zherfb( cham_uplo_t            uplo,
               CHAMELEON_Complex64_t *WORK,
               int                    ldwork )
 {
+    coreblas_kernel_trace( A, T, C );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
@@ -328,6 +348,7 @@ TCORE_zhessq( cham_store_t       storev,
               const CHAM_tile_t *A,
               CHAM_tile_t *      sclssq )
 {
+    coreblas_kernel_trace( A, sclssq );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( sclssq->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zhessq( storev, uplo, N, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( sclssq ) );
@@ -353,6 +374,7 @@ TCORE_zlange( cham_normtype_t    norm,
               double *           work,
               double *           normA )
 {
+    coreblas_kernel_trace( A );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zlange( norm, M, N, CHAM_tile_get_ptr( A ), A->ld, work, normA );
 }
@@ -366,6 +388,7 @@ TCORE_zlanhe( cham_normtype_t    norm,
               double *           work,
               double *           normA )
 {
+    coreblas_kernel_trace( A );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zlanhe( norm, uplo, N, CHAM_tile_get_ptr( A ), A->ld, work, normA );
 }
@@ -379,6 +402,7 @@ TCORE_zlansy( cham_normtype_t    norm,
               double *           work,
               double *           normA )
 {
+    coreblas_kernel_trace( A );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zlansy( norm, uplo, N, CHAM_tile_get_ptr( A ), A->ld, work, normA );
 }
@@ -393,6 +417,7 @@ TCORE_zlantr( cham_normtype_t    norm,
               double *           work,
               double *           normA )
 {
+    coreblas_kernel_trace( A );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zlantr( norm, uplo, diag, M, N, CHAM_tile_get_ptr( A ), A->ld, work, normA );
 }
@@ -400,6 +425,7 @@ TCORE_zlantr( cham_normtype_t    norm,
 int
 TCORE_zlascal( cham_uplo_t uplo, int m, int n, CHAMELEON_Complex64_t alpha, CHAM_tile_t *A )
 {
+    coreblas_kernel_trace( A );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zlascal( uplo, m, n, alpha, CHAM_tile_get_ptr( A ), A->ld );
 }
@@ -412,6 +438,7 @@ TCORE_zlaset( cham_uplo_t           uplo,
               CHAMELEON_Complex64_t beta,
               CHAM_tile_t *         A )
 {
+    coreblas_kernel_trace( A );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zlaset( uplo, n1, n2, alpha, beta, CHAM_tile_get_ptr( A ), A->ld );
 }
@@ -419,6 +446,7 @@ TCORE_zlaset( cham_uplo_t           uplo,
 void
 TCORE_zlaset2( cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, CHAM_tile_t *A )
 {
+    coreblas_kernel_trace( A );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zlaset2( uplo, n1, n2, alpha, CHAM_tile_get_ptr( A ), A->ld );
 }
@@ -431,6 +459,7 @@ TCORE_zlatro( cham_uplo_t        uplo,
               const CHAM_tile_t *A,
               CHAM_tile_t *      B )
 {
+    coreblas_kernel_trace( A, B );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zlatro( uplo, trans, M, N, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld );
@@ -439,6 +468,7 @@ TCORE_zlatro( cham_uplo_t        uplo,
 void
 TCORE_zlauum( cham_uplo_t uplo, int N, CHAM_tile_t *A )
 {
+    coreblas_kernel_trace( A );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zlauum( uplo, N, CHAM_tile_get_ptr( A ), A->ld );
 }
@@ -448,14 +478,15 @@ void
 TCORE_zplghe( double                 bump,
               int                    m,
               int                    n,
-              CHAM_tile_t *          tileA,
+              CHAM_tile_t *          A,
               int                    bigM,
               int                    m0,
               int                    n0,
               unsigned long long int seed )
 {
-    assert( tileA->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
-    CORE_zplghe( bump, m, n, CHAM_tile_get_ptr( tileA ), tileA->ld, bigM, m0, n0, seed );
+    coreblas_kernel_trace( A ); /* report this kernel call when CHAMELEON_KERNELS_TRACE is enabled */
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    CORE_zplghe( bump, m, n, CHAM_tile_get_ptr( A ), A->ld, bigM, m0, n0, seed );
 }
 #endif
 
@@ -463,32 +493,35 @@ void
 TCORE_zplgsy( CHAMELEON_Complex64_t  bump,
               int                    m,
               int                    n,
-              CHAM_tile_t *          tileA,
+              CHAM_tile_t *          A,
               int                    bigM,
               int                    m0,
               int                    n0,
               unsigned long long int seed )
 {
-    assert( tileA->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
-    CORE_zplgsy( bump, m, n, CHAM_tile_get_ptr( tileA ), tileA->ld, bigM, m0, n0, seed );
+    coreblas_kernel_trace( A );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    CORE_zplgsy( bump, m, n, CHAM_tile_get_ptr( A ), A->ld, bigM, m0, n0, seed );
 }
 
 void
 TCORE_zplrnt( int                    m,
               int                    n,
-              CHAM_tile_t *          tileA,
+              CHAM_tile_t *          A,
               int                    bigM,
               int                    m0,
               int                    n0,
               unsigned long long int seed )
 {
-    assert( tileA->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
-    CORE_zplrnt( m, n, CHAM_tile_get_ptr( tileA ), tileA->ld, bigM, m0, n0, seed );
+    coreblas_kernel_trace( A );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    CORE_zplrnt( m, n, CHAM_tile_get_ptr( A ), A->ld, bigM, m0, n0, seed );
 }
 
 void
 TCORE_zpotrf( cham_uplo_t uplo, int n, CHAM_tile_t *A, int *INFO )
 {
+    coreblas_kernel_trace( A );
     if ( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ) {
         CORE_zpotrf( uplo, n, CHAM_tile_get_ptr( A ), A->ld, INFO );
     }
@@ -517,6 +550,7 @@ TCORE_zssssm( int                M1,
               const CHAM_tile_t *L2,
               const int *        IPIV )
 {
+    coreblas_kernel_trace( A1, A2, L1, L2 );
     assert( A1->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( A2->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( L1->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
@@ -549,6 +583,7 @@ TCORE_zsymm( cham_side_t           side,
              CHAMELEON_Complex64_t beta,
              CHAM_tile_t *         C )
 {
+    coreblas_kernel_trace( A, B, C );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
@@ -565,6 +600,7 @@ TCORE_zsyrk( cham_uplo_t           uplo,
              CHAMELEON_Complex64_t beta,
              CHAM_tile_t *         C )
 {
+    coreblas_kernel_trace( A, C );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_zsyrk( uplo, trans, N, K, alpha, CHAM_tile_get_ptr( A ), A->ld, beta, CHAM_tile_get_ptr( C ), C->ld );
@@ -581,6 +617,7 @@ TCORE_zsyr2k( cham_uplo_t           uplo,
               CHAMELEON_Complex64_t beta,
               CHAM_tile_t *         C )
 {
+    coreblas_kernel_trace( A, B, C );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
@@ -594,6 +631,7 @@ TCORE_zsyssq( cham_store_t       storev,
               const CHAM_tile_t *A,
               CHAM_tile_t *      sclssq )
 {
+    coreblas_kernel_trace( A, sclssq );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( sclssq->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zsyssq( storev, uplo, N, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( sclssq ) );
@@ -603,6 +641,7 @@ TCORE_zsyssq( cham_store_t       storev,
 int
 TCORE_zsytf2_nopiv( cham_uplo_t uplo, int n, CHAM_tile_t *A )
 {
+    coreblas_kernel_trace( A );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     return CORE_zsytf2_nopiv( uplo, n, CHAM_tile_get_ptr( A ), A->ld );
 }
@@ -618,6 +657,7 @@ TCORE_ztplqt( int                    M,
               CHAM_tile_t *          T,
               CHAMELEON_Complex64_t *WORK )
 {
+    coreblas_kernel_trace( A, B, T );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
@@ -638,6 +678,7 @@ TCORE_ztpmlqt( cham_side_t            side,
                CHAM_tile_t *          B,
                CHAMELEON_Complex64_t *WORK )
 {
+    coreblas_kernel_trace( V, T, A, B );
     assert( V->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
@@ -674,6 +715,7 @@ TCORE_ztpmqrt( cham_side_t            side,
                CHAM_tile_t *          B,
                CHAMELEON_Complex64_t *WORK )
 {
+    coreblas_kernel_trace( V, T, A, B );
     assert( V->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
@@ -706,6 +748,7 @@ TCORE_ztpqrt( int                    M,
               CHAM_tile_t *          T,
               CHAMELEON_Complex64_t *WORK )
 {
+    coreblas_kernel_trace( A, B, T );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
@@ -722,6 +765,7 @@ TCORE_ztradd( cham_uplo_t           uplo,
               CHAMELEON_Complex64_t beta,
               CHAM_tile_t *         B )
 {
+    coreblas_kernel_trace( A, B );
     if (( A->format & CHAMELEON_TILE_DESC ) &&
         ( B->format & CHAMELEON_TILE_DESC ) )
     {
@@ -739,6 +783,7 @@ TCORE_ztrasm( cham_store_t       storev,
               const CHAM_tile_t *A,
               double *           work )
 {
+    coreblas_kernel_trace( A );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_ztrasm( storev, uplo, diag, M, N, CHAM_tile_get_ptr( A ), A->ld, work );
 }
@@ -754,6 +799,7 @@ TCORE_ztrmm( cham_side_t           side,
              const CHAM_tile_t *   A,
              CHAM_tile_t *         B )
 {
+    coreblas_kernel_trace( A, B );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     CORE_ztrmm( side, uplo, transA, diag, M, N, alpha, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld );
@@ -770,6 +816,8 @@ TCORE_ztrsm( cham_side_t           side,
              const CHAM_tile_t *   A,
              CHAM_tile_t *         B )
 {
+    coreblas_kernel_trace( A, B );
+
     if ( ( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ) &&
          ( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ) )
     {
@@ -800,6 +848,7 @@ TCORE_ztrssq( cham_uplo_t        uplo,
               const CHAM_tile_t *A,
               CHAM_tile_t *      sclssq )
 {
+    coreblas_kernel_trace( A, sclssq );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( sclssq->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     double *W = CHAM_tile_get_ptr( sclssq );
@@ -829,6 +878,7 @@ TCORE_ztsmlq_hetra1( cham_side_t            side,
                      CHAMELEON_Complex64_t *WORK,
                      int                    ldwork )
 {
+    coreblas_kernel_trace( A1, A2, V, T );
     assert( A1->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( A2->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( V->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
@@ -869,6 +919,7 @@ TCORE_ztsmqr_hetra1( cham_side_t            side,
                      CHAMELEON_Complex64_t *WORK,
                      int                    ldwork )
 {
+    coreblas_kernel_trace( A1, A2, V, T );
     assert( A1->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( A2->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( V->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
@@ -906,6 +957,7 @@ TCORE_ztstrf( int                    M,
               int                    LDWORK,
               int *                  INFO )
 {
+    coreblas_kernel_trace( U, A, L );
     assert( U->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( L->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
@@ -926,6 +978,7 @@ TCORE_zunmlq( cham_side_t            side,
               CHAMELEON_Complex64_t *WORK,
               int                    LDWORK )
 {
+    coreblas_kernel_trace( V, T, C );
     assert( V->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
@@ -946,6 +999,7 @@ TCORE_zunmqr( cham_side_t            side,
               CHAMELEON_Complex64_t *WORK,
               int                    LDWORK )
 {
+    coreblas_kernel_trace( V, T, C );
     assert( V->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
@@ -964,6 +1018,7 @@ TCORE_zgram( cham_uplo_t        uplo,
              const CHAM_tile_t *D,
              CHAM_tile_t *      A )
 {
+    coreblas_kernel_trace( Di, Dj, D, A );
     assert( Di->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( Dj->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( D->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
diff --git a/coreblas/compute/global.c b/coreblas/compute/global.c
index b370d72f8..f92498976 100644
--- a/coreblas/compute/global.c
+++ b/coreblas/compute/global.c
@@ -20,8 +20,41 @@
  * @date 2020-03-03
  *
  */
+#include "coreblas.h"
+#include <stdarg.h>
+
+int _coreblas_silent = 0; /* when non-zero, __coreblas_kernel_trace() returns without printing */
 static int coreblas_gemm3m_enabled = 0;
 
+#if defined(CHAMELEON_KERNELS_TRACE)
+/* Print "[coreblas] Execute <func>(<tile names>)" on stderr; the variadic tile list must end with NULL. */
+void __coreblas_kernel_trace( const char *func, ... )
+{
+    char output[1024] = "";
+    const int len = (int)sizeof(output);
+    int first = 1;
+    int size, rc;
+    va_list ap;
+    const CHAM_tile_t *tile;
+
+    if (_coreblas_silent) {
+        return;
+    }
+    /* snprintf returns the untruncated length: clamp it so that size never leaves output */
+    rc   = snprintf( output, len, "[coreblas] Execute %s(", func );
+    size = (rc < 0) ? 0 : ((rc < len) ? rc : len - 1);
+    va_start( ap, func );
+    while( (size < len - 1) && ((tile = va_arg(ap, const CHAM_tile_t*)) != NULL) ) {
+        rc = snprintf( output+size, len-size, "%s%s", first ? "" : ", ", tile->name ? tile->name : "(null)" );
+        size += (rc < 0) ? 0 : ((rc < len-size) ? rc : len-size-1);
+        first = 0; /* every name after the first one is preceded by ", " */
+    }
+    va_end( ap );
+    fprintf( stderr, "%s)\n", output );
+    fflush(stderr);
+}
+#endif
+
 void
 set_coreblas_gemm3m_enabled( int v ) {
     coreblas_gemm3m_enabled = v;
diff --git a/coreblas/include/coreblas.h b/coreblas/include/coreblas.h
index e203d81b2..771f7856e 100644
--- a/coreblas/include/coreblas.h
+++ b/coreblas/include/coreblas.h
@@ -87,6 +87,21 @@ extern char *chameleon_lapack_constants[];
 void set_coreblas_gemm3m_enabled( int v );
 int  get_coreblas_gemm3m_enabled( void );
 
+
+#if defined(CHAMELEON_KERNELS_TRACE)
+/* coreblas_kernel_trace( t1, ... ): forwards the caller's function name, the given tiles and a closing NULL. */
+void __coreblas_kernel_trace( const char *func, ... );
+#define coreblas_kernel_trace( ... )                                    \
+    do {                                                                \
+        __coreblas_kernel_trace( __chameleon_func__, ##__VA_ARGS__, NULL );  \
+    } while(0)
+
+#else
+/* Tracing disabled: the macro expands to an empty statement and its arguments are dropped. */
+#define coreblas_kernel_trace( ... ) do {} while(0)
+
+#endif
+
 END_C_DECLS
 
 #endif /* _coreblas_h_ */
diff --git a/include/chameleon/config.h.in b/include/chameleon/config.h.in
index 15073d1c4..dc878d47b 100644
--- a/include/chameleon/config.h.in
+++ b/include/chameleon/config.h.in
@@ -34,6 +34,8 @@
 
 /* Debug scheduling engine */
 #cmakedefine CHAMELEON_RUNTIME_SYNC
+/* Debug coreblas execution order if not provided by the runtime */
+#cmakedefine CHAMELEON_KERNELS_TRACE
 
 /* Communication engine */
 #cmakedefine CHAMELEON_USE_MPI
@@ -86,4 +88,10 @@
 #define END_C_DECLS   /* empty */
 #endif
 
+#ifdef _MSC_VER /* __chameleon_func__: compiler-specific spelling of the enclosing function's name */
+#define __chameleon_func__ __FUNCTION__
+#else
+#define __chameleon_func__ __func__
+#endif
+
 #endif /* CHAMELEON_CONFIG_H_HAS_BEEN_INCLUDED */
diff --git a/include/chameleon/struct.h b/include/chameleon/struct.h
index ec3ea45be..a43e46d69 100644
--- a/include/chameleon/struct.h
+++ b/include/chameleon/struct.h
@@ -35,6 +35,9 @@ BEGIN_C_DECLS
 #define CHAMELEON_TILE_HMAT     (1 << 2)
 
 typedef struct chameleon_tile_s {
+#if defined(CHAMELEON_KERNELS_TRACE)
+    char  *name;
+#endif
     int8_t format;
     int    m, n, ld;
     void  *mat;
-- 
GitLab