From 1841686627f2d91cfa6234ea0ea947140557c4dc Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Thu, 1 Dec 2016 16:54:43 +0000
Subject: [PATCH] It seems that almost everything is back

---
 CMakeLists.txt                                |  8 +-
 compute/CMakeLists.txt                        | 12 +++
 compute/pzgelqf.c                             |  8 +-
 compute/pzgeqrfrh.c                           |  2 -
 control/common.h                              |  7 --
 control/compute_z.h                           |  2 +-
 control/control.c                             |  3 +-
 control/descriptor.h                          |  1 +
 cudablas/compute/CMakeLists.txt               | 25 +++---
 cudablas/compute/cuda_zgemerge.c              | 79 +++----------------
 cudablas/compute/cuda_ztsmlq.c                |  2 +-
 runtime/starpu/codelets/codelet_zgelqt.c      |  2 +-
 runtime/starpu/codelets/codelet_zgemm.c       |  6 +-
 runtime/starpu/codelets/codelet_zgeqrt.c      |  5 +-
 runtime/starpu/codelets/codelet_zgessm.c      |  4 +-
 .../starpu/codelets/codelet_zgetrf_incpiv.c   |  5 +-
 .../starpu/codelets/codelet_zgetrf_nopiv.c    |  4 +-
 runtime/starpu/codelets/codelet_zhemm.c       |  6 +-
 runtime/starpu/codelets/codelet_zher2k.c      |  6 +-
 runtime/starpu/codelets/codelet_zherk.c       |  6 +-
 runtime/starpu/codelets/codelet_zlauum.c      |  4 +-
 runtime/starpu/codelets/codelet_zpotrf.c      |  4 +-
 runtime/starpu/codelets/codelet_zssssm.c      |  4 +-
 runtime/starpu/codelets/codelet_zsymm.c       |  6 +-
 runtime/starpu/codelets/codelet_zsyr2k.c      |  6 +-
 runtime/starpu/codelets/codelet_zsyrk.c       |  6 +-
 runtime/starpu/codelets/codelet_ztrmm.c       |  6 +-
 runtime/starpu/codelets/codelet_ztrsm.c       |  6 +-
 runtime/starpu/codelets/codelet_ztrtri.c      |  4 +-
 runtime/starpu/codelets/codelet_ztslqt.c      |  4 +-
 runtime/starpu/codelets/codelet_ztsmlq.c      |  4 +-
 runtime/starpu/codelets/codelet_ztsmqr.c      |  5 +-
 runtime/starpu/codelets/codelet_ztsqrt.c      |  5 +-
 runtime/starpu/codelets/codelet_ztstrf.c      |  5 +-
 runtime/starpu/codelets/codelet_zunmlq.c      |  4 +-
 runtime/starpu/codelets/codelet_zunmqr.c      |  4 +-
 runtime/starpu/control/runtime_descriptor.c   | 17 ++--
 runtime/starpu/control/runtime_workspace.c    |  8 +-
 38 files changed, 95 insertions(+), 200 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fb17b06b3..ff0813519 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -161,15 +161,15 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/cmake_modules/")
     endif()
 
     # Use intermediate variable since cmake_dependent_option doesn't have OR conditions
-    set(CHAMELEON_ENABLE_MPI OFF CACHE BOOL "Tells if MPI might be supported by the runtime")
+    set(CHAMELEON_ENABLE_MPI OFF CACHE INTERNAL "Tells if MPI might be supported by the runtime")
     if ( CHAMELEON_SCHED_PARSEC OR CHAMELEON_SCHED_STARPU )
-         set(CHAMELEON_ENABLE_MPI ON)
+         set(CHAMELEON_ENABLE_MPI ON FORCE)
     endif()
 
     # Use intermediate variable since cmake_dependent_option doesn't have OR conditions
-    set(CHAMELEON_ENABLE_CUDA OFF CACHE BOOL "Tells if CUDA might be supported by the runtime")
+    set(CHAMELEON_ENABLE_CUDA OFF CACHE INTERNAL "Tells if CUDA might be supported by the runtime")
     if ( CHAMELEON_SCHED_PARSEC OR CHAMELEON_SCHED_STARPU )
-         set(CHAMELEON_ENABLE_CUDA ON)
+         set(CHAMELEON_ENABLE_CUDA ON FORCE)
     endif()
 
     # Additional options
diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt
index 165e60480..229ff8102 100644
--- a/compute/CMakeLists.txt
+++ b/compute/CMakeLists.txt
@@ -26,6 +26,18 @@
 #
 ###
 
+option(CHAMELEON_COPY_DIAG
+  "This options enables the duplication of the diagonal tiles in some algorithm to avoid anti-dependencies on lower/upper triangular parts (Might be useful to StarPU)" ON)
+mark_as_advanced(CHAMELEON_COPY_DIAG)
+
+if ( CHAMELEON_SCHED_QUARK )
+  # No need for those extra diagonale tiles
+  set( CHAMELEON_COPY_DIAG OFF )
+endif()
+
+if (CHAMELEON_COPY_DIAG)
+  add_definitions(-DCHAMELEON_COPY_DIAG)
+endif()
 
 # Define the list of sources
 # --------------------------
diff --git a/compute/pzgelqf.c b/compute/pzgelqf.c
index 74a07ffba..99c5a88c7 100644
--- a/compute/pzgelqf.c
+++ b/compute/pzgelqf.c
@@ -53,7 +53,7 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T,
     int k, m, n;
     int ldak, ldam;
     int tempkm, tempkn, tempmm, tempnn;
-    int ib;
+    int ib, minMT;
 
     morse = morse_context_self();
     if (sequence->status != MORSE_SUCCESS)
@@ -62,6 +62,12 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T,
 
     ib = MORSE_IB;
 
+    if (A->m > A->n) {
+        minMT = A->nt;
+    } else {
+        minMT = A->mt;
+    }
+
     /*
      * zgelqt = A->nb * (ib+1)
      * zunmlq = A->nb * ib
diff --git a/compute/pzgeqrfrh.c b/compute/pzgeqrfrh.c
index 199539ad5..e4c1ba524 100644
--- a/compute/pzgeqrfrh.c
+++ b/compute/pzgeqrfrh.c
@@ -87,9 +87,7 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
      */
     ws_worker = max( ws_worker, ib * (ib + A->nb) );
     ws_worker = max( ws_worker, ib * A->nb * 2 );
-#endif
 
-#if defined(CHAMELEON_USE_MAGMA)
     /* Host space
      *
      * zgeqrt = ib * (A->nb+3*ib) + A->nb )
diff --git a/control/common.h b/control/common.h
index 1f52e0567..7f9a6beaf 100644
--- a/control/common.h
+++ b/control/common.h
@@ -143,13 +143,6 @@
 #define MORSE_MPI_SIZE    morse->mpi_comm_size
 #endif
 
-/*******************************************************************************
- *  Activate copy of diagonal tile (StarPU only) for some tile algorithms (pz)
- **/
-#if defined(CHAMELEON_SCHED_STARPU)
-#define CHAMELEON_COPY_DIAG
-#endif
-
 /*******************************************************************************
  *  IPT internal define
  **/
diff --git a/control/compute_z.h b/control/compute_z.h
index 78656825a..53ee78e87 100644
--- a/control/compute_z.h
+++ b/control/compute_z.h
@@ -102,7 +102,7 @@ void morse_pzgetrf_incpiv(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, MORSE_seq
 void morse_pzgetrf_nopiv(MORSE_desc_t *A, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pzgetrf_reclap(MORSE_desc_t *A, int *IPIV, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pzgetrf_rectil(MORSE_desc_t *A, int *IPIV, MORSE_sequence_t *sequence, MORSE_request_t *request);
-void morse_pzhbcpy_t2bl(MORSE_enum uplo, MORSE_desc_t *A, MORSE_Complex64_t *AB, int LDAB, MORSE_sequence_t *sequence, MORSE_request_t *request);
+void morse_pzhbcpy_t2bl(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *AB, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pzhegst(MORSE_enum itype, MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request);
 #ifdef COMPLEX
 void morse_pzhemm(MORSE_enum side, MORSE_enum uplo, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_Complex64_t beta, MORSE_desc_t *C, MORSE_sequence_t *sequence, MORSE_request_t *request);
diff --git a/control/control.c b/control/control.c
index 900551a56..8098178bb 100644
--- a/control/control.c
+++ b/control/control.c
@@ -145,12 +145,13 @@ int MORSE_Finalize(void)
 #if defined(CHAMELEON_USE_MAGMA) && !defined(CHAMELEON_SIMULATION)
     magma_finalize();
 #endif
-    morse_context_destroy();
+
 #if defined(CHAMELEON_USE_MPI)
     if (!morse->mpi_outer_init)
         MPI_Finalize();
 #endif
 
+    morse_context_destroy();
     return MORSE_SUCCESS;
 }
 
diff --git a/control/descriptor.h b/control/descriptor.h
index cea048776..09d3825ae 100644
--- a/control/descriptor.h
+++ b/control/descriptor.h
@@ -177,6 +177,7 @@ inline static int morse_getrankof_2d(const MORSE_desc_t *desc, int m, int n)
  **/
 inline static int morse_getrankof_2d_diag(const MORSE_desc_t *desc, int m, int n)
 {
+    assert( n == 0 );
     return (m % desc->p) * desc->q + (m % desc->q);
 }
 
diff --git a/cudablas/compute/CMakeLists.txt b/cudablas/compute/CMakeLists.txt
index 91550a049..7ef91aee0 100644
--- a/cudablas/compute/CMakeLists.txt
+++ b/cudablas/compute/CMakeLists.txt
@@ -27,33 +27,40 @@
 # ------------------------------------------------------
 set(CUDABLAS_SRCS_GENERATED "")
 set(ZSRC
-    cuda_zgelqt.c
     cuda_zgemerge.c
     cuda_zgemm.c
-    cuda_zgeqrt.c
-    cuda_zgessm.c
-    cuda_zgetrf.c
     cuda_zhemm.c
     cuda_zher2k.c
     cuda_zherk.c
-    cuda_zlauum.c
     cuda_zparfb.c
-    cuda_zpotrf.c
-    cuda_zssssm.c
     cuda_zsymm.c
     cuda_zsyr2k.c
     cuda_zsyrk.c
     cuda_ztrmm.c
     cuda_ztrsm.c
-    cuda_ztrtri.c
-    cuda_ztslqt.c
     cuda_ztsmlq.c
     cuda_ztsmqr.c
+    )
+
+if( CHAMELEON_USE_MAGMA )
+  set(ZSRC
+    ${ZSRC}
+    cuda_zgelqt.c
+    cuda_zgeqrt.c
+    cuda_zgessm.c
+    cuda_zgetrf.c
+    cuda_zlauum.c
+    cuda_zparfb.c
+    cuda_zpotrf.c
+    cuda_zssssm.c
+    cuda_ztrtri.c
+    cuda_ztslqt.c
     cuda_ztsqrt.c
     cuda_ztstrf.c
     cuda_zunmlqt.c
     cuda_zunmqrt.c
     )
+endif()
 
 precisions_rules_py(CUDABLAS_SRCS_GENERATED "${ZSRC}"
                     PRECISIONS "${CHAMELEON_PRECISION}")
diff --git a/cudablas/compute/cuda_zgemerge.c b/cudablas/compute/cuda_zgemerge.c
index 76b0771ca..f04561eaf 100644
--- a/cudablas/compute/cuda_zgemerge.c
+++ b/cudablas/compute/cuda_zgemerge.c
@@ -24,28 +24,15 @@
  **/
 #include "cudablas/include/cudablas.h"
 
-#if defined(CHAMELEON_USE_MAGMA)
-#if defined(CHAMELEON_USE_CUBLAS_V2)
-int CUDA_zgemerge( MORSE_enum side, MORSE_enum diag,
-		int M, int N, cuDoubleComplex *A, int LDA,
-		cuDoubleComplex *B, int LDB, CUBLAS_STREAM_PARAM)
+int
+CUDA_zgemerge( MORSE_enum side, MORSE_enum diag,
+               int M, int N,
+               cuDoubleComplex *A, int LDA,
+               cuDoubleComplex *B, int LDB,
+               CUBLAS_STREAM_PARAM)
 {
     int i, j;
-    magmaDoubleComplex *cola, *colb;
-    cublasHandle_t handle;
-    cublasStatus_t stat;
-
-    stat = cublasCreate(&handle);
-    if (stat != CUBLAS_STATUS_SUCCESS) {
-        printf ("CUBLAS initialization failed\n");
-        assert( stat == CUBLAS_STATUS_SUCCESS );
-    }
-
-    stat = cublasSetStream(handle, stream);
-    if (stat != CUBLAS_STATUS_SUCCESS) {
-        printf ("cublasSetStream failed\n");
-        assert( stat == CUBLAS_STATUS_SUCCESS );
-    }
+    cuDoubleComplex *cola, *colb;
 
     if (M < 0) {
         return -1;
@@ -60,55 +47,9 @@ int CUDA_zgemerge( MORSE_enum side, MORSE_enum diag,
         return -7;
     }
 
-    if (side == MagmaLeft){
-        for(i=0; i<N; i++){
-            cola = A + i*LDA;
-            colb = B + i*LDB;
-//            cublasZcopy(handle, i+1, cola, 1, colb, 1);
-            cudaMemcpyAsync(colb , cola,
-                            (i+1)*sizeof(cuDoubleComplex),
-                            cudaMemcpyDeviceToDevice, stream);
-        }
-    }else{
-        for(i=0; i<N; i++){
-            cola = A + i*LDA;
-            colb = B + i*LDB;
-//            cublasZcopy(handle, M-i, cola + i, 1, colb + i, 1);
-            cudaMemcpyAsync(colb+i , cola+i,
-                            (M-i)*sizeof(cuDoubleComplex),
-                            cudaMemcpyDeviceToDevice, stream);
-        }
-    }
-
-    cublasDestroy(handle);
-
-    return MORSE_SUCCESS;
-}
-#else /* CHAMELEON_USE_CUBLAS_V2 */
-int CUDA_zgemerge(
-        magma_side_t side, magma_diag_t diag,
-        magma_int_t M, magma_int_t N,
-        magmaDoubleComplex *A, magma_int_t LDA,
-        magmaDoubleComplex *B, magma_int_t LDB,
-        CUstream stream)
-{
-    int i, j;
-    magmaDoubleComplex *cola, *colb;
-
-    if (M < 0) {
-        return -1;
-    }
-    if (N < 0) {
-        return -2;
-    }
-    if ( (LDA < max(1,M)) && (M > 0) ) {
-        return -5;
-    }
-    if ( (LDB < max(1,M)) && (M > 0) ) {
-        return -7;
-    }
+    CUBLAS_GET_STREAM;
 
-    if (side == MagmaLeft){
+    if (side == MorseLeft){
         for(i=0; i<N; i++){
             cola = A + i*LDA;
             colb = B + i*LDB;
@@ -128,5 +69,3 @@ int CUDA_zgemerge(
 
     return MORSE_SUCCESS;
 }
-#endif /* CHAMELEON_USE_CUBLAS_V2 */
-#endif
diff --git a/cudablas/compute/cuda_ztsmlq.c b/cudablas/compute/cuda_ztsmlq.c
index 6c525138a..0a44d5800 100644
--- a/cudablas/compute/cuda_ztsmlq.c
+++ b/cudablas/compute/cuda_ztsmlq.c
@@ -55,7 +55,7 @@ int CUDA_ztsmlq(
         NW = IB;
     }
     else {
-        NW = M1;
+        NW = N1;
     }
 
     if ((trans != MorseNoTrans) && (trans != MorseConjTrans)) {
diff --git a/runtime/starpu/codelets/codelet_zgelqt.c b/runtime/starpu/codelets/codelet_zgelqt.c
index 987b9264a..8e7dbdcba 100644
--- a/runtime/starpu/codelets/codelet_zgelqt.c
+++ b/runtime/starpu/codelets/codelet_zgelqt.c
@@ -194,8 +194,8 @@ static void cl_zgelqt_cuda_func(void *descr[], void *cl_arg)
 
     cudaThreadSynchronize();
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif //defined(CHAMELEON_USE_MAGMA)
+#endif //!defined(CHAMELEON_SIMULATION)
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c
index da14986cd..1ff16ebf7 100644
--- a/runtime/starpu/codelets/codelet_zgemm.c
+++ b/runtime/starpu/codelets/codelet_zgemm.c
@@ -129,11 +129,9 @@ static void cl_zgemm_cpu_func(void *descr[], void *cl_arg)
         B, ldb,
         beta, C, ldc);
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 
 #ifdef CHAMELEON_USE_CUDA
 #if defined(CHAMELEON_USE_CUBLAS_V2)
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zgemm_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum transA;
@@ -172,9 +170,7 @@ static void cl_zgemm_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #else /* CHAMELEON_USE_CUBLAS_V2 */
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zgemm_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum transA;
@@ -213,9 +209,9 @@ static void cl_zgemm_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif /* CHAMELEON_USE_CUBLAS_V2 */
 #endif /* CHAMELEON_USE_CUDA */
+#endif //!defined(CHAMELEON_SIMULATION)
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_zgeqrt.c b/runtime/starpu/codelets/codelet_zgeqrt.c
index 88675d977..c375bf281 100644
--- a/runtime/starpu/codelets/codelet_zgeqrt.c
+++ b/runtime/starpu/codelets/codelet_zgeqrt.c
@@ -151,10 +151,9 @@ static void cl_zgeqrt_cpu_func(void *descr[], void *cl_arg)
     WORK = TAU + max( m, n );
     CORE_zgeqrt(m, n, ib, A, lda, T, ldt, TAU, WORK);
 }
-#endif //!defined(CHAMELEON_SIMULATION)
+
 
 #if defined(CHAMELEON_USE_MAGMA)
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zgeqrt_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_starpu_ws_t *h_work;
@@ -196,8 +195,8 @@ static void cl_zgeqrt_cuda_func(void *descr[], void *cl_arg)
 
     cudaThreadSynchronize();
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif //defined(CHAMELEON_USE_MAGMA)
+#endif //!defined(CHAMELEON_SIMULATION)
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_zgessm.c b/runtime/starpu/codelets/codelet_zgessm.c
index 3c7c1f052..a32507894 100644
--- a/runtime/starpu/codelets/codelet_zgessm.c
+++ b/runtime/starpu/codelets/codelet_zgessm.c
@@ -133,10 +133,8 @@ static void cl_zgessm_cpu_func(void *descr[], void *cl_arg)
     starpu_codelet_unpack_args(cl_arg, &m, &n, &k, &ib, &IPIV, &ldl, &ldd, &lda);
     CORE_zgessm(m, n, k, ib, IPIV, D, ldd, A, lda);
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 
 #if defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU)
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zgessm_cuda_func(void *descr[], void *cl_arg)
 {
     int m;
@@ -163,8 +161,8 @@ static void cl_zgessm_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif //defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU)
+#endif //!defined(CHAMELEON_SIMULATION)
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_zgetrf_incpiv.c b/runtime/starpu/codelets/codelet_zgetrf_incpiv.c
index ad7b32dcf..5b0505322 100644
--- a/runtime/starpu/codelets/codelet_zgetrf_incpiv.c
+++ b/runtime/starpu/codelets/codelet_zgetrf_incpiv.c
@@ -171,13 +171,12 @@ static void cl_zgetrf_incpiv_cpu_func(void *descr[], void *cl_arg)
     }
 #endif
 }
-#endif //!defined(CHAMELEON_SIMULATION)
+
 
 /*
  * Codelet GPU
  */
 #if defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU)
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zgetrf_incpiv_cuda_func(void *descr[], void *cl_arg)
 {
     int m;
@@ -228,8 +227,8 @@ static void cl_zgetrf_incpiv_cuda_func(void *descr[], void *cl_arg)
 
     cudaThreadSynchronize();
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif //defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU)
+#endif //!defined(CHAMELEON_SIMULATION)
 
 
 /*
diff --git a/runtime/starpu/codelets/codelet_zgetrf_nopiv.c b/runtime/starpu/codelets/codelet_zgetrf_nopiv.c
index 275ff280a..fcfd96fc3 100644
--- a/runtime/starpu/codelets/codelet_zgetrf_nopiv.c
+++ b/runtime/starpu/codelets/codelet_zgetrf_nopiv.c
@@ -119,13 +119,11 @@ static void cl_zgetrf_nopiv_cpu_func(void *descr[], void *cl_arg)
     starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &iinfo);
     CORE_zgetrf_nopiv(m, n, ib, A, lda, &info);
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 
 /*
  * Codelet GPU
  */
 #if defined(CHAMELEON_USE_MAGMA)
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zgetrf_nopiv_cuda_func(void *descr[], void *cl_arg)
 {
     int m;
@@ -142,8 +140,8 @@ static void cl_zgetrf_nopiv_cuda_func(void *descr[], void *cl_arg)
     CUDA_zgetrf_nopiv( m, n, dA, lda, &info );
     cudaThreadSynchronize();
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif
+#endif //!defined(CHAMELEON_SIMULATION)
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c
index 9accae112..705c3341f 100644
--- a/runtime/starpu/codelets/codelet_zhemm.c
+++ b/runtime/starpu/codelets/codelet_zhemm.c
@@ -99,11 +99,9 @@ static void cl_zhemm_cpu_func(void *descr[], void *cl_arg)
         B, LDB,
         beta, C, LDC);
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 
 #ifdef CHAMELEON_USE_CUDA
 #if defined(CHAMELEON_USE_CUBLAS_V2)
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zhemm_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum side;
@@ -143,9 +141,7 @@ static void cl_zhemm_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #else /* CHAMELEON_USE_CUBLAS_V2 */
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zhemm_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum side;
@@ -183,9 +179,9 @@ static void cl_zhemm_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif /* CHAMELEON_USE_CUBLAS_V2 */
 #endif /* CHAMELEON_USE_CUDA */
+#endif //!defined(CHAMELEON_SIMULATION)
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_zher2k.c b/runtime/starpu/codelets/codelet_zher2k.c
index 794c474ea..03e9babea 100644
--- a/runtime/starpu/codelets/codelet_zher2k.c
+++ b/runtime/starpu/codelets/codelet_zher2k.c
@@ -96,11 +96,9 @@ static void cl_zher2k_cpu_func(void *descr[], void *cl_arg)
     CORE_zher2k(uplo, trans,
                 n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 
 #ifdef CHAMELEON_USE_CUDA
 #if defined(CHAMELEON_USE_CUBLAS_V2)
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zher2k_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum uplo;
@@ -135,9 +133,7 @@ static void cl_zher2k_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #else /* CHAMELEON_USE_CUBLAS_V2 */
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zher2k_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum uplo;
@@ -171,9 +167,9 @@ static void cl_zher2k_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif /* CHAMELEON_USE_CUBLAS_V2 */
 #endif /* CHAMELEON_USE_CUDA */
+#endif //!defined(CHAMELEON_SIMULATION)
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c
index 754b61ff8..385645231 100644
--- a/runtime/starpu/codelets/codelet_zherk.c
+++ b/runtime/starpu/codelets/codelet_zherk.c
@@ -90,11 +90,9 @@ static void cl_zherk_cpu_func(void *descr[], void *cl_arg)
         alpha, A, lda,
         beta, C, ldc);
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 
 #ifdef CHAMELEON_USE_CUDA
 #if defined(CHAMELEON_USE_CUBLAS_V2)
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zherk_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum uplo;
@@ -132,9 +130,7 @@ static void cl_zherk_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #else /* CHAMELEON_USE_CUBLAS_V2 */
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zherk_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum uplo;
@@ -168,9 +164,9 @@ static void cl_zherk_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif /* CHAMELEON_USE_CUBLAS_V2 */
 #endif /* CHAMELEON_USE_CUDA */
+#endif //!defined(CHAMELEON_SIMULATION)
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_zlauum.c b/runtime/starpu/codelets/codelet_zlauum.c
index 27a888b28..a5272248b 100644
--- a/runtime/starpu/codelets/codelet_zlauum.c
+++ b/runtime/starpu/codelets/codelet_zlauum.c
@@ -73,10 +73,8 @@ static void cl_zlauum_cpu_func(void *descr[], void *cl_arg)
     starpu_codelet_unpack_args(cl_arg, &uplo, &N, &LDA);
     CORE_zlauum(uplo, N, A, LDA);
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 
 #if defined(CHAMELEON_USE_MAGMA)
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zlauum_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum uplo;
@@ -91,8 +89,8 @@ static void cl_zlauum_cuda_func(void *descr[], void *cl_arg)
     cudaThreadSynchronize();
     return;
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif
+#endif //!defined(CHAMELEON_SIMULATION)
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_zpotrf.c b/runtime/starpu/codelets/codelet_zpotrf.c
index 13aedc1fb..d0a658443 100644
--- a/runtime/starpu/codelets/codelet_zpotrf.c
+++ b/runtime/starpu/codelets/codelet_zpotrf.c
@@ -79,10 +79,8 @@ static void cl_zpotrf_cpu_func(void *descr[], void *cl_arg)
     starpu_codelet_unpack_args(cl_arg, &uplo, &n, &lda, &iinfo);
     CORE_zpotrf(uplo, n, A, lda, &info);
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 
 #ifdef CHAMELEON_USE_MAGMA
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zpotrf_cuda_func(void *descr[], void *cl_arg)
 {
     cudaStream_t stream[2], currentt_stream;
@@ -115,8 +113,8 @@ static void cl_zpotrf_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif
+#endif //!defined(CHAMELEON_SIMULATION)
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_zssssm.c b/runtime/starpu/codelets/codelet_zssssm.c
index 0bd1944b9..cf766396b 100644
--- a/runtime/starpu/codelets/codelet_zssssm.c
+++ b/runtime/starpu/codelets/codelet_zssssm.c
@@ -170,10 +170,8 @@ static void cl_zssssm_cpu_func(void *descr[], void *cl_arg)
     starpu_codelet_unpack_args(cl_arg, &m1, &n1, &m2, &n2, &k, &ib, &lda1, &lda2, &ldl1, &ldl2, &IPIV);
     CORE_zssssm(m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, L1, ldl1, L2, ldl2, IPIV);
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 
 #if defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU)
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zssssm_cuda_func(void *descr[], void *cl_arg)
 {
     int m1;
@@ -213,8 +211,8 @@ static void cl_zssssm_cuda_func(void *descr[], void *cl_arg)
 
     cudaThreadSynchronize();
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif
+#endif //!defined(CHAMELEON_SIMULATION)
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_zsymm.c b/runtime/starpu/codelets/codelet_zsymm.c
index 02be8a966..c193f200d 100644
--- a/runtime/starpu/codelets/codelet_zsymm.c
+++ b/runtime/starpu/codelets/codelet_zsymm.c
@@ -99,11 +99,9 @@ static void cl_zsymm_cpu_func(void *descr[], void *cl_arg)
         B, LDB,
         beta, C, LDC);
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 
 #ifdef CHAMELEON_USE_CUDA
 #if defined(CHAMELEON_USE_CUBLAS_V2)
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zsymm_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum side;
@@ -141,9 +139,7 @@ static void cl_zsymm_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #else /* CHAMELEON_USE_CUBLAS_V2 */
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zsymm_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum side;
@@ -181,9 +177,9 @@ static void cl_zsymm_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif /* CHAMELEON_USE_CUBLAS_V2 */
 #endif /* CHAMELEON_USE_CUDA */
+#endif //!defined(CHAMELEON_SIMULATION)
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_zsyr2k.c b/runtime/starpu/codelets/codelet_zsyr2k.c
index b07e5456d..e07c22f0d 100644
--- a/runtime/starpu/codelets/codelet_zsyr2k.c
+++ b/runtime/starpu/codelets/codelet_zsyr2k.c
@@ -96,11 +96,9 @@ static void cl_zsyr2k_cpu_func(void *descr[], void *cl_arg)
     CORE_zsyr2k(uplo, trans,
                  n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 
 #ifdef CHAMELEON_USE_CUDA
 #if defined(CHAMELEON_USE_CUBLAS_V2)
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum uplo;
@@ -135,9 +133,7 @@ static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #else /* CHAMELEON_USE_CUBLAS_V2 */
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum uplo;
@@ -171,9 +167,9 @@ static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif /* CHAMELEON_USE_CUBLAS_V2 */
 #endif /* CHAMELEON_USE_CUDA */
+#endif //!defined(CHAMELEON_SIMULATION)
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c
index 6d29911c1..a4c12e551 100644
--- a/runtime/starpu/codelets/codelet_zsyrk.c
+++ b/runtime/starpu/codelets/codelet_zsyrk.c
@@ -91,11 +91,9 @@ static void cl_zsyrk_cpu_func(void *descr[], void *cl_arg)
         alpha, A, lda,
         beta, C, ldc);
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 
 #ifdef CHAMELEON_USE_CUDA
 #if defined(CHAMELEON_USE_CUBLAS_V2)
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum uplo;
@@ -125,9 +123,7 @@ static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #else /* CHAMELEON_USE_CUBLAS_V2 */
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum uplo;
@@ -161,9 +157,9 @@ static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif /* CHAMELEON_USE_CUBLAS_V2 */
 #endif /* CHAMELEON_USE_CUDA */
+#endif //!defined(CHAMELEON_SIMULATION)
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c
index f353f6827..bd9787389 100644
--- a/runtime/starpu/codelets/codelet_ztrmm.c
+++ b/runtime/starpu/codelets/codelet_ztrmm.c
@@ -94,11 +94,9 @@ static void cl_ztrmm_cpu_func(void *descr[], void *cl_arg)
         alpha, A, LDA,
         B, LDB);
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 
 #ifdef CHAMELEON_USE_CUDA
 #if defined(CHAMELEON_USE_CUBLAS_V2)
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum side;
@@ -133,9 +131,7 @@ static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #else /* CHAMELEON_USE_CUBLAS_V2 */
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum side;
@@ -171,9 +167,9 @@ static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif /* CHAMELEON_USE_CUBLAS_V2 */
 #endif /* CHAMELEON_USE_CUDA */
+#endif //!defined(CHAMELEON_SIMULATION)
 
 
 /*
diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c
index 8796ca621..9d9138990 100644
--- a/runtime/starpu/codelets/codelet_ztrsm.c
+++ b/runtime/starpu/codelets/codelet_ztrsm.c
@@ -116,11 +116,9 @@ static void cl_ztrsm_cpu_func(void *descr[], void *cl_arg)
         alpha, A, lda,
         B, ldb);
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 
 #ifdef CHAMELEON_USE_CUDA
 #if defined(CHAMELEON_USE_CUBLAS_V2)
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum side;
@@ -156,9 +154,7 @@ static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #else /* CHAMELEON_USE_CUBLAS_V2 */
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum side;
@@ -193,9 +189,9 @@ static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif /* CHAMELEON_USE_CUBLAS_V2 */
 #endif /* CHAMELEON_USE_CUDA */
+#endif //!defined(CHAMELEON_SIMULATION)
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_ztrtri.c b/runtime/starpu/codelets/codelet_ztrtri.c
index 516a4db4e..f5bfbfe4d 100644
--- a/runtime/starpu/codelets/codelet_ztrtri.c
+++ b/runtime/starpu/codelets/codelet_ztrtri.c
@@ -81,10 +81,8 @@ static void cl_ztrtri_cpu_func(void *descr[], void *cl_arg)
     starpu_codelet_unpack_args(cl_arg, &uplo, &diag, &N, &LDA, &iinfo);
     CORE_ztrtri(uplo, diag, N, A, LDA, &info);
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 
 #if defined(CHAMELEON_USE_MAGMA)
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_ztrtri_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum uplo;
@@ -101,8 +99,8 @@ static void cl_ztrtri_cuda_func(void *descr[], void *cl_arg)
     cudaThreadSynchronize();
     return;
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif
+#endif //!defined(CHAMELEON_SIMULATION)
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_ztslqt.c b/runtime/starpu/codelets/codelet_ztslqt.c
index a9b1fed72..e97dad0d6 100644
--- a/runtime/starpu/codelets/codelet_ztslqt.c
+++ b/runtime/starpu/codelets/codelet_ztslqt.c
@@ -170,10 +170,8 @@ static void cl_ztslqt_cpu_func(void *descr[], void *cl_arg)
     WORK = TAU + max( m, n );
     CORE_ztslqt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 
 #if defined(CHAMELEON_USE_MAGMA) && 0
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_starpu_ws_t *h_work;
@@ -213,8 +211,8 @@ static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg)
 
     cudaThreadSynchronize();
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif
+#endif //!defined(CHAMELEON_SIMULATION)
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_ztsmlq.c b/runtime/starpu/codelets/codelet_ztsmlq.c
index d582c0d99..cba63b524 100644
--- a/runtime/starpu/codelets/codelet_ztsmlq.c
+++ b/runtime/starpu/codelets/codelet_ztsmlq.c
@@ -212,10 +212,8 @@ static void cl_ztsmlq_cpu_func(void *descr[], void *cl_arg)
     CORE_ztsmlq(side, trans, m1, n1, m2, n2, k, ib,
                 A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 
 #if defined(CHAMELEON_USE_CUDA)
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_ztsmlq_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum side;
@@ -262,8 +260,8 @@ static void cl_ztsmlq_cuda_func(void *descr[], void *cl_arg)
     cudaStreamSynchronize( stream );
 #endif
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif /* defined(CHAMELEON_USE_CUDA) */
+#endif //!defined(CHAMELEON_SIMULATION)
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_ztsmqr.c b/runtime/starpu/codelets/codelet_ztsmqr.c
index 9f9528d7b..d83d1d2f4 100644
--- a/runtime/starpu/codelets/codelet_ztsmqr.c
+++ b/runtime/starpu/codelets/codelet_ztsmqr.c
@@ -242,10 +242,9 @@ static void cl_ztsmqr_cpu_func(void *descr[], void *cl_arg)
     CORE_ztsmqr(side, trans, m1, n1, m2, n2, k, ib,
                 A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
 }
-#endif //!defined(CHAMELEON_SIMULATION)
+
 
 #if defined(CHAMELEON_USE_CUDA)
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum side;
@@ -293,8 +292,8 @@ static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg)
     cudaStreamSynchronize( stream );
 #endif
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif /* defined(CHAMELEON_USE_CUDA) */
+#endif //!defined(CHAMELEON_SIMULATION)
 
 
 /*
diff --git a/runtime/starpu/codelets/codelet_ztsqrt.c b/runtime/starpu/codelets/codelet_ztsqrt.c
index 9fdc44931..7cbff9edd 100644
--- a/runtime/starpu/codelets/codelet_ztsqrt.c
+++ b/runtime/starpu/codelets/codelet_ztsqrt.c
@@ -161,10 +161,8 @@ static void cl_ztsqrt_cpu_func(void *descr[], void *cl_arg)
     WORK = TAU + max( m, n );
     CORE_ztsqrt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 
 #if defined(CHAMELEON_USE_MAGMA)
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_ztsqrt_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_starpu_ws_t *h_work;
@@ -203,8 +201,9 @@ static void cl_ztsqrt_cuda_func(void *descr[], void *cl_arg)
             h_W, d_W, stream);
     cudaThreadSynchronize();
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif
+#endif //!defined(CHAMELEON_SIMULATION)
+
 /*
  * Codelet definition
  */
diff --git a/runtime/starpu/codelets/codelet_ztstrf.c b/runtime/starpu/codelets/codelet_ztstrf.c
index cc83c63e8..4ce5e815c 100644
--- a/runtime/starpu/codelets/codelet_ztstrf.c
+++ b/runtime/starpu/codelets/codelet_ztstrf.c
@@ -195,14 +195,13 @@ static void cl_ztstrf_cpu_func(void *descr[], void *cl_arg)
     }
 #endif
 }
-#endif //!defined(CHAMELEON_SIMULATION)
+
 
 /*
  * Codelet GPU
  */
 /* TODO/WARNING: tstrf is not working on GPU for now */
 #if defined(CHAMELEON_USE_MAGMA) && 0
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_ztstrf_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_starpu_ws_t *d_work;
@@ -260,8 +259,8 @@ static void cl_ztstrf_cuda_func(void *descr[], void *cl_arg)
 
     cudaThreadSynchronize();
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif
+#endif //!defined(CHAMELEON_SIMULATION)
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_zunmlq.c b/runtime/starpu/codelets/codelet_zunmlq.c
index 4f03d8e70..fac849c0d 100644
--- a/runtime/starpu/codelets/codelet_zunmlq.c
+++ b/runtime/starpu/codelets/codelet_zunmlq.c
@@ -184,10 +184,8 @@ static void cl_zunmlq_cpu_func(void *descr[], void *cl_arg)
     CORE_zunmlq(side, trans, m, n, k, ib,
                 A, lda, T, ldt, C, ldc, WORK, ldwork);
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 
 #if defined(CHAMELEON_USE_MAGMA)
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_starpu_ws_t *d_work;
@@ -215,8 +213,8 @@ static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg)
 
     cudaThreadSynchronize();
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif
+#endif //!defined(CHAMELEON_SIMULATION)
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_zunmqr.c b/runtime/starpu/codelets/codelet_zunmqr.c
index 306e4dc14..b4c14f7b8 100644
--- a/runtime/starpu/codelets/codelet_zunmqr.c
+++ b/runtime/starpu/codelets/codelet_zunmqr.c
@@ -210,10 +210,8 @@ static void cl_zunmqr_cpu_func(void *descr[], void *cl_arg)
     CORE_zunmqr(side, trans, m, n, k, ib,
                 A, lda, T, ldt, C, ldc, WORK, ldwork);
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 
 #if defined(CHAMELEON_USE_MAGMA)
-#if !defined(CHAMELEON_SIMULATION)
 static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_starpu_ws_t *d_work;
@@ -241,8 +239,8 @@ static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg)
 
     cudaThreadSynchronize();
 }
-#endif //!defined(CHAMELEON_SIMULATION)
 #endif
+#endif //!defined(CHAMELEON_SIMULATION)
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/control/runtime_descriptor.c b/runtime/starpu/control/runtime_descriptor.c
index 0e86da381..d6a2c670b 100644
--- a/runtime/starpu/control/runtime_descriptor.c
+++ b/runtime/starpu/control/runtime_descriptor.c
@@ -295,37 +295,40 @@ int RUNTIME_desc_getoncpu( MORSE_desc_t *desc )
 
 void *RUNTIME_desc_getaddr( const MORSE_desc_t *desc, int m, int n )
 {
+    int64_t im = m + (desc->i / desc->mb);
+    int64_t jn = n + (desc->j / desc->nb);
+
     starpu_data_handle_t *ptrtile = (starpu_data_handle_t*)(desc->schedopt);
-    ptrtile += ((int64_t)(desc->lmt) * (int64_t)n + (int64_t)m);
+    ptrtile += ((int64_t)(desc->lmt) * (int64_t)jn + (int64_t)im);
 
     if (*ptrtile == NULL) {
         int64_t eltsze = MORSE_Element_Size(desc->dtyp);
         int myrank = desc->myrank;
         int owner  = desc->get_rankof( desc, m, n );
-        int tempmm = (m == desc->lmt-1) ? (desc->lm - m * desc->mb) : desc->mb;
-        int tempnn = (n == desc->lnt-1) ? (desc->ln - n * desc->nb) : desc->nb;
+        int tempmm = (im == desc->lmt-1) ? (desc->lm - im * desc->mb) : desc->mb;
+        int tempnn = (jn == desc->lnt-1) ? (desc->ln - jn * desc->nb) : desc->nb;
 
         if ( myrank == owner ) {
             if ( desc->get_blkaddr(desc, m, n) == (void*)NULL ) {
                 starpu_matrix_data_register(ptrtile, -1,
                                             (uintptr_t) NULL,
-                                            BLKLDD(desc, m), tempmm, tempnn, eltsze);
+                                            BLKLDD(desc, im), tempmm, tempnn, eltsze);
             }
             else {
                 starpu_matrix_data_register(ptrtile, STARPU_MAIN_RAM,
                                             (uintptr_t)desc->get_blkaddr(desc, m, n),
-                                            BLKLDD(desc, m), tempmm, tempnn, eltsze);
+                                            BLKLDD(desc, im), tempmm, tempnn, eltsze);
             }
         }
         else {
             starpu_matrix_data_register(ptrtile, -1,
                                         (uintptr_t) NULL,
-                                        BLKLDD(desc, m), tempmm, tempnn, eltsze);
+                                        BLKLDD(desc, im), tempmm, tempnn, eltsze);
         }
 
 #if defined(CHAMELEON_USE_MPI)
         {
-            int64_t block_ind = desc->lmt * n + m;
+            int64_t block_ind = desc->lmt * jn + im;
             starpu_mpi_data_register(*ptrtile, (desc->id << tag_sep) | (block_ind), owner);
         }
 #endif /* defined(CHAMELEON_USE_MPI) */
diff --git a/runtime/starpu/control/runtime_workspace.c b/runtime/starpu/control/runtime_workspace.c
index 711652562..d67f95a82 100644
--- a/runtime/starpu/control/runtime_workspace.c
+++ b/runtime/starpu/control/runtime_workspace.c
@@ -52,14 +52,12 @@ static void RUNTIME_allocate_workspace_on_workers(void *arg)
         }
     }
     else
-    {
 #endif
+    {
         /* This buffer should only be used within the CPU kernel, so
          * there is no point in using pinned memory here. */
         workspace->workspaces[id] = malloc(workspace->size);
-#if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
     }
-#endif
 
     assert(workspace->workspaces[id]);
 }
@@ -87,12 +85,10 @@ static void RUNTIME_free_workspace_on_workers(void *arg)
         }
     }
     else
-    {
 #endif
+    {
         free(workspace->workspaces[id]);
-#if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
     }
-#endif
 
     workspace->workspaces[id] = NULL;
 }
-- 
GitLab