From 2c0e30038c80f178e9855fc4a7f2d9364c4c1ab9 Mon Sep 17 00:00:00 2001
From: Philippe Virouleau <philippe.44@gmail.com>
Date: Fri, 16 Nov 2018 15:06:01 +0100
Subject: [PATCH] Various fixes for QR.

  - only use ws_worker for scratch in codelet
  - add taskwait before freeing the scratch
---
 control/control.c                         |  6 ++++++
 include/chameleon.h                       | 12 +++++++++---
 runtime/openmp/codelets/codelet_zgelqt.c  |  4 ++--
 runtime/openmp/codelets/codelet_zgeqrt.c  |  4 ++--
 runtime/openmp/codelets/codelet_zherfb.c  |  4 ++--
 runtime/openmp/codelets/codelet_zlacpy.c  |  2 +-
 runtime/openmp/codelets/codelet_zlantr.c  |  2 +-
 runtime/openmp/codelets/codelet_zlaset.c  |  2 +-
 runtime/openmp/codelets/codelet_zplrnt.c  |  2 +-
 runtime/openmp/codelets/codelet_ztplqt.c  |  4 ++--
 runtime/openmp/codelets/codelet_ztpmlqt.c |  2 +-
 runtime/openmp/codelets/codelet_ztpmqrt.c |  2 +-
 runtime/openmp/codelets/codelet_ztpqrt.c  |  2 +-
 runtime/openmp/codelets/codelet_ztslqt.c  |  4 ++--
 runtime/openmp/codelets/codelet_ztsqrt.c  |  4 ++--
 runtime/openmp/codelets/codelet_zttlqt.c  |  4 ++--
 runtime/openmp/codelets/codelet_zttqrt.c  |  4 ++--
 runtime/openmp/control/runtime_options.c  | 13 ++++---------
 timing/timing.c                           | 12 +++++-------
 19 files changed, 47 insertions(+), 42 deletions(-)

diff --git a/control/control.c b/control/control.c
index c0657726e..b34938963 100644
--- a/control/control.c
+++ b/control/control.c
@@ -50,6 +50,12 @@
  *          \retval CHAMELEON_SUCCESS successful exit
  *
  */
+#ifdef CHAMELEON_Init
+#undef CHAMELEON_Init
+#endif
+#ifdef CHAMELEON_Finalize
+#undef CHAMELEON_Finalize
+#endif
 int CHAMELEON_Init(int cores, int gpus)
 {
     return CHAMELEON_InitPar(cores, gpus, -1);
diff --git a/include/chameleon.h b/include/chameleon.h
index a8afc08c8..9166a88e1 100644
--- a/include/chameleon.h
+++ b/include/chameleon.h
@@ -137,12 +137,18 @@ int CHAMELEON_Sequence_Wait    (RUNTIME_sequence_t *sequence);
 
 #if defined(CHAMELEON_SCHED_OPENMP)
 #define CHAMELEON_INIT(nworkers, ncudas)\
-  CHAMELEON_Init(nworkers, ncudas);\
-  _Pragma("omp parallel")\
-  _Pragma("omp master")
+    CHAMELEON_Init(nworkers, ncudas);\
+    _Pragma("omp parallel")\
+    _Pragma("omp master")\
+    {
+#define CHAMELEON_FINALIZE()\
+    }\
+    CHAMELEON_Finalize();
 #else
 #define CHAMELEON_INIT(nworkers, ncudas)\
   CHAMELEON_Init(nworkers, ncudas);
+#define CHAMELEON_FINALIZE()\
+  CHAMELEON_Finalize();
 #endif
 
 END_C_DECLS
diff --git a/runtime/openmp/codelets/codelet_zgelqt.c b/runtime/openmp/codelets/codelet_zgelqt.c
index 8c69936b7..1d284caaa 100644
--- a/runtime/openmp/codelets/codelet_zgelqt.c
+++ b/runtime/openmp/codelets/codelet_zgelqt.c
@@ -98,7 +98,7 @@ void INSERT_TASK_zgelqt(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *TAU = options->ws_worker;
-    CHAMELEON_Complex64_t *work = options->ws_host;
-#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, work, TAU) depend(inout:ptrA[0:Am*An]) depend(inout:ptrT[0:Tm*Tn])
+    CHAMELEON_Complex64_t *work = TAU + chameleon_max( m, n );
+#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, work, TAU) depend(inout:ptrA[0]) depend(inout:ptrT[0])
     CORE_zgelqt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work);
 }
diff --git a/runtime/openmp/codelets/codelet_zgeqrt.c b/runtime/openmp/codelets/codelet_zgeqrt.c
index 0337e0de9..aea7735da 100644
--- a/runtime/openmp/codelets/codelet_zgeqrt.c
+++ b/runtime/openmp/codelets/codelet_zgeqrt.c
@@ -99,7 +99,7 @@ void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *TAU = options->ws_worker;
-    CHAMELEON_Complex64_t *work = options->ws_host;
-#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, work, TAU) depend(inout:ptrA[0:Am*An]) depend(inout:ptrT[0:Tm*Tn])
+    CHAMELEON_Complex64_t *work = TAU + chameleon_max(m, n);
+#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, work, TAU) depend(inout:ptrA[0]) depend(inout:ptrT[0])
     CORE_zgeqrt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work);
 }
diff --git a/runtime/openmp/codelets/codelet_zherfb.c b/runtime/openmp/codelets/codelet_zherfb.c
index 1531406b2..2890651e8 100644
--- a/runtime/openmp/codelets/codelet_zherfb.c
+++ b/runtime/openmp/codelets/codelet_zherfb.c
@@ -35,7 +35,7 @@ void INSERT_TASK_zherfb(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn);
-    CHAMELEON_Complex64_t *work = options->ws_host;
-#pragma omp task firstprivate(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, work) depend(in:ptrA[0:Am*An], ptrT[0:Tm*Tn]) depend(inout:ptrC[0:Cm*Cn])
+    CHAMELEON_Complex64_t *work = options->ws_worker;
+#pragma omp task firstprivate(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, work) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0])
     CORE_zherfb(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb);
 }
diff --git a/runtime/openmp/codelets/codelet_zlacpy.c b/runtime/openmp/codelets/codelet_zlacpy.c
index a6ab833af..4c8a2c147 100644
--- a/runtime/openmp/codelets/codelet_zlacpy.c
+++ b/runtime/openmp/codelets/codelet_zlacpy.c
@@ -40,7 +40,7 @@ void INSERT_TASK_zlacpyx(const RUNTIME_option_t *options,
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A + displA, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B + displB, CHAMELEON_Complex64_t, Bm, Bn);
-#pragma omp task firstprivate(uplo, m, n, ptrA, lda, ptrB, ldb) depend(in:ptrA[0:Am*An]) depend(inout:ptrB[0:Bm*Bn])
+#pragma omp task firstprivate(uplo, m, n, ptrA, lda, ptrB, ldb) depend(in:ptrA[0]) depend(inout:ptrB[0])
     CORE_zlacpy(uplo, m, n, ptrA, lda, ptrB, ldb);
 }
 
diff --git a/runtime/openmp/codelets/codelet_zlantr.c b/runtime/openmp/codelets/codelet_zlantr.c
index 08db23b53..00f1c3b7d 100644
--- a/runtime/openmp/codelets/codelet_zlantr.c
+++ b/runtime/openmp/codelets/codelet_zlantr.c
@@ -32,7 +32,7 @@ void INSERT_TASK_zlantr(const RUNTIME_option_t *options,
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     double *ptrB = RTBLKADDR(B, double, Bm, Bn);
-    double *work = options->ws_host;
+    double *work = options->ws_worker;
 #pragma omp task firstprivate(norm, uplo, diag, M, N, ptrA, LDA, work, ptrB) depend(in:ptrA[0]) depend(inout:ptrB[0])
     CORE_zlantr(norm, uplo, diag, M, N, ptrA, LDA, work, ptrB);
 }
diff --git a/runtime/openmp/codelets/codelet_zlaset.c b/runtime/openmp/codelets/codelet_zlaset.c
index 1dbc2e48f..5f74f186d 100644
--- a/runtime/openmp/codelets/codelet_zlaset.c
+++ b/runtime/openmp/codelets/codelet_zlaset.c
@@ -69,6 +69,6 @@ void INSERT_TASK_zlaset(const RUNTIME_option_t *options,
                        const CHAM_desc_t *A, int Am, int An, int LDA)
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
-#pragma omp task firstprivate(uplo, M, N, alpha, beta, ptrA, LDA) depend(inout:ptrA[0:Am*An])
+#pragma omp task firstprivate(uplo, M, N, alpha, beta, ptrA, LDA) depend(inout:ptrA[0])
     CORE_zlaset(uplo, M, N, alpha, beta, ptrA, LDA);
 }
diff --git a/runtime/openmp/codelets/codelet_zplrnt.c b/runtime/openmp/codelets/codelet_zplrnt.c
index fde7a8d2c..ce6b6525a 100644
--- a/runtime/openmp/codelets/codelet_zplrnt.c
+++ b/runtime/openmp/codelets/codelet_zplrnt.c
@@ -35,6 +35,6 @@ void INSERT_TASK_zplrnt( const RUNTIME_option_t *options,
                         int bigM, int m0, int n0, unsigned long long int seed )
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
-#pragma omp task firstprivate(m, n, ptrA, lda, bigM, m0, n0, seed) depend(inout:ptrA[0:Am*An])
+#pragma omp task firstprivate(m, n, ptrA, lda, bigM, m0, n0, seed) depend(inout:ptrA[0])
     CORE_zplrnt( m, n, ptrA, lda, bigM, m0, n0, seed );
 }
diff --git a/runtime/openmp/codelets/codelet_ztplqt.c b/runtime/openmp/codelets/codelet_ztplqt.c
index 2f37931e6..783a610a5 100644
--- a/runtime/openmp/codelets/codelet_ztplqt.c
+++ b/runtime/openmp/codelets/codelet_ztplqt.c
@@ -30,8 +30,8 @@ INSERT_TASK_ztplqt( const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *work = options->ws_host;
-#pragma omp task firstprivate(M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt, work) depend(inout:ptrA[0:Am*An], ptrB[0:Bm*Bn], ptrT[0:Tm*Tn])
+    CHAMELEON_Complex64_t *work = options->ws_worker;
+#pragma omp task firstprivate(M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt, work) depend(inout:ptrA[0], ptrB[0], ptrT[0])
     CORE_ztplqt( M, N, L, ib,
                  ptrA, lda, ptrB, ldb, ptrT, ldt, work );
 }
diff --git a/runtime/openmp/codelets/codelet_ztpmlqt.c b/runtime/openmp/codelets/codelet_ztpmlqt.c
index 3746c3041..769c66194 100644
--- a/runtime/openmp/codelets/codelet_ztpmlqt.c
+++ b/runtime/openmp/codelets/codelet_ztpmlqt.c
@@ -30,7 +30,7 @@ INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
-    CHAMELEON_Complex64_t *work = options->ws_host;
+    CHAMELEON_Complex64_t *work = options->ws_worker;
 #pragma omp task firstprivate(side, trans, M, N, K, L, ib, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work) depend(in:ptrV[0], ptrT[0]) depend(inout:ptrA[0], ptrB[0])
     CORE_ztpmlqt( side, trans, M, N, K, L, ib,
                   ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work );
diff --git a/runtime/openmp/codelets/codelet_ztpmqrt.c b/runtime/openmp/codelets/codelet_ztpmqrt.c
index a5a42d95c..526017942 100644
--- a/runtime/openmp/codelets/codelet_ztpmqrt.c
+++ b/runtime/openmp/codelets/codelet_ztpmqrt.c
@@ -30,7 +30,7 @@ INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
-    CHAMELEON_Complex64_t *work = options->ws_host;
+    CHAMELEON_Complex64_t *work = options->ws_worker;
 #pragma omp task firstprivate(side, trans, M, N, K, L, ib, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work) depend(in:ptrV[0], ptrT[0]) depend(inout:ptrA[0], ptrB[0])
     CORE_ztpmqrt( side, trans, M, N, K, L, ib,
                   ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work );
diff --git a/runtime/openmp/codelets/codelet_ztpqrt.c b/runtime/openmp/codelets/codelet_ztpqrt.c
index 8930bc979..26dd08848 100644
--- a/runtime/openmp/codelets/codelet_ztpqrt.c
+++ b/runtime/openmp/codelets/codelet_ztpqrt.c
@@ -29,7 +29,7 @@ INSERT_TASK_ztpqrt( const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *work = options->ws_host;
+    CHAMELEON_Complex64_t *work = options->ws_worker;
 #pragma omp task firstprivate(M, N, L, ib, ptrT, ldt, ptrA, lda, ptrB, ldb, work) depend(in:ptrT[0]) depend(inout:ptrA[0], ptrB[0])
     CORE_ztpqrt( M, N, L, ib,
                  ptrA, lda, ptrB, ldb, ptrT, ldt, work );
diff --git a/runtime/openmp/codelets/codelet_ztslqt.c b/runtime/openmp/codelets/codelet_ztslqt.c
index 9a3b6db7b..d17db6922 100644
--- a/runtime/openmp/codelets/codelet_ztslqt.c
+++ b/runtime/openmp/codelets/codelet_ztslqt.c
@@ -109,8 +109,8 @@ void INSERT_TASK_ztslqt(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n);
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-    CHAMELEON_Complex64_t *tau = options->ws_host;
+    CHAMELEON_Complex64_t *tau = options->ws_worker;
+    CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
 #pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work) depend(inout:ptrA1[0], ptrA2[0], ptrT[0])
     CORE_ztslqt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
 }
diff --git a/runtime/openmp/codelets/codelet_ztsqrt.c b/runtime/openmp/codelets/codelet_ztsqrt.c
index bc16fb146..b7561d8a9 100644
--- a/runtime/openmp/codelets/codelet_ztsqrt.c
+++ b/runtime/openmp/codelets/codelet_ztsqrt.c
@@ -98,8 +98,8 @@ void INSERT_TASK_ztsqrt(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n);
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-    CHAMELEON_Complex64_t *tau = options->ws_host;
+    CHAMELEON_Complex64_t *tau = options->ws_worker;
+    CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
 #pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, work, tau) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0])
     CORE_ztsqrt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
 }
diff --git a/runtime/openmp/codelets/codelet_zttlqt.c b/runtime/openmp/codelets/codelet_zttlqt.c
index e693c6b7a..c8567b1f1 100644
--- a/runtime/openmp/codelets/codelet_zttlqt.c
+++ b/runtime/openmp/codelets/codelet_zttlqt.c
@@ -110,8 +110,8 @@ void INSERT_TASK_zttlqt(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n);
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-    CHAMELEON_Complex64_t *tau = options->ws_host;
+    CHAMELEON_Complex64_t *tau = options->ws_worker;
+    CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
 #pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, work, tau) depend(inout:ptrA1[0], ptrA2[0], ptrT[0])
     CORE_zttlqt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
 }
diff --git a/runtime/openmp/codelets/codelet_zttqrt.c b/runtime/openmp/codelets/codelet_zttqrt.c
index 5061ef3ba..f01fff932 100644
--- a/runtime/openmp/codelets/codelet_zttqrt.c
+++ b/runtime/openmp/codelets/codelet_zttqrt.c
@@ -110,8 +110,8 @@ void INSERT_TASK_zttqrt(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n);
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-    CHAMELEON_Complex64_t *tau = options->ws_host;
+    CHAMELEON_Complex64_t *tau = options->ws_worker;
+    CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
 #pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, work, tau) depend(inout:ptrA1[0], ptrA2[0], ptrT[0])
     CORE_zttqrt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
 }
diff --git a/runtime/openmp/control/runtime_options.c b/runtime/openmp/control/runtime_options.c
index 4511f102b..ca81fd63c 100644
--- a/runtime/openmp/control/runtime_options.c
+++ b/runtime/openmp/control/runtime_options.c
@@ -50,23 +50,18 @@ int RUNTIME_options_ws_alloc( RUNTIME_option_t *options, size_t worker_size, siz
         options->ws_worker = malloc(worker_size* sizeof(char));
         options->ws_wsize = worker_size;
     }
-    if (host_size > 0) {
-        // TODO used for scratch, maybe we can do better than malloc
-        options->ws_host = malloc(host_size * sizeof(char));
-        options->ws_hsize = host_size;
-    }
+    // FIXME: handle ws_host if needed for omp target
     return CHAMELEON_SUCCESS;
 }
 
 int RUNTIME_options_ws_free( RUNTIME_option_t *options )
 {
     if (options->ws_wsize) {
+        // This one is not trivial: the free should be submitted as a task which depends
+        // on existing task using scratch, but we don't have a dependency for this, so we sync.
+#pragma omp taskwait
         free(options->ws_worker);
         options->ws_wsize = 0;
     }
-    if (options->ws_hsize) {
-        free(options->ws_host);
-        options->ws_hsize = 0;
-    }
     return CHAMELEON_SUCCESS;
 }
diff --git a/timing/timing.c b/timing/timing.c
index 4afdeac80..51eddd529 100644
--- a/timing/timing.c
+++ b/timing/timing.c
@@ -751,15 +751,13 @@ main(int argc, char *argv[]) {
     int return_code;
 
     /* Initialize CHAMELEON */
-    /* NOTE: do *NOT* add a ';' at the end of this call, as it may be a #pragma omp parallel */
     CHAMELEON_INIT( iparam[IPARAM_THRDNBR],
-                iparam[IPARAM_NCUDAS] )
+                    iparam[IPARAM_NCUDAS] );
+    // NOTE: OpenMP needs this, as Chameleon's init/finalize add '{'/'}',
+    // and 'return' is not allowed in parallel regions.
+    return_code = CHAMELEON_Main(iparam, argv[0], start, stop, step);
 
-    {
-        return_code = CHAMELEON_Main(iparam, argv[0], start, stop, step);
-    }
-
-    CHAMELEON_Finalize();
+    CHAMELEON_FINALIZE();
     return return_code;
 }
 
-- 
GitLab