diff --git a/control/control.c b/control/control.c
index c0657726ed1a9eb3e67bcef408ed68aa4f46838e..b349389630307ac6285c7b6ca73c9c9248c0c6cd 100644
--- a/control/control.c
+++ b/control/control.c
@@ -50,6 +50,12 @@
  *          \retval CHAMELEON_SUCCESS successful exit
  *
  */
+#ifdef CHAMELEON_Init
+#undef CHAMELEON_Init
+#endif
+#ifdef CHAMELEON_Finalize
+#undef CHAMELEON_Finalize
+#endif
 int CHAMELEON_Init(int cores, int gpus)
 {
     return CHAMELEON_InitPar(cores, gpus, -1);
diff --git a/include/chameleon.h b/include/chameleon.h
index a8afc08c83ccf70d51e418887499c28d4008e10c..9166a88e14239fe6b504db0d67af998d98bee946 100644
--- a/include/chameleon.h
+++ b/include/chameleon.h
@@ -137,12 +137,18 @@ int CHAMELEON_Sequence_Wait    (RUNTIME_sequence_t *sequence);
 
 #if defined(CHAMELEON_SCHED_OPENMP)
 #define CHAMELEON_INIT(nworkers, ncudas)\
-  CHAMELEON_Init(nworkers, ncudas);\
-  _Pragma("omp parallel")\
-  _Pragma("omp master")
+    CHAMELEON_Init(nworkers, ncudas);\
+    _Pragma("omp parallel")\
+    _Pragma("omp master")\
+    {
+#define CHAMELEON_FINALIZE()\
+    }\
+    CHAMELEON_Finalize();
 #else
 #define CHAMELEON_INIT(nworkers, ncudas)\
   CHAMELEON_Init(nworkers, ncudas);
+#define CHAMELEON_FINALIZE()\
+  CHAMELEON_Finalize();
 #endif
 
 END_C_DECLS
diff --git a/runtime/openmp/codelets/codelet_zgelqt.c b/runtime/openmp/codelets/codelet_zgelqt.c
index 8c69936b7b52e65b6dfb2a54d3e2279c36e29f27..1d284caaab64f41ddbe2e398557a3bf6eded83ff 100644
--- a/runtime/openmp/codelets/codelet_zgelqt.c
+++ b/runtime/openmp/codelets/codelet_zgelqt.c
@@ -98,7 +98,7 @@ void INSERT_TASK_zgelqt(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *TAU = options->ws_worker;
-    CHAMELEON_Complex64_t *work = options->ws_host;
-#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, work, TAU) depend(inout:ptrA[0:Am*An]) depend(inout:ptrT[0:Tm*Tn])
+    CHAMELEON_Complex64_t *work = TAU + chameleon_max( m, n );
+#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, work, TAU) depend(inout:ptrA[0]) depend(inout:ptrT[0])
     CORE_zgelqt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work);
 }
diff --git a/runtime/openmp/codelets/codelet_zgeqrt.c b/runtime/openmp/codelets/codelet_zgeqrt.c
index 0337e0de946f5323c665d50bef0529b3d89eeceb..aea7735dad39f8170b5a8f6ce336bb397de33a23 100644
--- a/runtime/openmp/codelets/codelet_zgeqrt.c
+++ b/runtime/openmp/codelets/codelet_zgeqrt.c
@@ -99,7 +99,7 @@ void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *TAU = options->ws_worker;
-    CHAMELEON_Complex64_t *work = options->ws_host;
-#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, work, TAU) depend(inout:ptrA[0:Am*An]) depend(inout:ptrT[0:Tm*Tn])
+    CHAMELEON_Complex64_t *work = TAU + chameleon_max(m, n);
+#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, work, TAU) depend(inout:ptrA[0]) depend(inout:ptrT[0])
     CORE_zgeqrt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work);
 }
diff --git a/runtime/openmp/codelets/codelet_zherfb.c b/runtime/openmp/codelets/codelet_zherfb.c
index 1531406b22a42ba5beb0927f56f7581864bde586..2890651e836f95db86d0035f34fffac83bc5a141 100644
--- a/runtime/openmp/codelets/codelet_zherfb.c
+++ b/runtime/openmp/codelets/codelet_zherfb.c
@@ -35,7 +35,7 @@ void INSERT_TASK_zherfb(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn);
-    CHAMELEON_Complex64_t *work = options->ws_host;
-#pragma omp task firstprivate(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, work) depend(in:ptrA[0:Am*An], ptrT[0:Tm*Tn]) depend(inout:ptrC[0:Cm*Cn])
+    CHAMELEON_Complex64_t *work = options->ws_worker;
+#pragma omp task firstprivate(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, work) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0])
     CORE_zherfb(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb);
 }
diff --git a/runtime/openmp/codelets/codelet_zlacpy.c b/runtime/openmp/codelets/codelet_zlacpy.c
index a6ab833afd2b85e392cf6299e6fdd8e1148d8d14..4c8a2c1479b9be6e9d372346898288c57df934b6 100644
--- a/runtime/openmp/codelets/codelet_zlacpy.c
+++ b/runtime/openmp/codelets/codelet_zlacpy.c
@@ -40,7 +40,7 @@ void INSERT_TASK_zlacpyx(const RUNTIME_option_t *options,
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A + displA, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B + displB, CHAMELEON_Complex64_t, Bm, Bn);
-#pragma omp task firstprivate(uplo, m, n, ptrA, lda, ptrB, ldb) depend(in:ptrA[0:Am*An]) depend(inout:ptrB[0:Bm*Bn])
+#pragma omp task firstprivate(uplo, m, n, ptrA, lda, ptrB, ldb) depend(in:ptrA[0]) depend(inout:ptrB[0])
     CORE_zlacpy(uplo, m, n, ptrA, lda, ptrB, ldb);
 }
 
diff --git a/runtime/openmp/codelets/codelet_zlantr.c b/runtime/openmp/codelets/codelet_zlantr.c
index 08db23b539612f9c4d2c70c9a6b1d1564731a680..00f1c3b7ded78b156d9370d20dff2b68fc8418bc 100644
--- a/runtime/openmp/codelets/codelet_zlantr.c
+++ b/runtime/openmp/codelets/codelet_zlantr.c
@@ -32,7 +32,7 @@ void INSERT_TASK_zlantr(const RUNTIME_option_t *options,
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     double *ptrB = RTBLKADDR(B, double, Bm, Bn);
-    double *work = options->ws_host;
+    double *work = options->ws_worker;
 #pragma omp task firstprivate(norm, uplo, diag, M, N, ptrA, LDA, work, ptrB) depend(in:ptrA[0]) depend(inout:ptrB[0])
     CORE_zlantr(norm, uplo, diag, M, N, ptrA, LDA, work, ptrB);
 }
diff --git a/runtime/openmp/codelets/codelet_zlaset.c b/runtime/openmp/codelets/codelet_zlaset.c
index 1dbc2e48f25945190ea31fe5c013ca59052a8802..5f74f186d5a7a6ab2569adf7a247376e94635807 100644
--- a/runtime/openmp/codelets/codelet_zlaset.c
+++ b/runtime/openmp/codelets/codelet_zlaset.c
@@ -69,6 +69,6 @@ void INSERT_TASK_zlaset(const RUNTIME_option_t *options,
                        const CHAM_desc_t *A, int Am, int An, int LDA)
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
-#pragma omp task firstprivate(uplo, M, N, alpha, beta, ptrA, LDA) depend(inout:ptrA[0:Am*An])
+#pragma omp task firstprivate(uplo, M, N, alpha, beta, ptrA, LDA) depend(inout:ptrA[0])
     CORE_zlaset(uplo, M, N, alpha, beta, ptrA, LDA);
 }
diff --git a/runtime/openmp/codelets/codelet_zplrnt.c b/runtime/openmp/codelets/codelet_zplrnt.c
index fde7a8d2c7183495175b7fd1a1ed7b55ea087b77..ce6b6525a4a256c8bdfd9382d032442220f37bf3 100644
--- a/runtime/openmp/codelets/codelet_zplrnt.c
+++ b/runtime/openmp/codelets/codelet_zplrnt.c
@@ -35,6 +35,6 @@ void INSERT_TASK_zplrnt( const RUNTIME_option_t *options,
                         int bigM, int m0, int n0, unsigned long long int seed )
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
-#pragma omp task firstprivate(m, n, ptrA, lda, bigM, m0, n0, seed) depend(inout:ptrA[0:Am*An])
+#pragma omp task firstprivate(m, n, ptrA, lda, bigM, m0, n0, seed) depend(inout:ptrA[0])
     CORE_zplrnt( m, n, ptrA, lda, bigM, m0, n0, seed );
 }
diff --git a/runtime/openmp/codelets/codelet_ztplqt.c b/runtime/openmp/codelets/codelet_ztplqt.c
index 2f37931e67737f7b7ca766f43830b1825714679b..783a610a555be53315333fc9f8019f9d48436eca 100644
--- a/runtime/openmp/codelets/codelet_ztplqt.c
+++ b/runtime/openmp/codelets/codelet_ztplqt.c
@@ -30,8 +30,8 @@ INSERT_TASK_ztplqt( const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *work = options->ws_host;
-#pragma omp task firstprivate(M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt, work) depend(inout:ptrA[0:Am*An], ptrB[0:Bm*Bn], ptrT[0:Tm*Tn])
+    CHAMELEON_Complex64_t *work = options->ws_worker;
+#pragma omp task firstprivate(M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt, work) depend(inout:ptrA[0], ptrB[0], ptrT[0])
     CORE_ztplqt( M, N, L, ib,
                  ptrA, lda, ptrB, ldb, ptrT, ldt, work );
 }
diff --git a/runtime/openmp/codelets/codelet_ztpmlqt.c b/runtime/openmp/codelets/codelet_ztpmlqt.c
index 3746c30415d0896390263a373fffe51a1e04c372..769c66194b93c0cea3769b8671d2849fc242cd0c 100644
--- a/runtime/openmp/codelets/codelet_ztpmlqt.c
+++ b/runtime/openmp/codelets/codelet_ztpmlqt.c
@@ -30,7 +30,7 @@ INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
-    CHAMELEON_Complex64_t *work = options->ws_host;
+    CHAMELEON_Complex64_t *work = options->ws_worker;
 #pragma omp task firstprivate(side, trans, M, N, K, L, ib, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work) depend(in:ptrV[0], ptrT[0]) depend(inout:ptrA[0], ptrB[0])
     CORE_ztpmlqt( side, trans, M, N, K, L, ib,
                   ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work );
diff --git a/runtime/openmp/codelets/codelet_ztpmqrt.c b/runtime/openmp/codelets/codelet_ztpmqrt.c
index a5a42d95c47e13e1edfc0f423df27a74759ecff8..526017942fd197be7d181500cf9206486fa58fed 100644
--- a/runtime/openmp/codelets/codelet_ztpmqrt.c
+++ b/runtime/openmp/codelets/codelet_ztpmqrt.c
@@ -30,7 +30,7 @@ INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
-    CHAMELEON_Complex64_t *work = options->ws_host;
+    CHAMELEON_Complex64_t *work = options->ws_worker;
 #pragma omp task firstprivate(side, trans, M, N, K, L, ib, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work) depend(in:ptrV[0], ptrT[0]) depend(inout:ptrA[0], ptrB[0])
     CORE_ztpmqrt( side, trans, M, N, K, L, ib,
                   ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work );
diff --git a/runtime/openmp/codelets/codelet_ztpqrt.c b/runtime/openmp/codelets/codelet_ztpqrt.c
index 8930bc9797ba22e4f0c5d18dfe4e632e5f4eb2c6..26dd08848c727fb61e3d0e5d7cf92f05378382b6 100644
--- a/runtime/openmp/codelets/codelet_ztpqrt.c
+++ b/runtime/openmp/codelets/codelet_ztpqrt.c
@@ -29,7 +29,7 @@ INSERT_TASK_ztpqrt( const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *work = options->ws_host;
+    CHAMELEON_Complex64_t *work = options->ws_worker;
 #pragma omp task firstprivate(M, N, L, ib, ptrT, ldt, ptrA, lda, ptrB, ldb, work) depend(in:ptrT[0]) depend(inout:ptrA[0], ptrB[0])
     CORE_ztpqrt( M, N, L, ib,
                  ptrA, lda, ptrB, ldb, ptrT, ldt, work );
diff --git a/runtime/openmp/codelets/codelet_ztslqt.c b/runtime/openmp/codelets/codelet_ztslqt.c
index 9a3b6db7bcc567933b090319837a9ad5d37ad17f..d17db69228fb365eb47b74d80577c17d22965e0d 100644
--- a/runtime/openmp/codelets/codelet_ztslqt.c
+++ b/runtime/openmp/codelets/codelet_ztslqt.c
@@ -109,8 +109,8 @@ void INSERT_TASK_ztslqt(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n);
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-    CHAMELEON_Complex64_t *tau = options->ws_host;
+    CHAMELEON_Complex64_t *tau = options->ws_worker;
+    CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
 #pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work) depend(inout:ptrA1[0], ptrA2[0], ptrT[0])
     CORE_ztslqt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
 }
diff --git a/runtime/openmp/codelets/codelet_ztsqrt.c b/runtime/openmp/codelets/codelet_ztsqrt.c
index bc16fb146e1bd534c16489c2621e69958c43d5e0..b7561d8a965201923e33a3ece17052b630e06a84 100644
--- a/runtime/openmp/codelets/codelet_ztsqrt.c
+++ b/runtime/openmp/codelets/codelet_ztsqrt.c
@@ -98,8 +98,8 @@ void INSERT_TASK_ztsqrt(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n);
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-    CHAMELEON_Complex64_t *tau = options->ws_host;
+    CHAMELEON_Complex64_t *tau = options->ws_worker;
+    CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
 #pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, work, tau) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0])
     CORE_ztsqrt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
 }
diff --git a/runtime/openmp/codelets/codelet_zttlqt.c b/runtime/openmp/codelets/codelet_zttlqt.c
index e693c6b7a19f57c182e3828b178ae60500579380..c8567b1f18afd6614dbf6b0dafaf47af55f22549 100644
--- a/runtime/openmp/codelets/codelet_zttlqt.c
+++ b/runtime/openmp/codelets/codelet_zttlqt.c
@@ -110,8 +110,8 @@ void INSERT_TASK_zttlqt(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n);
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-    CHAMELEON_Complex64_t *tau = options->ws_host;
+    CHAMELEON_Complex64_t *tau = options->ws_worker;
+    CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
 #pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, work, tau) depend(inout:ptrA1[0], ptrA2[0], ptrT[0])
     CORE_zttlqt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
 }
diff --git a/runtime/openmp/codelets/codelet_zttqrt.c b/runtime/openmp/codelets/codelet_zttqrt.c
index 5061ef3babfe8d1097b22c25de68f3a2af7e4f7c..f01fff932338acc2e8da0339979549bad22ad1e0 100644
--- a/runtime/openmp/codelets/codelet_zttqrt.c
+++ b/runtime/openmp/codelets/codelet_zttqrt.c
@@ -110,8 +110,8 @@ void INSERT_TASK_zttqrt(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n);
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-    CHAMELEON_Complex64_t *tau = options->ws_host;
+    CHAMELEON_Complex64_t *tau = options->ws_worker;
+    CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
 #pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, work, tau) depend(inout:ptrA1[0], ptrA2[0], ptrT[0])
     CORE_zttqrt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
 }
diff --git a/runtime/openmp/control/runtime_options.c b/runtime/openmp/control/runtime_options.c
index 4511f102b95b67ece873ca3e4ff94129f6dbb4fc..ca81fd63c96644d4a533e6b72770ac0803d5e822 100644
--- a/runtime/openmp/control/runtime_options.c
+++ b/runtime/openmp/control/runtime_options.c
@@ -50,23 +50,18 @@ int RUNTIME_options_ws_alloc( RUNTIME_option_t *options, size_t worker_size, siz
         options->ws_worker = malloc(worker_size* sizeof(char));
         options->ws_wsize = worker_size;
     }
-    if (host_size > 0) {
-        // TODO used for scratch, maybe we can do better than malloc
-        options->ws_host = malloc(host_size * sizeof(char));
-        options->ws_hsize = host_size;
-    }
+    // FIXME: handle ws_host if needed for omp target
     return CHAMELEON_SUCCESS;
 }
 
 int RUNTIME_options_ws_free( RUNTIME_option_t *options )
 {
     if (options->ws_wsize) {
+        // This is not trivial: the free should be submitted as a task that depends on the
+        // existing tasks using the scratch, but we have no dependency for that, so we sync.
+#pragma omp taskwait
         free(options->ws_worker);
         options->ws_wsize = 0;
     }
-    if (options->ws_hsize) {
-        free(options->ws_host);
-        options->ws_hsize = 0;
-    }
     return CHAMELEON_SUCCESS;
 }
diff --git a/timing/timing.c b/timing/timing.c
index 4afdeac80922f5ee3ea4328c790e5cb07608a0c6..51eddd529510783b5df12547f29a540022e7dd2a 100644
--- a/timing/timing.c
+++ b/timing/timing.c
@@ -751,15 +751,13 @@ main(int argc, char *argv[]) {
     int return_code;
 
     /* Initialize CHAMELEON */
-    /* NOTE: do *NOT* add a ';' at the end of this call, as it may be a #pragma omp parallel */
     CHAMELEON_INIT( iparam[IPARAM_THRDNBR],
-                iparam[IPARAM_NCUDAS] )
+                    iparam[IPARAM_NCUDAS] );
+    // NOTE: with OpenMP, CHAMELEON_INIT/CHAMELEON_FINALIZE add the '{'/'}' of a parallel
+    // region, and 'return' is not allowed inside it: keep the code and return after finalize.
+    return_code = CHAMELEON_Main(iparam, argv[0], start, stop, step);
 
-    {
-        return_code = CHAMELEON_Main(iparam, argv[0], start, stop, step);
-    }
-
-    CHAMELEON_Finalize();
+    CHAMELEON_FINALIZE();
     return return_code;
 }