diff --git a/runtime/openmp/codelets/codelet_zgelqt.c b/runtime/openmp/codelets/codelet_zgelqt.c
index 1d284caaab64f41ddbe2e398557a3bf6eded83ff..04786dd800d16e79cff677cbb3a5f06da8373e34 100644
--- a/runtime/openmp/codelets/codelet_zgelqt.c
+++ b/runtime/openmp/codelets/codelet_zgelqt.c
@@ -97,8 +97,10 @@ void INSERT_TASK_zgelqt(const RUNTIME_option_t *options,
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *TAU = options->ws_worker;
-    CHAMELEON_Complex64_t *work = TAU + chameleon_max( m, n );
-#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, work, TAU) depend(inout:ptrA[0]) depend(inout:ptrT[0])
-    CORE_zgelqt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work);
+#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt) depend(inout:ptrA[0]) depend(inout:ptrT[0])
+    {
+      CHAMELEON_Complex64_t TAU[options->ws_wsize];
+      CHAMELEON_Complex64_t *work = TAU + chameleon_max( m, n );
+      CORE_zgelqt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_zgeqrt.c b/runtime/openmp/codelets/codelet_zgeqrt.c
index aea7735dad39f8170b5a8f6ce336bb397de33a23..7b1b8e13ec5ad1b7de9e16eb477f389d93f3b0e4 100644
--- a/runtime/openmp/codelets/codelet_zgeqrt.c
+++ b/runtime/openmp/codelets/codelet_zgeqrt.c
@@ -98,8 +98,10 @@ void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options,
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *TAU = options->ws_worker;
-    CHAMELEON_Complex64_t *work = TAU + chameleon_max(m, n);
-#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, work, TAU) depend(inout:ptrA[0]) depend(inout:ptrT[0])
-    CORE_zgeqrt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work);
+#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt) depend(inout:ptrA[0]) depend(inout:ptrT[0])
+    {
+      CHAMELEON_Complex64_t TAU[options->ws_wsize];
+      CHAMELEON_Complex64_t *work = TAU + chameleon_max(m, n);
+      CORE_zgeqrt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_zgetrf.c b/runtime/openmp/codelets/codelet_zgetrf.c
index 4bb4173ac113566a90c8702cb1d08002142ec0e9..27c599ed16c998e416df0eef00813fac4a64ef0c 100644
--- a/runtime/openmp/codelets/codelet_zgetrf.c
+++ b/runtime/openmp/codelets/codelet_zgetrf.c
@@ -34,6 +34,6 @@ void INSERT_TASK_zgetrf(const RUNTIME_option_t *options,
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     int info = 0;
-#pragma omp task firstprivate(m, n, ptrA, lda, IPIV, info) depend(inout:ptrA[0:Am*An])
+#pragma omp task firstprivate(m, n, ptrA, lda, IPIV, info) depend(inout:ptrA[0])
     CORE_zgetrf( m, n, ptrA, lda, IPIV, &info );
 }
diff --git a/runtime/openmp/codelets/codelet_zherfb.c b/runtime/openmp/codelets/codelet_zherfb.c
index 2890651e836f95db86d0035f34fffac83bc5a141..3ed5263e649f2fedfd4d17b345cee38e8f957d3e 100644
--- a/runtime/openmp/codelets/codelet_zherfb.c
+++ b/runtime/openmp/codelets/codelet_zherfb.c
@@ -35,7 +35,9 @@ void INSERT_TASK_zherfb(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-#pragma omp task firstprivate(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, work) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0])
-    CORE_zherfb(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb);
+#pragma omp task firstprivate(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0])
+    {
+      CHAMELEON_Complex64_t work[options->ws_wsize];
+      CORE_zherfb(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_zlange.c b/runtime/openmp/codelets/codelet_zlange.c
index 1358fc57d2216dc6cf44a9adb27357b23c711499..5c5c99dd99778deba5884a6fdb83a22c9192ef9b 100644
--- a/runtime/openmp/codelets/codelet_zlange.c
+++ b/runtime/openmp/codelets/codelet_zlange.c
@@ -33,9 +33,11 @@ void INSERT_TASK_zlange(const RUNTIME_option_t *options,
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     double *ptrB = RTBLKADDR(B, double, Bm, Bn);
-    double *work = options->ws_worker;
-#pragma omp task firstprivate(M, N, ptrA, LDA, ptrB, options, work) depend(in:ptrA[0:Am*An]) depend(inout:ptrB[0:Bm*Bn])
-    CORE_zlange( norm, M, N, ptrA, LDA, work, ptrB);
+#pragma omp task firstprivate(M, N, ptrA, LDA, ptrB, options) depend(in:ptrA[0]) depend(inout:ptrB[0])
+    {
+      double work[options->ws_wsize];
+      CORE_zlange( norm, M, N, ptrA, LDA, work, ptrB);
+    }
 }
 
 void INSERT_TASK_zlange_max(const RUNTIME_option_t *options,
diff --git a/runtime/openmp/codelets/codelet_zlanhe.c b/runtime/openmp/codelets/codelet_zlanhe.c
index f77acaad28dffa0abecc17a0f9308d009c0fea36..be4bc57f354ac7c1d4d0b045c88026458583b608 100644
--- a/runtime/openmp/codelets/codelet_zlanhe.c
+++ b/runtime/openmp/codelets/codelet_zlanhe.c
@@ -32,8 +32,10 @@ void INSERT_TASK_zlanhe(const RUNTIME_option_t *options,
                        const CHAM_desc_t *B, int Bm, int Bn)
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
-    double *work = options->ws_worker;
     double *normA = RTBLKADDR(B, double, Bm, Bn);
-#pragma omp task firstprivate(norm, uplo, N, ptrA, LDA, work, normA) depend(in:ptrA[0:Am*An]) depend(inout:normA[0:Bm*Bn])
-    CORE_zlanhe( norm, uplo, N, ptrA, LDA, work, normA);
+#pragma omp task firstprivate(norm, uplo, N, ptrA, LDA, normA) depend(in:ptrA[0]) depend(inout:normA[0])
+    {
+      double work[options->ws_wsize];
+      CORE_zlanhe( norm, uplo, N, ptrA, LDA, work, normA);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_zlansy.c b/runtime/openmp/codelets/codelet_zlansy.c
index c3dd736faac2e267f1703ea63e29633d45dd5b02..3118f450f5f8bc2e88c6af7aefaa59553b81c9d7 100644
--- a/runtime/openmp/codelets/codelet_zlansy.c
+++ b/runtime/openmp/codelets/codelet_zlansy.c
@@ -32,8 +32,10 @@ void INSERT_TASK_zlansy(const RUNTIME_option_t *options,
                        const CHAM_desc_t *B, int Bm, int Bn)
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
-    double *work = options->ws_worker;
     double *normA = RTBLKADDR(B, double, Bm, Bn);
-#pragma omp task firstprivate(norm, uplo, N, ptrA, LDA, work, normA) depend(in:ptrA[0:Am*An]) depend(inout:normA[0:Bm*Bn])
-    CORE_zlansy( norm, uplo, N, ptrA, LDA, work, normA);
+#pragma omp task firstprivate(norm, uplo, N, ptrA, LDA, normA) depend(in:ptrA[0]) depend(inout:normA[0])
+    {
+      double work[options->ws_wsize];
+      CORE_zlansy( norm, uplo, N, ptrA, LDA, work, normA);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_zlantr.c b/runtime/openmp/codelets/codelet_zlantr.c
index 00f1c3b7ded78b156d9370d20dff2b68fc8418bc..6b5ae11687980f9d0743245ba341ff15afea635e 100644
--- a/runtime/openmp/codelets/codelet_zlantr.c
+++ b/runtime/openmp/codelets/codelet_zlantr.c
@@ -32,7 +32,9 @@ void INSERT_TASK_zlantr(const RUNTIME_option_t *options,
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     double *ptrB = RTBLKADDR(B, double, Bm, Bn);
-    double *work = options->ws_worker;
-#pragma omp task firstprivate(norm, uplo, diag, M, N, ptrA, LDA, work, ptrB) depend(in:ptrA[0]) depend(inout:ptrB[0])
-    CORE_zlantr(norm, uplo, diag, M, N, ptrA, LDA, work, ptrB);
+#pragma omp task firstprivate(norm, uplo, diag, M, N, ptrA, LDA, ptrB) depend(in:ptrA[0]) depend(inout:ptrB[0])
+    {
+      double work[options->ws_wsize];
+      CORE_zlantr(norm, uplo, diag, M, N, ptrA, LDA, work, ptrB);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_ztplqt.c b/runtime/openmp/codelets/codelet_ztplqt.c
index 783a610a555be53315333fc9f8019f9d48436eca..b587f5d5ecfa1fcc0a951a1435169c51986fbfc6 100644
--- a/runtime/openmp/codelets/codelet_ztplqt.c
+++ b/runtime/openmp/codelets/codelet_ztplqt.c
@@ -30,8 +30,10 @@ INSERT_TASK_ztplqt( const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-#pragma omp task firstprivate(M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt, work) depend(inout:ptrA[0], ptrB[0], ptrT[0])
-    CORE_ztplqt( M, N, L, ib,
-                 ptrA, lda, ptrB, ldb, ptrT, ldt, work );
+#pragma omp task firstprivate(M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt) depend(inout:ptrA[0], ptrB[0], ptrT[0])
+    {
+      CHAMELEON_Complex64_t work[options->ws_wsize];
+      CORE_ztplqt( M, N, L, ib,
+                   ptrA, lda, ptrB, ldb, ptrT, ldt, work );
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_ztpmlqt.c b/runtime/openmp/codelets/codelet_ztpmlqt.c
index 769c66194b93c0cea3769b8671d2849fc242cd0c..3601df373d6f6b3691c2fa6b6624edad754a6a69 100644
--- a/runtime/openmp/codelets/codelet_ztpmlqt.c
+++ b/runtime/openmp/codelets/codelet_ztpmlqt.c
@@ -30,8 +30,10 @@ INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-#pragma omp task firstprivate(side, trans, M, N, K, L, ib, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work) depend(in:ptrV[0], ptrT[0]) depend(inout:ptrA[0], ptrB[0])
-    CORE_ztpmlqt( side, trans, M, N, K, L, ib,
-                  ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work );
+#pragma omp task firstprivate(side, trans, M, N, K, L, ib, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb) depend(in:ptrV[0], ptrT[0]) depend(inout:ptrA[0], ptrB[0])
+    {
+      CHAMELEON_Complex64_t work[options->ws_wsize];
+      CORE_ztpmlqt( side, trans, M, N, K, L, ib,
+                    ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work );
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_ztpmqrt.c b/runtime/openmp/codelets/codelet_ztpmqrt.c
index 526017942fd197be7d181500cf9206486fa58fed..3d5225a9ea4e01c45019bad52381622cfe2ace53 100644
--- a/runtime/openmp/codelets/codelet_ztpmqrt.c
+++ b/runtime/openmp/codelets/codelet_ztpmqrt.c
@@ -30,8 +30,10 @@ INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-#pragma omp task firstprivate(side, trans, M, N, K, L, ib, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work) depend(in:ptrV[0], ptrT[0]) depend(inout:ptrA[0], ptrB[0])
+#pragma omp task firstprivate(side, trans, M, N, K, L, ib, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb) depend(in:ptrV[0], ptrT[0]) depend(inout:ptrA[0], ptrB[0])
+    {
+      CHAMELEON_Complex64_t tmp[options->ws_wsize];
     CORE_ztpmqrt( side, trans, M, N, K, L, ib,
-                  ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work );
+                  ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, tmp );
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_ztpqrt.c b/runtime/openmp/codelets/codelet_ztpqrt.c
index 26dd08848c727fb61e3d0e5d7cf92f05378382b6..f25eb1684b2ecb05838eef5a1b63c144a8ba2c57 100644
--- a/runtime/openmp/codelets/codelet_ztpqrt.c
+++ b/runtime/openmp/codelets/codelet_ztpqrt.c
@@ -29,8 +29,10 @@ INSERT_TASK_ztpqrt( const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-#pragma omp task firstprivate(M, N, L, ib, ptrT, ldt, ptrA, lda, ptrB, ldb, work) depend(in:ptrT[0]) depend(inout:ptrA[0], ptrB[0])
-    CORE_ztpqrt( M, N, L, ib,
-                 ptrA, lda, ptrB, ldb, ptrT, ldt, work );
+#pragma omp task firstprivate(M, N, L, ib, ptrT, ldt, ptrA, lda, ptrB, ldb) depend(in:ptrT[0]) depend(inout:ptrA[0], ptrB[0])
+    {
+      CHAMELEON_Complex64_t tmp[options->ws_wsize];
+      CORE_ztpqrt( M, N, L, ib,
+          ptrA, lda, ptrB, ldb, ptrT, ldt, tmp );
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_ztslqt.c b/runtime/openmp/codelets/codelet_ztslqt.c
index d17db69228fb365eb47b74d80577c17d22965e0d..3d9b75e9c7b57d64a3be545bdce3abed3fe29599 100644
--- a/runtime/openmp/codelets/codelet_ztslqt.c
+++ b/runtime/openmp/codelets/codelet_ztslqt.c
@@ -109,8 +109,10 @@ void INSERT_TASK_ztslqt(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n);
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *tau = options->ws_worker;
-    CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
-#pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work) depend(inout:ptrA1[0], ptrA2[0], ptrT[0])
-    CORE_ztslqt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
+#pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt) depend(inout:ptrA1[0], ptrA2[0], ptrT[0])
+    {
+      CHAMELEON_Complex64_t tau[options->ws_wsize];
+      CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
+      CORE_ztslqt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_ztsmlq.c b/runtime/openmp/codelets/codelet_ztsmlq.c
index 83611cf1f5795f82a6536fe1b6ce4b950337acd2..6b11db9fb8a5ba99f7f541b9ed3d0a2437774741 100644
--- a/runtime/openmp/codelets/codelet_ztsmlq.c
+++ b/runtime/openmp/codelets/codelet_ztsmlq.c
@@ -140,9 +140,11 @@ void INSERT_TASK_ztsmlq(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
     int ldwork = side == ChamLeft ? ib : nb;
-#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
-    CORE_ztsmlq(side, trans, m1, n1, m2, n2, k, ib,
-                ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
+    {
+      CHAMELEON_Complex64_t work[options->ws_wsize];
+      CORE_ztsmlq(side, trans, m1, n1, m2, n2, k, ib,
+                  ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_ztsmlq_hetra1.c b/runtime/openmp/codelets/codelet_ztsmlq_hetra1.c
index 57071465d08e31024798e7fdf2e303ac1b8c7723..2ba3f99f559de7866071c7153e9ad733c3d4abaa 100644
--- a/runtime/openmp/codelets/codelet_ztsmlq_hetra1.c
+++ b/runtime/openmp/codelets/codelet_ztsmlq_hetra1.c
@@ -39,9 +39,11 @@ void INSERT_TASK_ztsmlq_hetra1(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
     int ldwork = side == ChamLeft ? ib : nb;
-#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
-    CORE_ztsmlq_hetra1(side, trans, m1, n1, m2, n2, k,
-                       ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
+    {
+      CHAMELEON_Complex64_t work[options->ws_wsize];
+      CORE_ztsmlq_hetra1(side, trans, m1, n1, m2, n2, k,
+                         ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_ztsmqr.c b/runtime/openmp/codelets/codelet_ztsmqr.c
index e6beea2243ab1445f7641e4897769d5d8e251d01..b75e91f3d59f133acdfe7d8fe89a0f194dffecf1 100644
--- a/runtime/openmp/codelets/codelet_ztsmqr.c
+++ b/runtime/openmp/codelets/codelet_ztsmqr.c
@@ -140,9 +140,11 @@ void INSERT_TASK_ztsmqr(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
     int ldwork = side == ChamLeft ? ib : nb;
-#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
-    CORE_ztsmqr(side, trans, m1, n1, m2, n2, k, ib,
-                ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
+    {
+      CHAMELEON_Complex64_t work[options->ws_wsize];
+      CORE_ztsmqr(side, trans, m1, n1, m2, n2, k, ib,
+                  ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_ztsmqr_hetra1.c b/runtime/openmp/codelets/codelet_ztsmqr_hetra1.c
index 9a1fe799e6b38aae04256a0a569a882b23d73ff0..6ba9d43310730f9f6a77ea1fbc6cac1c2cc04cd9 100644
--- a/runtime/openmp/codelets/codelet_ztsmqr_hetra1.c
+++ b/runtime/openmp/codelets/codelet_ztsmqr_hetra1.c
@@ -39,9 +39,11 @@ void INSERT_TASK_ztsmqr_hetra1(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
     int ldwork = side == ChamLeft ? ib : nb;
-#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
-    CORE_ztsmqr_hetra1(side, trans, m1, n1, m2, n2, k,
-                       ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
+    {
+      CHAMELEON_Complex64_t work[options->ws_wsize];
+      CORE_ztsmqr_hetra1(side, trans, m1, n1, m2, n2, k,
+                         ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_ztsqrt.c b/runtime/openmp/codelets/codelet_ztsqrt.c
index b7561d8a965201923e33a3ece17052b630e06a84..8ee1cba55d9e8bc9b58f5ac430150490023ac68a 100644
--- a/runtime/openmp/codelets/codelet_ztsqrt.c
+++ b/runtime/openmp/codelets/codelet_ztsqrt.c
@@ -98,8 +98,10 @@ void INSERT_TASK_ztsqrt(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n);
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *tau = options->ws_worker;
-    CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
-#pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, work, tau) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0])
-    CORE_ztsqrt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
+#pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0])
+    {
+      CHAMELEON_Complex64_t tau[options->ws_wsize];
+      CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
+      CORE_ztsqrt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_ztstrf.c b/runtime/openmp/codelets/codelet_ztstrf.c
index 2748674e5aee0fda795eb34ba3cf1a271fdb01f8..75d2920ce7d378e559ad6d2d9b42d7e079203f95 100644
--- a/runtime/openmp/codelets/codelet_ztstrf.c
+++ b/runtime/openmp/codelets/codelet_ztstrf.c
@@ -104,7 +104,9 @@ void INSERT_TASK_ztstrf(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrU = RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un);
     CHAMELEON_Complex64_t *ptrL = RTBLKADDR(L, CHAMELEON_Complex64_t, Lm, Ln);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-#pragma omp task firstprivate(m, n, ib, nb, ptrU, ldu, ptrA, lda, ptrL, ldl, IPIV, work, iinfo) depend(inout:ptrA[0], ptrU[0], ptrL[0])
-    CORE_ztstrf(m, n, ib, nb, ptrU, ldu, ptrA, lda, ptrL, ldl, IPIV, work, nb, &iinfo);
+#pragma omp task firstprivate(m, n, ib, nb, ptrU, ldu, ptrA, lda, ptrL, ldl, IPIV, iinfo) depend(inout:ptrA[0], ptrU[0], ptrL[0])
+    {
+      CHAMELEON_Complex64_t work[options->ws_wsize];
+      CORE_ztstrf(m, n, ib, nb, ptrU, ldu, ptrA, lda, ptrL, ldl, IPIV, work, nb, &iinfo);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_zttlqt.c b/runtime/openmp/codelets/codelet_zttlqt.c
index c8567b1f18afd6614dbf6b0dafaf47af55f22549..fb37bdfaa48351868f59cf54cf4e83e2eec4652b 100644
--- a/runtime/openmp/codelets/codelet_zttlqt.c
+++ b/runtime/openmp/codelets/codelet_zttlqt.c
@@ -110,8 +110,10 @@ void INSERT_TASK_zttlqt(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n);
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *tau = options->ws_worker;
-    CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
-#pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, work, tau) depend(inout:ptrA1[0], ptrA2[0], ptrT[0])
-    CORE_zttlqt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
+#pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt) depend(inout:ptrA1[0], ptrA2[0], ptrT[0])
+    {
+      CHAMELEON_Complex64_t tau[options->ws_wsize];
+      CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
+      CORE_zttlqt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_zttmlq.c b/runtime/openmp/codelets/codelet_zttmlq.c
index e5093489c58e6a1811b388da81de89d180b72103..687131351f8c388503dda5ad8e78c6a69de2d87e 100644
--- a/runtime/openmp/codelets/codelet_zttmlq.c
+++ b/runtime/openmp/codelets/codelet_zttmlq.c
@@ -132,9 +132,11 @@ void INSERT_TASK_zttmlq(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
     int ldwork = side == ChamLeft ? ib : nb;
-#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
-    CORE_zttmlq(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1,
-                ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
+    {
+      CHAMELEON_Complex64_t work[options->ws_wsize];
+      CORE_zttmlq(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1,
+                  ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_zttmqr.c b/runtime/openmp/codelets/codelet_zttmqr.c
index b28b47eb31543b4dd2aa4323043a5ec814fce3c8..7a2a10d6815f253279b2237b81ff4632e6e9535e 100644
--- a/runtime/openmp/codelets/codelet_zttmqr.c
+++ b/runtime/openmp/codelets/codelet_zttmqr.c
@@ -138,9 +138,11 @@ void INSERT_TASK_zttmqr(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
     int ldwork = side == ChamLeft ? ib : nb;
-#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
-    CORE_zttmqr(side, trans, m1, n1, m2, n2, k, ib,
-                ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
+    {
+      CHAMELEON_Complex64_t work[options->ws_wsize];
+      CORE_zttmqr(side, trans, m1, n1, m2, n2, k, ib,
+                  ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_zunmlq.c b/runtime/openmp/codelets/codelet_zunmlq.c
index fc16b5e13d3ab386940b562c319f28c5ca7e16dc..c9852c457d06b93d9eda19438da8ba339188d64d 100644
--- a/runtime/openmp/codelets/codelet_zunmlq.c
+++ b/runtime/openmp/codelets/codelet_zunmlq.c
@@ -121,8 +121,10 @@ void INSERT_TASK_zunmlq(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-#pragma omp task firstprivate(side, trans, m, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc, work) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0])
-    CORE_zunmlq(side, trans, m, n, k, ib,
-                ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb);
+#pragma omp task firstprivate(side, trans, m, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0])
+    {
+      CHAMELEON_Complex64_t work[options->ws_wsize];
+      CORE_zunmlq(side, trans, m, n, k, ib,
+                  ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_zunmqr.c b/runtime/openmp/codelets/codelet_zunmqr.c
index 207469b5d2b08e157f9cd2d2346d5195465ef90a..1254dbec58bbb8fb79d5df2320fa94039244e4fe 100644
--- a/runtime/openmp/codelets/codelet_zunmqr.c
+++ b/runtime/openmp/codelets/codelet_zunmqr.c
@@ -121,8 +121,10 @@ void INSERT_TASK_zunmqr(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-#pragma omp task firstprivate(side, trans, m, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc, work) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0])
-    CORE_zunmqr(side, trans, m, n, k, ib,
-                ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb);
+#pragma omp task firstprivate(side, trans, m, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0])
+    {
+      CHAMELEON_Complex64_t tmp[options->ws_wsize];
+      CORE_zunmqr(side, trans, m, n, k, ib,
+          ptrA, lda, ptrT, ldt, ptrC, ldc, tmp, nb);
+    }
 }
diff --git a/runtime/openmp/control/runtime_options.c b/runtime/openmp/control/runtime_options.c
index ca81fd63c96644d4a533e6b72770ac0803d5e822..81e2cd206b6f5f8888ffe8c4115e93723a2ecbd3 100644
--- a/runtime/openmp/control/runtime_options.c
+++ b/runtime/openmp/control/runtime_options.c
@@ -46,8 +46,10 @@ void RUNTIME_options_finalize( RUNTIME_option_t *option, CHAM_context_t *chamctx
 int RUNTIME_options_ws_alloc( RUNTIME_option_t *options, size_t worker_size, size_t host_size )
 {
     if (worker_size > 0) {
-        // TODO used for scratch, maybe we can do better than malloc
-        options->ws_worker = malloc(worker_size* sizeof(char));
+        // NOTE: no buffer is allocated here; only ws_wsize is recorded. Each codelet that
+        // needs scratch declares a VLA of ws_wsize elements inside its task body, so the
+        // scratch is private to each task instead of one malloc'ed buffer shared by all workers.
+        options->ws_worker = NULL;
         options->ws_wsize = worker_size;
     }
     // FIXME: handle ws_host if needed for omp target