From 5ca2d21c710d3f13875dd03dba1e5c20c1124f2d Mon Sep 17 00:00:00 2001
From: Philippe Virouleau <philippe.44@gmail.com>
Date: Wed, 5 Dec 2018 14:41:17 +0100
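Subject: [PATCH] Use VLA instead of malloc for scratch

The OpenMP runtime used to malloc one scratch buffer in
RUNTIME_options_ws_alloc() and pass that same pointer to every task.
Declare the scratch as a VLA inside each task body instead, sized by
options->ws_wsize, so that concurrently running tasks no longer share it.

Also reference only the first tile element in the depend clauses of the
zgetrf, zlange, zlanhe and zlansy codelets, as the other codelets do.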

---
 runtime/openmp/codelets/codelet_zgelqt.c        | 10 ++++++----
 runtime/openmp/codelets/codelet_zgeqrt.c        | 10 ++++++----
 runtime/openmp/codelets/codelet_zgetrf.c        |  2 +-
 runtime/openmp/codelets/codelet_zherfb.c        |  8 +++++---
 runtime/openmp/codelets/codelet_zlange.c        |  8 +++++---
 runtime/openmp/codelets/codelet_zlanhe.c        |  8 +++++---
 runtime/openmp/codelets/codelet_zlansy.c        |  8 +++++---
 runtime/openmp/codelets/codelet_zlantr.c        |  8 +++++---
 runtime/openmp/codelets/codelet_ztplqt.c        | 10 ++++++----
 runtime/openmp/codelets/codelet_ztpmlqt.c       | 10 ++++++----
 runtime/openmp/codelets/codelet_ztpmqrt.c       |  8 +++++---
 runtime/openmp/codelets/codelet_ztpqrt.c        | 10 ++++++----
 runtime/openmp/codelets/codelet_ztslqt.c        | 10 ++++++----
 runtime/openmp/codelets/codelet_ztsmlq.c        | 10 ++++++----
 runtime/openmp/codelets/codelet_ztsmlq_hetra1.c | 10 ++++++----
 runtime/openmp/codelets/codelet_ztsmqr.c        | 10 ++++++----
 runtime/openmp/codelets/codelet_ztsmqr_hetra1.c | 10 ++++++----
 runtime/openmp/codelets/codelet_ztsqrt.c        | 10 ++++++----
 runtime/openmp/codelets/codelet_ztstrf.c        |  8 +++++---
 runtime/openmp/codelets/codelet_zttlqt.c        | 10 ++++++----
 runtime/openmp/codelets/codelet_zttmlq.c        | 10 ++++++----
 runtime/openmp/codelets/codelet_zttmqr.c        | 10 ++++++----
 runtime/openmp/codelets/codelet_zunmlq.c        | 10 ++++++----
 runtime/openmp/codelets/codelet_zunmqr.c        | 10 ++++++----
 runtime/openmp/control/runtime_options.c        |  6 ++++--
 25 files changed, 136 insertions(+), 88 deletions(-)

diff --git a/runtime/openmp/codelets/codelet_zgelqt.c b/runtime/openmp/codelets/codelet_zgelqt.c
index 1d284caaa..04786dd80 100644
--- a/runtime/openmp/codelets/codelet_zgelqt.c
+++ b/runtime/openmp/codelets/codelet_zgelqt.c
@@ -97,8 +97,14 @@ void INSERT_TASK_zgelqt(const RUNTIME_option_t *options,
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *TAU = options->ws_worker;
-    CHAMELEON_Complex64_t *work = TAU + chameleon_max( m, n );
-#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, work, TAU) depend(inout:ptrA[0]) depend(inout:ptrT[0])
-    CORE_zgelqt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work);
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, ws_size) depend(inout:ptrA[0]) depend(inout:ptrT[0])
+    {
+      /* Task-private scratch: TAU comes first, work starts max(m, n) entries in. */
+      CHAMELEON_Complex64_t TAU[ws_size];
+      CHAMELEON_Complex64_t *work = TAU + chameleon_max( m, n );
+      CORE_zgelqt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_zgeqrt.c b/runtime/openmp/codelets/codelet_zgeqrt.c
index aea7735da..7b1b8e13e 100644
--- a/runtime/openmp/codelets/codelet_zgeqrt.c
+++ b/runtime/openmp/codelets/codelet_zgeqrt.c
@@ -98,8 +98,14 @@ void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options,
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *TAU = options->ws_worker;
-    CHAMELEON_Complex64_t *work = TAU + chameleon_max(m, n);
-#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, work, TAU) depend(inout:ptrA[0]) depend(inout:ptrT[0])
-    CORE_zgeqrt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work);
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, ws_size) depend(inout:ptrA[0]) depend(inout:ptrT[0])
+    {
+      /* Task-private scratch: TAU comes first, work starts max(m, n) entries in. */
+      CHAMELEON_Complex64_t TAU[ws_size];
+      CHAMELEON_Complex64_t *work = TAU + chameleon_max(m, n);
+      CORE_zgeqrt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_zgetrf.c b/runtime/openmp/codelets/codelet_zgetrf.c
index 4bb4173ac..27c599ed1 100644
--- a/runtime/openmp/codelets/codelet_zgetrf.c
+++ b/runtime/openmp/codelets/codelet_zgetrf.c
@@ -34,6 +34,6 @@ void INSERT_TASK_zgetrf(const RUNTIME_option_t *options,
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     int info = 0;
-#pragma omp task firstprivate(m, n, ptrA, lda, IPIV, info) depend(inout:ptrA[0:Am*An])
+#pragma omp task firstprivate(m, n, ptrA, lda, IPIV, info) depend(inout:ptrA[0])
     CORE_zgetrf( m, n, ptrA, lda, IPIV, &info );
 }
diff --git a/runtime/openmp/codelets/codelet_zherfb.c b/runtime/openmp/codelets/codelet_zherfb.c
index 2890651e8..3ed5263e6 100644
--- a/runtime/openmp/codelets/codelet_zherfb.c
+++ b/runtime/openmp/codelets/codelet_zherfb.c
@@ -35,7 +35,13 @@ void INSERT_TASK_zherfb(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-#pragma omp task firstprivate(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, work) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0])
-    CORE_zherfb(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb);
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, ws_size) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0])
+    {
+      /* Scratch lives on this task's stack, so concurrent tasks never share it. */
+      CHAMELEON_Complex64_t work[ws_size];
+      CORE_zherfb(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_zlange.c b/runtime/openmp/codelets/codelet_zlange.c
index 1358fc57d..5c5c99dd9 100644
--- a/runtime/openmp/codelets/codelet_zlange.c
+++ b/runtime/openmp/codelets/codelet_zlange.c
@@ -33,9 +33,15 @@ void INSERT_TASK_zlange(const RUNTIME_option_t *options,
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     double *ptrB = RTBLKADDR(B, double, Bm, Bn);
-    double *work = options->ws_worker;
-#pragma omp task firstprivate(M, N, ptrA, LDA, ptrB, options, work) depend(in:ptrA[0:Am*An]) depend(inout:ptrB[0:Bm*Bn])
-    CORE_zlange( norm, M, N, ptrA, LDA, work, ptrB);
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(M, N, ptrA, LDA, ptrB, ws_size) depend(in:ptrA[0]) depend(inout:ptrB[0])
+    {
+      /* Scratch lives on this task's stack, so concurrent tasks never share it. */
+      double work[ws_size];
+      CORE_zlange( norm, M, N, ptrA, LDA, work, ptrB);
+    }
 }
 
 void INSERT_TASK_zlange_max(const RUNTIME_option_t *options,
diff --git a/runtime/openmp/codelets/codelet_zlanhe.c b/runtime/openmp/codelets/codelet_zlanhe.c
index f77acaad2..be4bc57f3 100644
--- a/runtime/openmp/codelets/codelet_zlanhe.c
+++ b/runtime/openmp/codelets/codelet_zlanhe.c
@@ -32,8 +32,14 @@ void INSERT_TASK_zlanhe(const RUNTIME_option_t *options,
                        const CHAM_desc_t *B, int Bm, int Bn)
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
-    double *work = options->ws_worker;
     double *normA = RTBLKADDR(B, double, Bm, Bn);
-#pragma omp task firstprivate(norm, uplo, N, ptrA, LDA, work, normA) depend(in:ptrA[0:Am*An]) depend(inout:normA[0:Bm*Bn])
-    CORE_zlanhe( norm, uplo, N, ptrA, LDA, work, normA);
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(norm, uplo, N, ptrA, LDA, normA, ws_size) depend(in:ptrA[0]) depend(inout:normA[0])
+    {
+      /* Scratch lives on this task's stack, so concurrent tasks never share it. */
+      double work[ws_size];
+      CORE_zlanhe( norm, uplo, N, ptrA, LDA, work, normA);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_zlansy.c b/runtime/openmp/codelets/codelet_zlansy.c
index c3dd736fa..3118f450f 100644
--- a/runtime/openmp/codelets/codelet_zlansy.c
+++ b/runtime/openmp/codelets/codelet_zlansy.c
@@ -32,8 +32,14 @@ void INSERT_TASK_zlansy(const RUNTIME_option_t *options,
                        const CHAM_desc_t *B, int Bm, int Bn)
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
-    double *work = options->ws_worker;
     double *normA = RTBLKADDR(B, double, Bm, Bn);
-#pragma omp task firstprivate(norm, uplo, N, ptrA, LDA, work, normA) depend(in:ptrA[0:Am*An]) depend(inout:normA[0:Bm*Bn])
-    CORE_zlansy( norm, uplo, N, ptrA, LDA, work, normA);
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(norm, uplo, N, ptrA, LDA, normA, ws_size) depend(in:ptrA[0]) depend(inout:normA[0])
+    {
+      /* Scratch lives on this task's stack, so concurrent tasks never share it. */
+      double work[ws_size];
+      CORE_zlansy( norm, uplo, N, ptrA, LDA, work, normA);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_zlantr.c b/runtime/openmp/codelets/codelet_zlantr.c
index 00f1c3b7d..6b5ae1168 100644
--- a/runtime/openmp/codelets/codelet_zlantr.c
+++ b/runtime/openmp/codelets/codelet_zlantr.c
@@ -32,7 +32,13 @@ void INSERT_TASK_zlantr(const RUNTIME_option_t *options,
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     double *ptrB = RTBLKADDR(B, double, Bm, Bn);
-    double *work = options->ws_worker;
-#pragma omp task firstprivate(norm, uplo, diag, M, N, ptrA, LDA, work, ptrB) depend(in:ptrA[0]) depend(inout:ptrB[0])
-    CORE_zlantr(norm, uplo, diag, M, N, ptrA, LDA, work, ptrB);
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(norm, uplo, diag, M, N, ptrA, LDA, ptrB, ws_size) depend(in:ptrA[0]) depend(inout:ptrB[0])
+    {
+      /* Scratch lives on this task's stack, so concurrent tasks never share it. */
+      double work[ws_size];
+      CORE_zlantr(norm, uplo, diag, M, N, ptrA, LDA, work, ptrB);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_ztplqt.c b/runtime/openmp/codelets/codelet_ztplqt.c
index 783a610a5..b587f5d5e 100644
--- a/runtime/openmp/codelets/codelet_ztplqt.c
+++ b/runtime/openmp/codelets/codelet_ztplqt.c
@@ -30,8 +30,14 @@ INSERT_TASK_ztplqt( const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-#pragma omp task firstprivate(M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt, work) depend(inout:ptrA[0], ptrB[0], ptrT[0])
-    CORE_ztplqt( M, N, L, ib,
-                 ptrA, lda, ptrB, ldb, ptrT, ldt, work );
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt, ws_size) depend(inout:ptrA[0], ptrB[0], ptrT[0])
+    {
+      /* Scratch lives on this task's stack, so concurrent tasks never share it. */
+      CHAMELEON_Complex64_t work[ws_size];
+      CORE_ztplqt( M, N, L, ib,
+                   ptrA, lda, ptrB, ldb, ptrT, ldt, work );
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_ztpmlqt.c b/runtime/openmp/codelets/codelet_ztpmlqt.c
index 769c66194..3601df373 100644
--- a/runtime/openmp/codelets/codelet_ztpmlqt.c
+++ b/runtime/openmp/codelets/codelet_ztpmlqt.c
@@ -30,8 +30,14 @@ INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-#pragma omp task firstprivate(side, trans, M, N, K, L, ib, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work) depend(in:ptrV[0], ptrT[0]) depend(inout:ptrA[0], ptrB[0])
-    CORE_ztpmlqt( side, trans, M, N, K, L, ib,
-                  ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work );
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(side, trans, M, N, K, L, ib, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, ws_size) depend(in:ptrV[0], ptrT[0]) depend(inout:ptrA[0], ptrB[0])
+    {
+      /* Scratch lives on this task's stack, so concurrent tasks never share it. */
+      CHAMELEON_Complex64_t work[ws_size];
+      CORE_ztpmlqt( side, trans, M, N, K, L, ib,
+                    ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work );
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_ztpmqrt.c b/runtime/openmp/codelets/codelet_ztpmqrt.c
index 526017942..3d5225a9e 100644
--- a/runtime/openmp/codelets/codelet_ztpmqrt.c
+++ b/runtime/openmp/codelets/codelet_ztpmqrt.c
@@ -30,8 +30,14 @@ INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-#pragma omp task firstprivate(side, trans, M, N, K, L, ib, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work) depend(in:ptrV[0], ptrT[0]) depend(inout:ptrA[0], ptrB[0])
-    CORE_ztpmqrt( side, trans, M, N, K, L, ib,
-                  ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work );
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(side, trans, M, N, K, L, ib, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, ws_size) depend(in:ptrV[0], ptrT[0]) depend(inout:ptrA[0], ptrB[0])
+    {
+      /* Scratch lives on this task's stack, so concurrent tasks never share it. */
+      CHAMELEON_Complex64_t work[ws_size];
+      CORE_ztpmqrt( side, trans, M, N, K, L, ib,
+                    ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work );
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_ztpqrt.c b/runtime/openmp/codelets/codelet_ztpqrt.c
index 26dd08848..f25eb1684 100644
--- a/runtime/openmp/codelets/codelet_ztpqrt.c
+++ b/runtime/openmp/codelets/codelet_ztpqrt.c
@@ -29,8 +29,14 @@ INSERT_TASK_ztpqrt( const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-#pragma omp task firstprivate(M, N, L, ib, ptrT, ldt, ptrA, lda, ptrB, ldb, work) depend(in:ptrT[0]) depend(inout:ptrA[0], ptrB[0])
-    CORE_ztpqrt( M, N, L, ib,
-                 ptrA, lda, ptrB, ldb, ptrT, ldt, work );
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(M, N, L, ib, ptrT, ldt, ptrA, lda, ptrB, ldb, ws_size) depend(in:ptrT[0]) depend(inout:ptrA[0], ptrB[0])
+    {
+      /* Scratch lives on this task's stack, so concurrent tasks never share it. */
+      CHAMELEON_Complex64_t work[ws_size];
+      CORE_ztpqrt( M, N, L, ib,
+                   ptrA, lda, ptrB, ldb, ptrT, ldt, work );
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_ztslqt.c b/runtime/openmp/codelets/codelet_ztslqt.c
index d17db6922..3d9b75e9c 100644
--- a/runtime/openmp/codelets/codelet_ztslqt.c
+++ b/runtime/openmp/codelets/codelet_ztslqt.c
@@ -109,8 +109,14 @@ void INSERT_TASK_ztslqt(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n);
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *tau = options->ws_worker;
-    CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
-#pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work) depend(inout:ptrA1[0], ptrA2[0], ptrT[0])
-    CORE_ztslqt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, ws_size) depend(inout:ptrA1[0], ptrA2[0], ptrT[0])
+    {
+      /* Task-private scratch: tau comes first, work starts max(m, n) entries in. */
+      CHAMELEON_Complex64_t tau[ws_size];
+      CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
+      CORE_ztslqt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_ztsmlq.c b/runtime/openmp/codelets/codelet_ztsmlq.c
index 83611cf1f..6b11db9fb 100644
--- a/runtime/openmp/codelets/codelet_ztsmlq.c
+++ b/runtime/openmp/codelets/codelet_ztsmlq.c
@@ -140,9 +140,15 @@ void INSERT_TASK_ztsmlq(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
     int ldwork = side == ChamLeft ? ib : nb;
-#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
-    CORE_ztsmlq(side, trans, m1, n1, m2, n2, k, ib,
-                ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, ldwork, ws_size) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
+    {
+      /* Scratch lives on this task's stack, so concurrent tasks never share it. */
+      CHAMELEON_Complex64_t work[ws_size];
+      CORE_ztsmlq(side, trans, m1, n1, m2, n2, k, ib,
+                  ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_ztsmlq_hetra1.c b/runtime/openmp/codelets/codelet_ztsmlq_hetra1.c
index 57071465d..2ba3f99f5 100644
--- a/runtime/openmp/codelets/codelet_ztsmlq_hetra1.c
+++ b/runtime/openmp/codelets/codelet_ztsmlq_hetra1.c
@@ -39,9 +39,15 @@ void INSERT_TASK_ztsmlq_hetra1(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
     int ldwork = side == ChamLeft ? ib : nb;
-#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
-    CORE_ztsmlq_hetra1(side, trans, m1, n1, m2, n2, k,
-                       ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, ldwork, ws_size) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
+    {
+      /* Scratch lives on this task's stack, so concurrent tasks never share it. */
+      CHAMELEON_Complex64_t work[ws_size];
+      CORE_ztsmlq_hetra1(side, trans, m1, n1, m2, n2, k,
+                         ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_ztsmqr.c b/runtime/openmp/codelets/codelet_ztsmqr.c
index e6beea224..b75e91f3d 100644
--- a/runtime/openmp/codelets/codelet_ztsmqr.c
+++ b/runtime/openmp/codelets/codelet_ztsmqr.c
@@ -140,9 +140,15 @@ void INSERT_TASK_ztsmqr(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
     int ldwork = side == ChamLeft ? ib : nb;
-#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
-    CORE_ztsmqr(side, trans, m1, n1, m2, n2, k, ib,
-                ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, ldwork, ws_size) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
+    {
+      /* Scratch lives on this task's stack, so concurrent tasks never share it. */
+      CHAMELEON_Complex64_t work[ws_size];
+      CORE_ztsmqr(side, trans, m1, n1, m2, n2, k, ib,
+                  ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_ztsmqr_hetra1.c b/runtime/openmp/codelets/codelet_ztsmqr_hetra1.c
index 9a1fe799e..6ba9d4331 100644
--- a/runtime/openmp/codelets/codelet_ztsmqr_hetra1.c
+++ b/runtime/openmp/codelets/codelet_ztsmqr_hetra1.c
@@ -39,9 +39,15 @@ void INSERT_TASK_ztsmqr_hetra1(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
     int ldwork = side == ChamLeft ? ib : nb;
-#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
-    CORE_ztsmqr_hetra1(side, trans, m1, n1, m2, n2, k,
-                       ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, ldwork, ws_size) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
+    {
+      /* Scratch lives on this task's stack, so concurrent tasks never share it. */
+      CHAMELEON_Complex64_t work[ws_size];
+      CORE_ztsmqr_hetra1(side, trans, m1, n1, m2, n2, k,
+                         ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_ztsqrt.c b/runtime/openmp/codelets/codelet_ztsqrt.c
index b7561d8a9..8ee1cba55 100644
--- a/runtime/openmp/codelets/codelet_ztsqrt.c
+++ b/runtime/openmp/codelets/codelet_ztsqrt.c
@@ -98,8 +98,14 @@ void INSERT_TASK_ztsqrt(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n);
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *tau = options->ws_worker;
-    CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
-#pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, work, tau) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0])
-    CORE_ztsqrt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, ws_size) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0])
+    {
+      /* Task-private scratch: tau comes first, work starts max(m, n) entries in. */
+      CHAMELEON_Complex64_t tau[ws_size];
+      CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
+      CORE_ztsqrt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_ztstrf.c b/runtime/openmp/codelets/codelet_ztstrf.c
index 2748674e5..75d2920ce 100644
--- a/runtime/openmp/codelets/codelet_ztstrf.c
+++ b/runtime/openmp/codelets/codelet_ztstrf.c
@@ -104,7 +104,13 @@ void INSERT_TASK_ztstrf(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrU = RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un);
     CHAMELEON_Complex64_t *ptrL = RTBLKADDR(L, CHAMELEON_Complex64_t, Lm, Ln);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-#pragma omp task firstprivate(m, n, ib, nb, ptrU, ldu, ptrA, lda, ptrL, ldl, IPIV, work, iinfo) depend(inout:ptrA[0], ptrU[0], ptrL[0])
-    CORE_ztstrf(m, n, ib, nb, ptrU, ldu, ptrA, lda, ptrL, ldl, IPIV, work, nb, &iinfo);
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(m, n, ib, nb, ptrU, ldu, ptrA, lda, ptrL, ldl, IPIV, iinfo, ws_size) depend(inout:ptrA[0], ptrU[0], ptrL[0])
+    {
+      /* Scratch lives on this task's stack, so concurrent tasks never share it. */
+      CHAMELEON_Complex64_t work[ws_size];
+      CORE_ztstrf(m, n, ib, nb, ptrU, ldu, ptrA, lda, ptrL, ldl, IPIV, work, nb, &iinfo);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_zttlqt.c b/runtime/openmp/codelets/codelet_zttlqt.c
index c8567b1f1..fb37bdfaa 100644
--- a/runtime/openmp/codelets/codelet_zttlqt.c
+++ b/runtime/openmp/codelets/codelet_zttlqt.c
@@ -110,8 +110,14 @@ void INSERT_TASK_zttlqt(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n);
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
-    CHAMELEON_Complex64_t *tau = options->ws_worker;
-    CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
-#pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, work, tau) depend(inout:ptrA1[0], ptrA2[0], ptrT[0])
-    CORE_zttlqt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, ws_size) depend(inout:ptrA1[0], ptrA2[0], ptrT[0])
+    {
+      /* Task-private scratch: tau comes first, work starts max(m, n) entries in. */
+      CHAMELEON_Complex64_t tau[ws_size];
+      CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
+      CORE_zttlqt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_zttmlq.c b/runtime/openmp/codelets/codelet_zttmlq.c
index e5093489c..687131351 100644
--- a/runtime/openmp/codelets/codelet_zttmlq.c
+++ b/runtime/openmp/codelets/codelet_zttmlq.c
@@ -132,9 +132,15 @@ void INSERT_TASK_zttmlq(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
     int ldwork = side == ChamLeft ? ib : nb;
-#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
-    CORE_zttmlq(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1,
-                ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, ldwork, ws_size) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
+    {
+      /* Scratch lives on this task's stack, so concurrent tasks never share it. */
+      CHAMELEON_Complex64_t work[ws_size];
+      CORE_zttmlq(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1,
+                  ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_zttmqr.c b/runtime/openmp/codelets/codelet_zttmqr.c
index b28b47eb3..7a2a10d68 100644
--- a/runtime/openmp/codelets/codelet_zttmqr.c
+++ b/runtime/openmp/codelets/codelet_zttmqr.c
@@ -138,9 +138,15 @@ void INSERT_TASK_zttmqr(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
     int ldwork = side == ChamLeft ? ib : nb;
-#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
-    CORE_zttmqr(side, trans, m1, n1, m2, n2, k, ib,
-                ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, ldwork, ws_size) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
+    {
+      /* Scratch lives on this task's stack, so concurrent tasks never share it. */
+      CHAMELEON_Complex64_t work[ws_size];
+      CORE_zttmqr(side, trans, m1, n1, m2, n2, k, ib,
+                  ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_zunmlq.c b/runtime/openmp/codelets/codelet_zunmlq.c
index fc16b5e13..c9852c457 100644
--- a/runtime/openmp/codelets/codelet_zunmlq.c
+++ b/runtime/openmp/codelets/codelet_zunmlq.c
@@ -121,8 +121,14 @@ void INSERT_TASK_zunmlq(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-#pragma omp task firstprivate(side, trans, m, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc, work) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0])
-    CORE_zunmlq(side, trans, m, n, k, ib,
-                ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb);
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(side, trans, m, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc, ws_size) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0])
+    {
+      /* Scratch lives on this task's stack, so concurrent tasks never share it. */
+      CHAMELEON_Complex64_t work[ws_size];
+      CORE_zunmlq(side, trans, m, n, k, ib,
+                  ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb);
+    }
 }
diff --git a/runtime/openmp/codelets/codelet_zunmqr.c b/runtime/openmp/codelets/codelet_zunmqr.c
index 207469b5d..1254dbec5 100644
--- a/runtime/openmp/codelets/codelet_zunmqr.c
+++ b/runtime/openmp/codelets/codelet_zunmqr.c
@@ -121,8 +121,14 @@ void INSERT_TASK_zunmqr(const RUNTIME_option_t *options,
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
     CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn);
-    CHAMELEON_Complex64_t *work = options->ws_worker;
-#pragma omp task firstprivate(side, trans, m, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc, work) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0])
-    CORE_zunmqr(side, trans, m, n, k, ib,
-                ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb);
+    /* Read the scratch size while the task is created: the task body may run
+     * after this function has returned, when 'options' may no longer be valid. */
+    size_t ws_size = options->ws_wsize;
+#pragma omp task firstprivate(side, trans, m, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc, ws_size) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0])
+    {
+      /* Scratch lives on this task's stack, so concurrent tasks never share it. */
+      CHAMELEON_Complex64_t work[ws_size];
+      CORE_zunmqr(side, trans, m, n, k, ib,
+                  ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb);
+    }
 }
diff --git a/runtime/openmp/control/runtime_options.c b/runtime/openmp/control/runtime_options.c
index ca81fd63c..81e2cd206 100644
--- a/runtime/openmp/control/runtime_options.c
+++ b/runtime/openmp/control/runtime_options.c
@@ -46,8 +46,10 @@ void RUNTIME_options_finalize( RUNTIME_option_t *option, CHAM_context_t *chamctx
 int RUNTIME_options_ws_alloc( RUNTIME_option_t *options, size_t worker_size, size_t host_size )
 {
     if (worker_size > 0) {
-        // TODO used for scratch, maybe we can do better than malloc
-        options->ws_worker = malloc(worker_size* sizeof(char));
+        // NOTE: no scratch buffer is allocated here, only its size is recorded. Codelets
+        // declare a VLA of ws_wsize elements (not bytes, unlike the former malloc) inside
+        // each task body, so the scratch is private to the task rather than shared.
+        options->ws_worker = NULL;
         options->ws_wsize = worker_size;
     }
     // FIXME: handle ws_host if needed for omp target
-- 
GitLab