From 5ca2d21c710d3f13875dd03dba1e5c20c1124f2d Mon Sep 17 00:00:00 2001 From: Philippe Virouleau <philippe.44@gmail.com> Date: Wed, 5 Dec 2018 14:41:17 +0100 Subject: [PATCH] Use VLA instead of malloc for scratch --- runtime/openmp/codelets/codelet_zgelqt.c | 10 ++++++---- runtime/openmp/codelets/codelet_zgeqrt.c | 10 ++++++---- runtime/openmp/codelets/codelet_zgetrf.c | 2 +- runtime/openmp/codelets/codelet_zherfb.c | 8 +++++--- runtime/openmp/codelets/codelet_zlange.c | 8 +++++--- runtime/openmp/codelets/codelet_zlanhe.c | 8 +++++--- runtime/openmp/codelets/codelet_zlansy.c | 8 +++++--- runtime/openmp/codelets/codelet_zlantr.c | 8 +++++--- runtime/openmp/codelets/codelet_ztplqt.c | 10 ++++++---- runtime/openmp/codelets/codelet_ztpmlqt.c | 10 ++++++---- runtime/openmp/codelets/codelet_ztpmqrt.c | 8 +++++--- runtime/openmp/codelets/codelet_ztpqrt.c | 10 ++++++---- runtime/openmp/codelets/codelet_ztslqt.c | 10 ++++++---- runtime/openmp/codelets/codelet_ztsmlq.c | 10 ++++++---- runtime/openmp/codelets/codelet_ztsmlq_hetra1.c | 10 ++++++---- runtime/openmp/codelets/codelet_ztsmqr.c | 10 ++++++---- runtime/openmp/codelets/codelet_ztsmqr_hetra1.c | 10 ++++++---- runtime/openmp/codelets/codelet_ztsqrt.c | 10 ++++++---- runtime/openmp/codelets/codelet_ztstrf.c | 8 +++++--- runtime/openmp/codelets/codelet_zttlqt.c | 10 ++++++---- runtime/openmp/codelets/codelet_zttmlq.c | 10 ++++++---- runtime/openmp/codelets/codelet_zttmqr.c | 10 ++++++---- runtime/openmp/codelets/codelet_zunmlq.c | 10 ++++++---- runtime/openmp/codelets/codelet_zunmqr.c | 10 ++++++---- runtime/openmp/control/runtime_options.c | 6 ++++-- 25 files changed, 136 insertions(+), 88 deletions(-) diff --git a/runtime/openmp/codelets/codelet_zgelqt.c b/runtime/openmp/codelets/codelet_zgelqt.c index 1d284caaa..04786dd80 100644 --- a/runtime/openmp/codelets/codelet_zgelqt.c +++ b/runtime/openmp/codelets/codelet_zgelqt.c @@ -97,8 +97,10 @@ void INSERT_TASK_zgelqt(const RUNTIME_option_t *options, { CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); - CHAMELEON_Complex64_t *TAU = options->ws_worker; - CHAMELEON_Complex64_t *work = TAU + chameleon_max( m, n ); -#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, work, TAU) depend(inout:ptrA[0]) depend(inout:ptrT[0]) - CORE_zgelqt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work); +#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt) depend(inout:ptrA[0]) depend(inout:ptrT[0]) + { + CHAMELEON_Complex64_t TAU[options->ws_wsize]; + CHAMELEON_Complex64_t *work = TAU + chameleon_max( m, n ); + CORE_zgelqt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work); + } } diff --git a/runtime/openmp/codelets/codelet_zgeqrt.c b/runtime/openmp/codelets/codelet_zgeqrt.c index aea7735da..7b1b8e13e 100644 --- a/runtime/openmp/codelets/codelet_zgeqrt.c +++ b/runtime/openmp/codelets/codelet_zgeqrt.c @@ -98,8 +98,10 @@ void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options, { CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); - CHAMELEON_Complex64_t *TAU = options->ws_worker; - CHAMELEON_Complex64_t *work = TAU + chameleon_max(m, n); -#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, work, TAU) depend(inout:ptrA[0]) depend(inout:ptrT[0]) - CORE_zgeqrt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work); +#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt) depend(inout:ptrA[0]) depend(inout:ptrT[0]) + { + CHAMELEON_Complex64_t TAU[options->ws_wsize]; + CHAMELEON_Complex64_t *work = TAU + chameleon_max(m, n); + CORE_zgeqrt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work); + } } diff --git a/runtime/openmp/codelets/codelet_zgetrf.c b/runtime/openmp/codelets/codelet_zgetrf.c index 4bb4173ac..27c599ed1 100644 --- a/runtime/openmp/codelets/codelet_zgetrf.c +++ b/runtime/openmp/codelets/codelet_zgetrf.c @@ -34,6 +34,6 @@ void INSERT_TASK_zgetrf(const RUNTIME_option_t *options, { CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); int info = 0; -#pragma omp task firstprivate(m, n, ptrA, lda, IPIV, info) depend(inout:ptrA[0:Am*An]) +#pragma omp task firstprivate(m, n, ptrA, lda, IPIV, info) depend(inout:ptrA[0]) CORE_zgetrf( m, n, ptrA, lda, IPIV, &info ); } diff --git a/runtime/openmp/codelets/codelet_zherfb.c b/runtime/openmp/codelets/codelet_zherfb.c index 2890651e8..3ed5263e6 100644 --- a/runtime/openmp/codelets/codelet_zherfb.c +++ b/runtime/openmp/codelets/codelet_zherfb.c @@ -35,7 +35,9 @@ void INSERT_TASK_zherfb(const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn); - CHAMELEON_Complex64_t *work = options->ws_worker; -#pragma omp task firstprivate(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, work) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0]) - CORE_zherfb(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb); +#pragma omp task firstprivate(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0]) + { + CHAMELEON_Complex64_t work[options->ws_wsize]; + CORE_zherfb(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb); + } } diff --git a/runtime/openmp/codelets/codelet_zlange.c b/runtime/openmp/codelets/codelet_zlange.c index 1358fc57d..5c5c99dd9 100644 --- a/runtime/openmp/codelets/codelet_zlange.c +++ b/runtime/openmp/codelets/codelet_zlange.c @@ -33,9 +33,11 @@ void INSERT_TASK_zlange(const RUNTIME_option_t *options, { CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); double *ptrB = RTBLKADDR(B, double, Bm, Bn); - double *work = options->ws_worker; -#pragma omp task firstprivate(M, N, ptrA, LDA, ptrB, options, work) depend(in:ptrA[0:Am*An]) depend(inout:ptrB[0:Bm*Bn]) - CORE_zlange( norm, M, N, ptrA, LDA, work, ptrB); +#pragma omp task firstprivate(M, N, ptrA, LDA, ptrB, options) depend(in:ptrA[0]) depend(inout:ptrB[0]) + { + double work[options->ws_wsize]; + CORE_zlange( norm, M, N, ptrA, LDA, work, ptrB); + } } void INSERT_TASK_zlange_max(const RUNTIME_option_t *options, diff --git a/runtime/openmp/codelets/codelet_zlanhe.c b/runtime/openmp/codelets/codelet_zlanhe.c index f77acaad2..be4bc57f3 100644 --- a/runtime/openmp/codelets/codelet_zlanhe.c +++ b/runtime/openmp/codelets/codelet_zlanhe.c @@ -32,8 +32,10 @@ void INSERT_TASK_zlanhe(const RUNTIME_option_t *options, const CHAM_desc_t *B, int Bm, int Bn) { CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - double *work = options->ws_worker; double *normA = RTBLKADDR(B, double, Bm, Bn); -#pragma omp task firstprivate(norm, uplo, N, ptrA, LDA, work, normA) depend(in:ptrA[0:Am*An]) depend(inout:normA[0:Bm*Bn]) - CORE_zlanhe( norm, uplo, N, ptrA, LDA, work, normA); +#pragma omp task firstprivate(norm, uplo, N, ptrA, LDA, normA) depend(in:ptrA[0]) depend(inout:normA[0]) + { + double work[options->ws_wsize]; + CORE_zlanhe( norm, uplo, N, ptrA, LDA, work, normA); + } } diff --git a/runtime/openmp/codelets/codelet_zlansy.c b/runtime/openmp/codelets/codelet_zlansy.c index c3dd736fa..3118f450f 100644 --- a/runtime/openmp/codelets/codelet_zlansy.c +++ b/runtime/openmp/codelets/codelet_zlansy.c @@ -32,8 +32,10 @@ void INSERT_TASK_zlansy(const RUNTIME_option_t *options, const CHAM_desc_t *B, int Bm, int Bn) { CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - double *work = options->ws_worker; double *normA = RTBLKADDR(B, double, Bm, Bn); -#pragma omp task firstprivate(norm, uplo, N, ptrA, LDA, work, normA) depend(in:ptrA[0:Am*An]) depend(inout:normA[0:Bm*Bn]) - CORE_zlansy( norm, uplo, N, ptrA, LDA, work, normA); +#pragma omp task firstprivate(norm, uplo, N, ptrA, LDA, normA) depend(in:ptrA[0]) depend(inout:normA[0]) + { + double work[options->ws_wsize]; + CORE_zlansy( norm, uplo, N, ptrA, LDA, work, normA); + } } diff --git a/runtime/openmp/codelets/codelet_zlantr.c b/runtime/openmp/codelets/codelet_zlantr.c index 00f1c3b7d..6b5ae1168 100644 --- a/runtime/openmp/codelets/codelet_zlantr.c +++ b/runtime/openmp/codelets/codelet_zlantr.c @@ -32,7 +32,9 @@ void INSERT_TASK_zlantr(const RUNTIME_option_t *options, { CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); double *ptrB = RTBLKADDR(B, double, Bm, Bn); - double *work = options->ws_worker; -#pragma omp task firstprivate(norm, uplo, diag, M, N, ptrA, LDA, work, ptrB) depend(in:ptrA[0]) depend(inout:ptrB[0]) - CORE_zlantr(norm, uplo, diag, M, N, ptrA, LDA, work, ptrB); +#pragma omp task firstprivate(norm, uplo, diag, M, N, ptrA, LDA, ptrB) depend(in:ptrA[0]) depend(inout:ptrB[0]) + { + double work[options->ws_wsize]; + CORE_zlantr(norm, uplo, diag, M, N, ptrA, LDA, work, ptrB); + } } diff --git a/runtime/openmp/codelets/codelet_ztplqt.c b/runtime/openmp/codelets/codelet_ztplqt.c index 783a610a5..b587f5d5e 100644 --- a/runtime/openmp/codelets/codelet_ztplqt.c +++ b/runtime/openmp/codelets/codelet_ztplqt.c @@ -30,8 +30,10 @@ INSERT_TASK_ztplqt( const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); - CHAMELEON_Complex64_t *work = options->ws_worker; -#pragma omp task firstprivate(M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt, work) depend(inout:ptrA[0], ptrB[0], ptrT[0]) - CORE_ztplqt( M, N, L, ib, - ptrA, lda, ptrB, ldb, ptrT, ldt, work ); +#pragma omp task firstprivate(M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt) depend(inout:ptrA[0], ptrB[0], ptrT[0]) + { + CHAMELEON_Complex64_t work[options->ws_wsize]; + CORE_ztplqt( M, N, L, ib, + ptrA, lda, ptrB, ldb, ptrT, ldt, work ); + } } diff --git a/runtime/openmp/codelets/codelet_ztpmlqt.c b/runtime/openmp/codelets/codelet_ztpmlqt.c index 769c66194..3601df373 100644 --- a/runtime/openmp/codelets/codelet_ztpmlqt.c +++ b/runtime/openmp/codelets/codelet_ztpmlqt.c @@ -30,8 +30,10 @@ INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn); - CHAMELEON_Complex64_t *work = options->ws_worker; -#pragma omp task firstprivate(side, trans, M, N, K, L, ib, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work) depend(in:ptrV[0], ptrT[0]) depend(inout:ptrA[0], ptrB[0]) - CORE_ztpmlqt( side, trans, M, N, K, L, ib, - ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work ); +#pragma omp task firstprivate(side, trans, M, N, K, L, ib, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb) depend(in:ptrV[0], ptrT[0]) depend(inout:ptrA[0], ptrB[0]) + { + CHAMELEON_Complex64_t work[options->ws_wsize]; + CORE_ztpmlqt( side, trans, M, N, K, L, ib, + ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work ); + } } diff --git a/runtime/openmp/codelets/codelet_ztpmqrt.c b/runtime/openmp/codelets/codelet_ztpmqrt.c index 526017942..3d5225a9e 100644 --- a/runtime/openmp/codelets/codelet_ztpmqrt.c +++ b/runtime/openmp/codelets/codelet_ztpmqrt.c @@ -30,8 +30,10 @@ INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn); - CHAMELEON_Complex64_t *work = options->ws_worker; -#pragma omp task firstprivate(side, trans, M, N, K, L, ib, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work) depend(in:ptrV[0], ptrT[0]) depend(inout:ptrA[0], ptrB[0]) +#pragma omp task firstprivate(side, trans, M, N, K, L, ib, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb) depend(in:ptrV[0], ptrT[0]) depend(inout:ptrA[0], ptrB[0]) + { + CHAMELEON_Complex64_t tmp[options->ws_wsize]; CORE_ztpmqrt( side, trans, M, N, K, L, ib, - ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work ); + ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, tmp ); + } } diff --git a/runtime/openmp/codelets/codelet_ztpqrt.c b/runtime/openmp/codelets/codelet_ztpqrt.c index 26dd08848..f25eb1684 100644 --- a/runtime/openmp/codelets/codelet_ztpqrt.c +++ b/runtime/openmp/codelets/codelet_ztpqrt.c @@ -29,8 +29,10 @@ INSERT_TASK_ztpqrt( const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); - CHAMELEON_Complex64_t *work = options->ws_worker; -#pragma omp task firstprivate(M, N, L, ib, ptrT, ldt, ptrA, lda, ptrB, ldb, work) depend(in:ptrT[0]) depend(inout:ptrA[0], ptrB[0]) - CORE_ztpqrt( M, N, L, ib, - ptrA, lda, ptrB, ldb, ptrT, ldt, work ); +#pragma omp task firstprivate(M, N, L, ib, ptrT, ldt, ptrA, lda, ptrB, ldb) depend(in:ptrT[0]) depend(inout:ptrA[0], ptrB[0]) + { + CHAMELEON_Complex64_t tmp[options->ws_wsize]; + CORE_ztpqrt( M, N, L, ib, + ptrA, lda, ptrB, ldb, ptrT, ldt, tmp ); + } } diff --git a/runtime/openmp/codelets/codelet_ztslqt.c b/runtime/openmp/codelets/codelet_ztslqt.c index d17db6922..3d9b75e9c 100644 --- a/runtime/openmp/codelets/codelet_ztslqt.c +++ b/runtime/openmp/codelets/codelet_ztslqt.c @@ -109,8 +109,10 @@ void INSERT_TASK_ztslqt(const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n); CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); - CHAMELEON_Complex64_t *tau = options->ws_worker; - CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n ); -#pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work) depend(inout:ptrA1[0], ptrA2[0], ptrT[0]) - CORE_ztslqt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work); +#pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt) depend(inout:ptrA1[0], ptrA2[0], ptrT[0]) + { + CHAMELEON_Complex64_t tau[options->ws_wsize]; + CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n ); + CORE_ztslqt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work); + } } diff --git a/runtime/openmp/codelets/codelet_ztsmlq.c b/runtime/openmp/codelets/codelet_ztsmlq.c index 83611cf1f..6b11db9fb 100644 --- a/runtime/openmp/codelets/codelet_ztsmlq.c +++ b/runtime/openmp/codelets/codelet_ztsmlq.c @@ -140,9 +140,11 @@ void INSERT_TASK_ztsmlq(const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn); - CHAMELEON_Complex64_t *work = options->ws_worker; int ldwork = side == ChamLeft ? ib : nb; -#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0]) - CORE_ztsmlq(side, trans, m1, n1, m2, n2, k, ib, - ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork); +#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0]) + { + CHAMELEON_Complex64_t work[options->ws_wsize]; + CORE_ztsmlq(side, trans, m1, n1, m2, n2, k, ib, + ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork); + } } diff --git a/runtime/openmp/codelets/codelet_ztsmlq_hetra1.c b/runtime/openmp/codelets/codelet_ztsmlq_hetra1.c index 57071465d..2ba3f99f5 100644 --- a/runtime/openmp/codelets/codelet_ztsmlq_hetra1.c +++ b/runtime/openmp/codelets/codelet_ztsmlq_hetra1.c @@ -39,9 +39,11 @@ void INSERT_TASK_ztsmlq_hetra1(const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn); - CHAMELEON_Complex64_t *work = options->ws_worker; int ldwork = side == ChamLeft ? ib : nb; -#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0]) - CORE_ztsmlq_hetra1(side, trans, m1, n1, m2, n2, k, - ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork); +#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0]) + { + CHAMELEON_Complex64_t work[options->ws_wsize]; + CORE_ztsmlq_hetra1(side, trans, m1, n1, m2, n2, k, + ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork); + } } diff --git a/runtime/openmp/codelets/codelet_ztsmqr.c b/runtime/openmp/codelets/codelet_ztsmqr.c index e6beea224..b75e91f3d 100644 --- a/runtime/openmp/codelets/codelet_ztsmqr.c +++ b/runtime/openmp/codelets/codelet_ztsmqr.c @@ -140,9 +140,11 @@ void INSERT_TASK_ztsmqr(const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn); - CHAMELEON_Complex64_t *work = options->ws_worker; int ldwork = side == ChamLeft ? ib : nb; -#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0]) - CORE_ztsmqr(side, trans, m1, n1, m2, n2, k, ib, - ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork); +#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0]) + { + CHAMELEON_Complex64_t work[options->ws_wsize]; + CORE_ztsmqr(side, trans, m1, n1, m2, n2, k, ib, + ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork); + } } diff --git a/runtime/openmp/codelets/codelet_ztsmqr_hetra1.c b/runtime/openmp/codelets/codelet_ztsmqr_hetra1.c index 9a1fe799e..6ba9d4331 100644 --- a/runtime/openmp/codelets/codelet_ztsmqr_hetra1.c +++ b/runtime/openmp/codelets/codelet_ztsmqr_hetra1.c @@ -39,9 +39,11 @@ void INSERT_TASK_ztsmqr_hetra1(const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn); - CHAMELEON_Complex64_t *work = options->ws_worker; int ldwork = side == ChamLeft ? ib : nb; -#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0]) - CORE_ztsmqr_hetra1(side, trans, m1, n1, m2, n2, k, - ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork); +#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0]) + { + CHAMELEON_Complex64_t work[options->ws_wsize]; + CORE_ztsmqr_hetra1(side, trans, m1, n1, m2, n2, k, + ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork); + } } diff --git a/runtime/openmp/codelets/codelet_ztsqrt.c b/runtime/openmp/codelets/codelet_ztsqrt.c index b7561d8a9..8ee1cba55 100644 --- a/runtime/openmp/codelets/codelet_ztsqrt.c +++ b/runtime/openmp/codelets/codelet_ztsqrt.c @@ -98,8 +98,10 @@ void INSERT_TASK_ztsqrt(const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n); CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); - CHAMELEON_Complex64_t *tau = options->ws_worker; - CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n ); -#pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, work, tau) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0]) - CORE_ztsqrt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work); +#pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0]) + { + CHAMELEON_Complex64_t tau[options->ws_wsize]; + CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n ); + CORE_ztsqrt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work); + } } diff --git a/runtime/openmp/codelets/codelet_ztstrf.c b/runtime/openmp/codelets/codelet_ztstrf.c index 2748674e5..75d2920ce 100644 --- a/runtime/openmp/codelets/codelet_ztstrf.c +++ b/runtime/openmp/codelets/codelet_ztstrf.c @@ -104,7 +104,9 @@ void INSERT_TASK_ztstrf(const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); CHAMELEON_Complex64_t *ptrU = RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un); CHAMELEON_Complex64_t *ptrL = RTBLKADDR(L, CHAMELEON_Complex64_t, Lm, Ln); - CHAMELEON_Complex64_t *work = options->ws_worker; -#pragma omp task firstprivate(m, n, ib, nb, ptrU, ldu, ptrA, lda, ptrL, ldl, IPIV, work, iinfo) depend(inout:ptrA[0], ptrU[0], ptrL[0]) - CORE_ztstrf(m, n, ib, nb, ptrU, ldu, ptrA, lda, ptrL, ldl, IPIV, work, nb, &iinfo); +#pragma omp task firstprivate(m, n, ib, nb, ptrU, ldu, ptrA, lda, ptrL, ldl, IPIV, iinfo) depend(inout:ptrA[0], ptrU[0], ptrL[0]) + { + CHAMELEON_Complex64_t work[options->ws_wsize]; + CORE_ztstrf(m, n, ib, nb, ptrU, ldu, ptrA, lda, ptrL, ldl, IPIV, work, nb, &iinfo); + } } diff --git a/runtime/openmp/codelets/codelet_zttlqt.c b/runtime/openmp/codelets/codelet_zttlqt.c index c8567b1f1..fb37bdfaa 100644 --- a/runtime/openmp/codelets/codelet_zttlqt.c +++ b/runtime/openmp/codelets/codelet_zttlqt.c @@ -110,8 +110,10 @@ void INSERT_TASK_zttlqt(const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n); CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); - CHAMELEON_Complex64_t *tau = options->ws_worker; - CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n ); -#pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, work, tau) depend(inout:ptrA1[0], ptrA2[0], ptrT[0]) - CORE_zttlqt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work); +#pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt) depend(inout:ptrA1[0], ptrA2[0], ptrT[0]) + { + CHAMELEON_Complex64_t tau[options->ws_wsize]; + CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n ); + CORE_zttlqt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work); + } } diff --git a/runtime/openmp/codelets/codelet_zttmlq.c b/runtime/openmp/codelets/codelet_zttmlq.c index e5093489c..687131351 100644 --- a/runtime/openmp/codelets/codelet_zttmlq.c +++ b/runtime/openmp/codelets/codelet_zttmlq.c @@ -132,9 +132,11 @@ void INSERT_TASK_zttmlq(const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn); - CHAMELEON_Complex64_t *work = options->ws_worker; int ldwork = side == ChamLeft ? ib : nb; -#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0]) - CORE_zttmlq(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, - ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork); +#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0]) + { + CHAMELEON_Complex64_t work[options->ws_wsize]; + CORE_zttmlq(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, + ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork); + } } diff --git a/runtime/openmp/codelets/codelet_zttmqr.c b/runtime/openmp/codelets/codelet_zttmqr.c index b28b47eb3..7a2a10d68 100644 --- a/runtime/openmp/codelets/codelet_zttmqr.c +++ b/runtime/openmp/codelets/codelet_zttmqr.c @@ -138,9 +138,11 @@ void INSERT_TASK_zttmqr(const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn); - CHAMELEON_Complex64_t *work = options->ws_worker; int ldwork = side == ChamLeft ? ib : nb; -#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0]) - CORE_zttmqr(side, trans, m1, n1, m2, n2, k, ib, - ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork); +#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0]) + { + CHAMELEON_Complex64_t work[options->ws_wsize]; + CORE_zttmqr(side, trans, m1, n1, m2, n2, k, ib, + ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork); + } } diff --git a/runtime/openmp/codelets/codelet_zunmlq.c b/runtime/openmp/codelets/codelet_zunmlq.c index fc16b5e13..c9852c457 100644 --- a/runtime/openmp/codelets/codelet_zunmlq.c +++ b/runtime/openmp/codelets/codelet_zunmlq.c @@ -121,8 +121,10 @@ void INSERT_TASK_zunmlq(const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn); - CHAMELEON_Complex64_t *work = options->ws_worker; -#pragma omp task firstprivate(side, trans, m, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc, work) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0]) - CORE_zunmlq(side, trans, m, n, k, ib, - ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb); +#pragma omp task firstprivate(side, trans, m, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0]) + { + CHAMELEON_Complex64_t work[options->ws_wsize]; + CORE_zunmlq(side, trans, m, n, k, ib, + ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb); + } } diff --git a/runtime/openmp/codelets/codelet_zunmqr.c b/runtime/openmp/codelets/codelet_zunmqr.c index 207469b5d..1254dbec5 100644 --- a/runtime/openmp/codelets/codelet_zunmqr.c +++ b/runtime/openmp/codelets/codelet_zunmqr.c @@ -121,8 +121,10 @@ void INSERT_TASK_zunmqr(const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn); - CHAMELEON_Complex64_t *work = options->ws_worker; -#pragma omp task firstprivate(side, trans, m, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc, work) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0]) - CORE_zunmqr(side, trans, m, n, k, ib, - ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb); +#pragma omp task firstprivate(side, trans, m, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0]) + { + CHAMELEON_Complex64_t tmp[options->ws_wsize]; + CORE_zunmqr(side, trans, m, n, k, ib, + ptrA, lda, ptrT, ldt, ptrC, ldc, tmp, nb); + } } diff --git a/runtime/openmp/control/runtime_options.c b/runtime/openmp/control/runtime_options.c index ca81fd63c..81e2cd206 100644 --- a/runtime/openmp/control/runtime_options.c +++ b/runtime/openmp/control/runtime_options.c @@ -46,8 +46,10 @@ void RUNTIME_options_finalize( RUNTIME_option_t *option, CHAM_context_t *chamctx int RUNTIME_options_ws_alloc( RUNTIME_option_t *options, size_t worker_size, size_t host_size ) { if (worker_size > 0) { - // TODO used for scratch, maybe we can do better than malloc - options->ws_worker = malloc(worker_size* sizeof(char)); + // NOTE: we set the size, but instead of doing a malloc shared by multiple workers, + // we just create a VLA in the relevant codelets, within the task's body. + // This way we ensure the "scratch" is thread local and not shared by multiple threads. + options->ws_worker = NULL; options->ws_wsize = worker_size; } // FIXME: handle ws_host if needed for omp target -- GitLab