From 2c0e30038c80f178e9855fc4a7f2d9364c4c1ab9 Mon Sep 17 00:00:00 2001 From: Philippe Virouleau <philippe.44@gmail.com> Date: Fri, 16 Nov 2018 15:06:01 +0100 Subject: [PATCH] Various fixes for QR. - only use ws_worker for scratch in codelet - add taskwait before freeing the scratch --- control/control.c | 6 ++++++ include/chameleon.h | 12 +++++++++--- runtime/openmp/codelets/codelet_zgelqt.c | 4 ++-- runtime/openmp/codelets/codelet_zgeqrt.c | 4 ++-- runtime/openmp/codelets/codelet_zherfb.c | 4 ++-- runtime/openmp/codelets/codelet_zlacpy.c | 2 +- runtime/openmp/codelets/codelet_zlantr.c | 2 +- runtime/openmp/codelets/codelet_zlaset.c | 2 +- runtime/openmp/codelets/codelet_zplrnt.c | 2 +- runtime/openmp/codelets/codelet_ztplqt.c | 4 ++-- runtime/openmp/codelets/codelet_ztpmlqt.c | 2 +- runtime/openmp/codelets/codelet_ztpmqrt.c | 2 +- runtime/openmp/codelets/codelet_ztpqrt.c | 2 +- runtime/openmp/codelets/codelet_ztslqt.c | 4 ++-- runtime/openmp/codelets/codelet_ztsqrt.c | 4 ++-- runtime/openmp/codelets/codelet_zttlqt.c | 4 ++-- runtime/openmp/codelets/codelet_zttqrt.c | 4 ++-- runtime/openmp/control/runtime_options.c | 13 ++++--------- timing/timing.c | 12 +++++------- 19 files changed, 47 insertions(+), 42 deletions(-) diff --git a/control/control.c b/control/control.c index c0657726e..b34938963 100644 --- a/control/control.c +++ b/control/control.c @@ -50,6 +50,12 @@ * \retval CHAMELEON_SUCCESS successful exit * */ +#ifdef CHAMELEON_Init +#undef CHAMELEON_Init +#endif +#ifdef CHAMELEON_Finalize +#undef CHAMELEON_Finalize +#endif int CHAMELEON_Init(int cores, int gpus) { return CHAMELEON_InitPar(cores, gpus, -1); diff --git a/include/chameleon.h b/include/chameleon.h index a8afc08c8..9166a88e1 100644 --- a/include/chameleon.h +++ b/include/chameleon.h @@ -137,12 +137,18 @@ int CHAMELEON_Sequence_Wait (RUNTIME_sequence_t *sequence); #if defined(CHAMELEON_SCHED_OPENMP) #define CHAMELEON_INIT(nworkers, ncudas)\ - CHAMELEON_Init(nworkers, ncudas);\ - _Pragma("omp parallel")\ - _Pragma("omp master") + CHAMELEON_Init(nworkers, ncudas);\ + _Pragma("omp parallel")\ + _Pragma("omp master")\ + { +#define CHAMELEON_FINALIZE()\ + }\ + CHAMELEON_Finalize(); #else #define CHAMELEON_INIT(nworkers, ncudas)\ CHAMELEON_Init(nworkers, ncudas); +#define CHAMELEON_FINALIZE()\ + CHAMELEON_Finalize(); #endif END_C_DECLS diff --git a/runtime/openmp/codelets/codelet_zgelqt.c b/runtime/openmp/codelets/codelet_zgelqt.c index 8c69936b7..1d284caaa 100644 --- a/runtime/openmp/codelets/codelet_zgelqt.c +++ b/runtime/openmp/codelets/codelet_zgelqt.c @@ -98,7 +98,7 @@ void INSERT_TASK_zgelqt(const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); CHAMELEON_Complex64_t *TAU = options->ws_worker; - CHAMELEON_Complex64_t *work = options->ws_host; -#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, work, TAU) depend(inout:ptrA[0:Am*An]) depend(inout:ptrT[0:Tm*Tn]) + CHAMELEON_Complex64_t *work = TAU + chameleon_max( m, n ); +#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, work, TAU) depend(inout:ptrA[0]) depend(inout:ptrT[0]) CORE_zgelqt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work); } diff --git a/runtime/openmp/codelets/codelet_zgeqrt.c b/runtime/openmp/codelets/codelet_zgeqrt.c index 0337e0de9..aea7735da 100644 --- a/runtime/openmp/codelets/codelet_zgeqrt.c +++ b/runtime/openmp/codelets/codelet_zgeqrt.c @@ -99,7 +99,7 @@ void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); CHAMELEON_Complex64_t *TAU = options->ws_worker; - CHAMELEON_Complex64_t *work = options->ws_host; -#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, work, TAU) depend(inout:ptrA[0:Am*An]) depend(inout:ptrT[0:Tm*Tn]) + CHAMELEON_Complex64_t *work = TAU + chameleon_max(m, n); +#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, work, TAU) depend(inout:ptrA[0]) depend(inout:ptrT[0]) CORE_zgeqrt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work); } diff --git a/runtime/openmp/codelets/codelet_zherfb.c b/runtime/openmp/codelets/codelet_zherfb.c index 1531406b2..2890651e8 100644 --- a/runtime/openmp/codelets/codelet_zherfb.c +++ b/runtime/openmp/codelets/codelet_zherfb.c @@ -35,7 +35,7 @@ void INSERT_TASK_zherfb(const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn); - CHAMELEON_Complex64_t *work = options->ws_host; -#pragma omp task firstprivate(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, work) depend(in:ptrA[0:Am*An], ptrT[0:Tm*Tn]) depend(inout:ptrC[0:Cm*Cn]) + CHAMELEON_Complex64_t *work = options->ws_worker; +#pragma omp task firstprivate(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, work) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0]) CORE_zherfb(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb); } diff --git a/runtime/openmp/codelets/codelet_zlacpy.c b/runtime/openmp/codelets/codelet_zlacpy.c index a6ab833af..4c8a2c147 100644 --- a/runtime/openmp/codelets/codelet_zlacpy.c +++ b/runtime/openmp/codelets/codelet_zlacpy.c @@ -40,7 +40,7 @@ void INSERT_TASK_zlacpyx(const RUNTIME_option_t *options, { CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A + displA, CHAMELEON_Complex64_t, Am, An); CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B + displB, CHAMELEON_Complex64_t, Bm, Bn); -#pragma omp task firstprivate(uplo, m, n, ptrA, lda, ptrB, ldb) depend(in:ptrA[0:Am*An]) depend(inout:ptrB[0:Bm*Bn]) +#pragma omp task firstprivate(uplo, m, n, ptrA, lda, ptrB, ldb) depend(in:ptrA[0]) depend(inout:ptrB[0]) CORE_zlacpy(uplo, m, n, ptrA, lda, ptrB, ldb); } diff --git a/runtime/openmp/codelets/codelet_zlantr.c b/runtime/openmp/codelets/codelet_zlantr.c index 08db23b53..00f1c3b7d 100644 --- a/runtime/openmp/codelets/codelet_zlantr.c +++ b/runtime/openmp/codelets/codelet_zlantr.c @@ -32,7 +32,7 @@ void INSERT_TASK_zlantr(const RUNTIME_option_t *options, { CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); double *ptrB = RTBLKADDR(B, double, Bm, Bn); - double *work = options->ws_host; + double *work = options->ws_worker; #pragma omp task firstprivate(norm, uplo, diag, M, N, ptrA, LDA, work, ptrB) depend(in:ptrA[0]) depend(inout:ptrB[0]) CORE_zlantr(norm, uplo, diag, M, N, ptrA, LDA, work, ptrB); } diff --git a/runtime/openmp/codelets/codelet_zlaset.c b/runtime/openmp/codelets/codelet_zlaset.c index 1dbc2e48f..5f74f186d 100644 --- a/runtime/openmp/codelets/codelet_zlaset.c +++ b/runtime/openmp/codelets/codelet_zlaset.c @@ -69,6 +69,6 @@ void INSERT_TASK_zlaset(const RUNTIME_option_t *options, const CHAM_desc_t *A, int Am, int An, int LDA) { CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); -#pragma omp task firstprivate(uplo, M, N, alpha, beta, ptrA, LDA) depend(inout:ptrA[0:Am*An]) +#pragma omp task firstprivate(uplo, M, N, alpha, beta, ptrA, LDA) depend(inout:ptrA[0]) CORE_zlaset(uplo, M, N, alpha, beta, ptrA, LDA); } diff --git a/runtime/openmp/codelets/codelet_zplrnt.c b/runtime/openmp/codelets/codelet_zplrnt.c index fde7a8d2c..ce6b6525a 100644 --- a/runtime/openmp/codelets/codelet_zplrnt.c +++ b/runtime/openmp/codelets/codelet_zplrnt.c @@ -35,6 +35,6 @@ void INSERT_TASK_zplrnt( const RUNTIME_option_t *options, int bigM, int m0, int n0, unsigned long long int seed ) { CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); -#pragma omp task firstprivate(m, n, ptrA, lda, bigM, m0, n0, seed) depend(inout:ptrA[0:Am*An]) +#pragma omp task firstprivate(m, n, ptrA, lda, bigM, m0, n0, seed) depend(inout:ptrA[0]) CORE_zplrnt( m, n, ptrA, lda, bigM, m0, n0, seed ); } diff --git a/runtime/openmp/codelets/codelet_ztplqt.c b/runtime/openmp/codelets/codelet_ztplqt.c index 2f37931e6..783a610a5 100644 --- a/runtime/openmp/codelets/codelet_ztplqt.c +++ b/runtime/openmp/codelets/codelet_ztplqt.c @@ -30,8 +30,8 @@ INSERT_TASK_ztplqt( const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); - CHAMELEON_Complex64_t *work = options->ws_host; -#pragma omp task firstprivate(M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt, work) depend(inout:ptrA[0:Am*An], ptrB[0:Bm*Bn], ptrT[0:Tm*Tn]) + CHAMELEON_Complex64_t *work = options->ws_worker; +#pragma omp task firstprivate(M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt, work) depend(inout:ptrA[0], ptrB[0], ptrT[0]) CORE_ztplqt( M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt, work ); } diff --git a/runtime/openmp/codelets/codelet_ztpmlqt.c b/runtime/openmp/codelets/codelet_ztpmlqt.c index 3746c3041..769c66194 100644 --- a/runtime/openmp/codelets/codelet_ztpmlqt.c +++ b/runtime/openmp/codelets/codelet_ztpmlqt.c @@ -30,7 +30,7 @@ INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn); - CHAMELEON_Complex64_t *work = options->ws_host; + CHAMELEON_Complex64_t *work = options->ws_worker; #pragma omp task firstprivate(side, trans, M, N, K, L, ib, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work) depend(in:ptrV[0], ptrT[0]) depend(inout:ptrA[0], ptrB[0]) CORE_ztpmlqt( side, trans, M, N, K, L, ib, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work ); diff --git a/runtime/openmp/codelets/codelet_ztpmqrt.c b/runtime/openmp/codelets/codelet_ztpmqrt.c index a5a42d95c..526017942 100644 --- a/runtime/openmp/codelets/codelet_ztpmqrt.c +++ b/runtime/openmp/codelets/codelet_ztpmqrt.c @@ -30,7 +30,7 @@ INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn); - CHAMELEON_Complex64_t *work = options->ws_host; + CHAMELEON_Complex64_t *work = options->ws_worker; #pragma omp task firstprivate(side, trans, M, N, K, L, ib, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work) depend(in:ptrV[0], ptrT[0]) depend(inout:ptrA[0], ptrB[0]) CORE_ztpmqrt( side, trans, M, N, K, L, ib, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work ); diff --git a/runtime/openmp/codelets/codelet_ztpqrt.c b/runtime/openmp/codelets/codelet_ztpqrt.c index 8930bc979..26dd08848 100644 --- a/runtime/openmp/codelets/codelet_ztpqrt.c +++ b/runtime/openmp/codelets/codelet_ztpqrt.c @@ -29,7 +29,7 @@ INSERT_TASK_ztpqrt( const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); - CHAMELEON_Complex64_t *work = options->ws_host; + CHAMELEON_Complex64_t *work = options->ws_worker; #pragma omp task firstprivate(M, N, L, ib, ptrT, ldt, ptrA, lda, ptrB, ldb, work) depend(in:ptrT[0]) depend(inout:ptrA[0], ptrB[0]) CORE_ztpqrt( M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt, work ); diff --git a/runtime/openmp/codelets/codelet_ztslqt.c b/runtime/openmp/codelets/codelet_ztslqt.c index 9a3b6db7b..d17db6922 100644 --- a/runtime/openmp/codelets/codelet_ztslqt.c +++ b/runtime/openmp/codelets/codelet_ztslqt.c @@ -109,8 +109,8 @@ void INSERT_TASK_ztslqt(const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n); CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); - CHAMELEON_Complex64_t *work = options->ws_worker; - CHAMELEON_Complex64_t *tau = options->ws_host; + CHAMELEON_Complex64_t *tau = options->ws_worker; + CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n ); #pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work) depend(inout:ptrA1[0], ptrA2[0], ptrT[0]) CORE_ztslqt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work); } diff --git a/runtime/openmp/codelets/codelet_ztsqrt.c b/runtime/openmp/codelets/codelet_ztsqrt.c index bc16fb146..b7561d8a9 100644 --- a/runtime/openmp/codelets/codelet_ztsqrt.c +++ b/runtime/openmp/codelets/codelet_ztsqrt.c @@ -98,8 +98,8 @@ void INSERT_TASK_ztsqrt(const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n); CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); - CHAMELEON_Complex64_t *work = options->ws_worker; - CHAMELEON_Complex64_t *tau = options->ws_host; + CHAMELEON_Complex64_t *tau = options->ws_worker; + CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n ); #pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, work, tau) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0]) CORE_ztsqrt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work); } diff --git a/runtime/openmp/codelets/codelet_zttlqt.c b/runtime/openmp/codelets/codelet_zttlqt.c index e693c6b7a..c8567b1f1 100644 --- a/runtime/openmp/codelets/codelet_zttlqt.c +++ b/runtime/openmp/codelets/codelet_zttlqt.c @@ -110,8 +110,8 @@ void INSERT_TASK_zttlqt(const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n); CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); - CHAMELEON_Complex64_t *work = options->ws_worker; - CHAMELEON_Complex64_t *tau = options->ws_host; + CHAMELEON_Complex64_t *tau = options->ws_worker; + CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n ); #pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, work, tau) depend(inout:ptrA1[0], ptrA2[0], ptrT[0]) CORE_zttlqt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work); } diff --git a/runtime/openmp/codelets/codelet_zttqrt.c b/runtime/openmp/codelets/codelet_zttqrt.c index 5061ef3ba..f01fff932 100644 --- a/runtime/openmp/codelets/codelet_zttqrt.c +++ b/runtime/openmp/codelets/codelet_zttqrt.c @@ -110,8 +110,8 @@ void INSERT_TASK_zttqrt(const RUNTIME_option_t *options, CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n); CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); - CHAMELEON_Complex64_t *work = options->ws_worker; - CHAMELEON_Complex64_t *tau = options->ws_host; + CHAMELEON_Complex64_t *tau = options->ws_worker; + CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n ); #pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, work, tau) depend(inout:ptrA1[0], ptrA2[0], ptrT[0]) CORE_zttqrt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work); } diff --git a/runtime/openmp/control/runtime_options.c b/runtime/openmp/control/runtime_options.c index 4511f102b..ca81fd63c 100644 --- a/runtime/openmp/control/runtime_options.c +++ b/runtime/openmp/control/runtime_options.c @@ -50,23 +50,18 @@ int RUNTIME_options_ws_alloc( RUNTIME_option_t *options, size_t worker_size, siz options->ws_worker = malloc(worker_size* sizeof(char)); options->ws_wsize = worker_size; } - if (host_size > 0) { - // TODO used for scratch, maybe we can do better than malloc - options->ws_host = malloc(host_size * sizeof(char)); - options->ws_hsize = host_size; - } + // FIXME: handle ws_host if needed for omp target return CHAMELEON_SUCCESS; } int RUNTIME_options_ws_free( RUNTIME_option_t *options ) { if (options->ws_wsize) { + // This one is not trivial: the free should be submitted as a task which depends + // on existing task using scratch, but we don't have a dependency for this, so we sync. +#pragma omp taskwait free(options->ws_worker); options->ws_wsize = 0; } - if (options->ws_hsize) { - free(options->ws_host); - options->ws_hsize = 0; - } return CHAMELEON_SUCCESS; } diff --git a/timing/timing.c b/timing/timing.c index 4afdeac80..51eddd529 100644 --- a/timing/timing.c +++ b/timing/timing.c @@ -751,15 +751,13 @@ main(int argc, char *argv[]) { int return_code; /* Initialize CHAMELEON */ - /* NOTE: do *NOT* add a ';' at the end of this call, as it may be a #pragma omp parallel */ CHAMELEON_INIT( iparam[IPARAM_THRDNBR], - iparam[IPARAM_NCUDAS] ) + iparam[IPARAM_NCUDAS] ); + // NOTE: OpenMP needs this, as Chameleon's init/finalize add '{'/'}', + // and 'return' is not allowed in parallel regions. + return_code = CHAMELEON_Main(iparam, argv[0], start, stop, step); - { - return_code = CHAMELEON_Main(iparam, argv[0], start, stop, step); - } - - CHAMELEON_Finalize(); + CHAMELEON_FINALIZE(); return return_code; } -- GitLab