Commit 2c0e3003 authored by Philippe Virouleau

Various fixes for QR.

  - only use ws_worker for scratch in codelet
  - add taskwait before freeing the scratch
parent b78fbfdd
......@@ -50,6 +50,12 @@
* \retval CHAMELEON_SUCCESS successful exit
*
*/
#ifdef CHAMELEON_Init
#undef CHAMELEON_Init
#endif
#ifdef CHAMELEON_Finalize
#undef CHAMELEON_Finalize
#endif
int CHAMELEON_Init(int cores, int gpus)
{
return CHAMELEON_InitPar(cores, gpus, -1);
......
......@@ -139,10 +139,16 @@ int CHAMELEON_Sequence_Wait (RUNTIME_sequence_t *sequence);
#define CHAMELEON_INIT(nworkers, ncudas)\
CHAMELEON_Init(nworkers, ncudas);\
_Pragma("omp parallel")\
_Pragma("omp master")
_Pragma("omp master")\
{
#define CHAMELEON_FINALIZE()\
}\
CHAMELEON_Finalize();
#else
#define CHAMELEON_INIT(nworkers, ncudas)\
CHAMELEON_Init(nworkers, ncudas);
#define CHAMELEON_FINALIZE()\
CHAMELEON_Finalize();
#endif
END_C_DECLS
......
......@@ -98,7 +98,7 @@ void INSERT_TASK_zgelqt(const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
CHAMELEON_Complex64_t *TAU = options->ws_worker;
CHAMELEON_Complex64_t *work = options->ws_host;
#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, work, TAU) depend(inout:ptrA[0:Am*An]) depend(inout:ptrT[0:Tm*Tn])
CHAMELEON_Complex64_t *work = TAU + chameleon_max( m, n );
#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, work, TAU) depend(inout:ptrA[0]) depend(inout:ptrT[0])
CORE_zgelqt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work);
}
......@@ -99,7 +99,7 @@ void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
CHAMELEON_Complex64_t *TAU = options->ws_worker;
CHAMELEON_Complex64_t *work = options->ws_host;
#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, work, TAU) depend(inout:ptrA[0:Am*An]) depend(inout:ptrT[0:Tm*Tn])
CHAMELEON_Complex64_t *work = TAU + chameleon_max(m, n);
#pragma omp task firstprivate(m, n, ib, ptrA, lda, ptrT, ldt, work, TAU) depend(inout:ptrA[0]) depend(inout:ptrT[0])
CORE_zgeqrt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work);
}
......@@ -35,7 +35,7 @@ void INSERT_TASK_zherfb(const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn);
CHAMELEON_Complex64_t *work = options->ws_host;
#pragma omp task firstprivate(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, work) depend(in:ptrA[0:Am*An], ptrT[0:Tm*Tn]) depend(inout:ptrC[0:Cm*Cn])
CHAMELEON_Complex64_t *work = options->ws_worker;
#pragma omp task firstprivate(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, work) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0])
CORE_zherfb(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb);
}
......@@ -40,7 +40,7 @@ void INSERT_TASK_zlacpyx(const RUNTIME_option_t *options,
{
CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A + displA, CHAMELEON_Complex64_t, Am, An);
CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B + displB, CHAMELEON_Complex64_t, Bm, Bn);
#pragma omp task firstprivate(uplo, m, n, ptrA, lda, ptrB, ldb) depend(in:ptrA[0:Am*An]) depend(inout:ptrB[0:Bm*Bn])
#pragma omp task firstprivate(uplo, m, n, ptrA, lda, ptrB, ldb) depend(in:ptrA[0]) depend(inout:ptrB[0])
CORE_zlacpy(uplo, m, n, ptrA, lda, ptrB, ldb);
}
......
......@@ -32,7 +32,7 @@ void INSERT_TASK_zlantr(const RUNTIME_option_t *options,
{
CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
double *ptrB = RTBLKADDR(B, double, Bm, Bn);
double *work = options->ws_host;
double *work = options->ws_worker;
#pragma omp task firstprivate(norm, uplo, diag, M, N, ptrA, LDA, work, ptrB) depend(in:ptrA[0]) depend(inout:ptrB[0])
CORE_zlantr(norm, uplo, diag, M, N, ptrA, LDA, work, ptrB);
}
......@@ -69,6 +69,6 @@ void INSERT_TASK_zlaset(const RUNTIME_option_t *options,
const CHAM_desc_t *A, int Am, int An, int LDA)
{
CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
#pragma omp task firstprivate(uplo, M, N, alpha, beta, ptrA, LDA) depend(inout:ptrA[0:Am*An])
#pragma omp task firstprivate(uplo, M, N, alpha, beta, ptrA, LDA) depend(inout:ptrA[0])
CORE_zlaset(uplo, M, N, alpha, beta, ptrA, LDA);
}
......@@ -35,6 +35,6 @@ void INSERT_TASK_zplrnt( const RUNTIME_option_t *options,
int bigM, int m0, int n0, unsigned long long int seed )
{
CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
#pragma omp task firstprivate(m, n, ptrA, lda, bigM, m0, n0, seed) depend(inout:ptrA[0:Am*An])
#pragma omp task firstprivate(m, n, ptrA, lda, bigM, m0, n0, seed) depend(inout:ptrA[0])
CORE_zplrnt( m, n, ptrA, lda, bigM, m0, n0, seed );
}
......@@ -30,8 +30,8 @@ INSERT_TASK_ztplqt( const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
CHAMELEON_Complex64_t *work = options->ws_host;
#pragma omp task firstprivate(M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt, work) depend(inout:ptrA[0:Am*An], ptrB[0:Bm*Bn], ptrT[0:Tm*Tn])
CHAMELEON_Complex64_t *work = options->ws_worker;
#pragma omp task firstprivate(M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt, work) depend(inout:ptrA[0], ptrB[0], ptrT[0])
CORE_ztplqt( M, N, L, ib,
ptrA, lda, ptrB, ldb, ptrT, ldt, work );
}
......@@ -30,7 +30,7 @@ INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
CHAMELEON_Complex64_t *work = options->ws_host;
CHAMELEON_Complex64_t *work = options->ws_worker;
#pragma omp task firstprivate(side, trans, M, N, K, L, ib, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work) depend(in:ptrV[0], ptrT[0]) depend(inout:ptrA[0], ptrB[0])
CORE_ztpmlqt( side, trans, M, N, K, L, ib,
ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work );
......
......@@ -30,7 +30,7 @@ INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
CHAMELEON_Complex64_t *work = options->ws_host;
CHAMELEON_Complex64_t *work = options->ws_worker;
#pragma omp task firstprivate(side, trans, M, N, K, L, ib, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work) depend(in:ptrV[0], ptrT[0]) depend(inout:ptrA[0], ptrB[0])
CORE_ztpmqrt( side, trans, M, N, K, L, ib,
ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work );
......
......@@ -29,7 +29,7 @@ INSERT_TASK_ztpqrt( const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
CHAMELEON_Complex64_t *work = options->ws_host;
CHAMELEON_Complex64_t *work = options->ws_worker;
#pragma omp task firstprivate(M, N, L, ib, ptrT, ldt, ptrA, lda, ptrB, ldb, work) depend(in:ptrT[0]) depend(inout:ptrA[0], ptrB[0])
CORE_ztpqrt( M, N, L, ib,
ptrA, lda, ptrB, ldb, ptrT, ldt, work );
......
......@@ -109,8 +109,8 @@ void INSERT_TASK_ztslqt(const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n);
CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
CHAMELEON_Complex64_t *work = options->ws_worker;
CHAMELEON_Complex64_t *tau = options->ws_host;
CHAMELEON_Complex64_t *tau = options->ws_worker;
CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
#pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work) depend(inout:ptrA1[0], ptrA2[0], ptrT[0])
CORE_ztslqt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
}
......@@ -98,8 +98,8 @@ void INSERT_TASK_ztsqrt(const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n);
CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
CHAMELEON_Complex64_t *work = options->ws_worker;
CHAMELEON_Complex64_t *tau = options->ws_host;
CHAMELEON_Complex64_t *tau = options->ws_worker;
CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
#pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, work, tau) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0])
CORE_ztsqrt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
}
......@@ -110,8 +110,8 @@ void INSERT_TASK_zttlqt(const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n);
CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
CHAMELEON_Complex64_t *work = options->ws_worker;
CHAMELEON_Complex64_t *tau = options->ws_host;
CHAMELEON_Complex64_t *tau = options->ws_worker;
CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
#pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, work, tau) depend(inout:ptrA1[0], ptrA2[0], ptrT[0])
CORE_zttlqt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
}
......@@ -110,8 +110,8 @@ void INSERT_TASK_zttqrt(const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n);
CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
CHAMELEON_Complex64_t *work = options->ws_worker;
CHAMELEON_Complex64_t *tau = options->ws_host;
CHAMELEON_Complex64_t *tau = options->ws_worker;
CHAMELEON_Complex64_t *work = tau + chameleon_max( m, n );
#pragma omp task firstprivate(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, work, tau) depend(inout:ptrA1[0], ptrA2[0], ptrT[0])
CORE_zttqrt(m, n, ib, ptrA1, lda1, ptrA2, lda2, ptrT, ldt, tau, work);
}
......@@ -50,23 +50,18 @@ int RUNTIME_options_ws_alloc( RUNTIME_option_t *options, size_t worker_size, siz
options->ws_worker = malloc(worker_size* sizeof(char));
options->ws_wsize = worker_size;
}
if (host_size > 0) {
// TODO used for scratch, maybe we can do better than malloc
options->ws_host = malloc(host_size * sizeof(char));
options->ws_hsize = host_size;
}
// FIXME: handle ws_host if needed for omp target
return CHAMELEON_SUCCESS;
}
int RUNTIME_options_ws_free( RUNTIME_option_t *options )
{
if (options->ws_wsize) {
// This one is not trivial: the free should be submitted as a task which depends
// on existing task using scratch, but we don't have a dependency for this, so we sync.
#pragma omp taskwait
free(options->ws_worker);
options->ws_wsize = 0;
}
if (options->ws_hsize) {
free(options->ws_host);
options->ws_hsize = 0;
}
return CHAMELEON_SUCCESS;
}
......@@ -751,15 +751,13 @@ main(int argc, char *argv[]) {
int return_code;
/* Initialize CHAMELEON */
/* NOTE: do *NOT* add a ';' at the end of this call, as it may be a #pragma omp parallel */
CHAMELEON_INIT( iparam[IPARAM_THRDNBR],
iparam[IPARAM_NCUDAS] )
{
iparam[IPARAM_NCUDAS] );
// NOTE: OpenMP needs this, as Chameleon's init/finalize add '{'/'}',
// and 'return' is not allowed in parallel regions.
return_code = CHAMELEON_Main(iparam, argv[0], start, stop, step);
}
CHAMELEON_Finalize();
CHAMELEON_FINALIZE();
return return_code;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment