Commit 5ca2d21c authored by Philippe Virouleau
Browse files

Use VLA instead of malloc for scratch

parent beadac2a
......@@ -132,9 +132,11 @@ void INSERT_TASK_zttmlq(const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
CHAMELEON_Complex64_t *work = options->ws_worker;
int ldwork = side == ChamLeft ? ib : nb;
#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
{
CHAMELEON_Complex64_t work[options->ws_wsize];
CORE_zttmlq(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1,
ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
}
}
......@@ -138,9 +138,11 @@ void INSERT_TASK_zttmqr(const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn);
CHAMELEON_Complex64_t *work = options->ws_worker;
int ldwork = side == ChamLeft ? ib : nb;
#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
#pragma omp task firstprivate(side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0])
{
CHAMELEON_Complex64_t work[options->ws_wsize];
CORE_zttmqr(side, trans, m1, n1, m2, n2, k, ib,
ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork);
}
}
......@@ -121,8 +121,10 @@ void INSERT_TASK_zunmlq(const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn);
CHAMELEON_Complex64_t *work = options->ws_worker;
#pragma omp task firstprivate(side, trans, m, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc, work) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0])
#pragma omp task firstprivate(side, trans, m, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0])
{
CHAMELEON_Complex64_t work[options->ws_wsize];
CORE_zunmlq(side, trans, m, n, k, ib,
ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb);
}
}
......@@ -121,8 +121,10 @@ void INSERT_TASK_zunmqr(const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn);
CHAMELEON_Complex64_t *work = options->ws_worker;
#pragma omp task firstprivate(side, trans, m, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc, work) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0])
#pragma omp task firstprivate(side, trans, m, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0])
{
CHAMELEON_Complex64_t tmp[options->ws_wsize];
CORE_zunmqr(side, trans, m, n, k, ib,
ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb);
ptrA, lda, ptrT, ldt, ptrC, ldc, tmp, nb);
}
}
......@@ -46,8 +46,10 @@ void RUNTIME_options_finalize( RUNTIME_option_t *option, CHAM_context_t *chamctx
int RUNTIME_options_ws_alloc( RUNTIME_option_t *options, size_t worker_size, size_t host_size )
{
if (worker_size > 0) {
// TODO used for scratch, maybe we can do better than malloc
options->ws_worker = malloc(worker_size* sizeof(char));
// NOTE: we set the size, but instead of doing a malloc shared by multiple workers,
// we just create a VLA in the relevant codelets, within the task's body.
// This way we ensure the "scratch" is thread local and not shared by multiple threads.
options->ws_worker = NULL;
options->ws_wsize = worker_size;
}
// FIXME: handle ws_host if needed for omp target
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment