Commit 729f9461 authored by PRUVOST Florent

Copy the diag tile even in the homogeneous case to allow concurrent tsqrt and unmqr executions

parent a2c4f527
......@@ -32,11 +32,8 @@
#define A(m,n) A, m, n
#define T(m,n) T, m, n
#if defined(CHAMELEON_USE_MAGMA)
#define DIAG(k) DIAG, k, 0
#else
#define DIAG(k) A, k, k
#endif
/***************************************************************************//**
* Parallel tile LQ factorization - dynamic scheduling
**/
......@@ -77,9 +74,6 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T,
/* Allocation of temporary (scratch) working space */
#if defined(CHAMELEON_USE_MAGMA)
/* necessary to use UNMLQ on GPU */
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc2(*DIAG, A->mb, A->nb, (minMT-1)*A->mb, A->nb, 0, 0, (minMT-1)*A->mb, A->nb);
/* Worker space
*
* zgelqt = max( A->nb * (ib+1), ib * (ib + A->nb) )
......@@ -104,6 +98,10 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T,
RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
/* necessary to avoid dependencies between tslqt and unmlq tasks regarding the diag tile */
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc2(*DIAG, A->mb, A->nb, (minMT-1)*A->mb, A->nb, 0, 0, (minMT-1)*A->mb, A->nb);
for (k = 0; k < min(A->mt, A->nt); k++) {
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
......@@ -113,20 +111,20 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T,
tempkm, tempkn, ib, T->nb,
A(k, k), ldak,
T(k, k), T->mb);
#if defined(CHAMELEON_USE_MAGMA)
if ( k < (A->mt-1) ) {
MORSE_TASK_zlacpy(
&options,
MorseUpper, A->mb, A->nb, A->nb,
A(k, k), ldak,
DIAG(k), A->mb );
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlaset(
&options,
MorseLower, A->mb, A->nb,
0., 1.,
DIAG(k), A->mb );
}
#endif
}
for (m = k+1; m < A->mt; m++) {
tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
ldam = BLKLDD(A, m);
......@@ -164,8 +162,6 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T,
RUNTIME_options_finalize(&options, morse);
MORSE_TASK_dataflush_all();
#if defined(CHAMELEON_USE_MAGMA)
morse_desc_mat_free(DIAG);
free(DIAG);
#endif
}
......@@ -36,11 +36,8 @@
#define A(m,n) A, (m), (n)
#define T(m,n) T, (m), (n)
#define T2(m,n) T, (m), (n)+A->nt
#if defined(CHAMELEON_USE_MAGMA)
#define DIAG(m,n) DIAG, ((n)/BS), 0
#else
#define DIAG(m,n) A, (m), (n)
#endif
/***************************************************************************//**
* Parallel tile LQ factorization (reduction Householder) - dynamic scheduling
**/
......@@ -58,6 +55,7 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
int ldak, ldam;
int tempkmin, tempkm, tempNn, tempnn, tempmm, tempNRDn;
int ib;
int nblk;
morse = morse_context_self();
if (sequence->status != MORSE_SUCCESS)
......@@ -78,12 +76,6 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
/* Allocation of temporary (scratch) working space */
#if defined(CHAMELEON_USE_MAGMA)
{
/* necessary to use UNMLQ on GPU */
int nblk = ( A->nt + BS -1 ) / BS;
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb);
}
/* Worker space
*
* zgelqt = max( A->nb * (ib+1), ib * (ib + A->nb) )
......@@ -108,6 +100,11 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
/* necessary to avoid dependencies between tasks regarding the diag tile */
nblk = ( A->nt + BS -1 ) / BS;
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb);
for (k = 0; k < min(A->mt, A->nt); k++) {
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
ldak = BLKLDD(A, k);
......@@ -119,20 +116,20 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
tempkm, tempNn, ib, T->nb,
A(k, N), ldak,
T(k, N), T->mb);
#if defined(CHAMELEON_USE_MAGMA)
if ( k < (A->mt-1) ) {
MORSE_TASK_zlacpy(
&options,
MorseUpper, tempkm, tempNn, A->nb,
A(k, N), ldak,
DIAG(k, N), ldak );
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlaset(
&options,
MorseLower, tempkm, tempNn,
0., 1.,
DIAG(k, N), ldak );
}
#endif
}
for (m = k+1; m < A->mt; m++) {
tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
ldam = BLKLDD(A, m);
......@@ -196,8 +193,6 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
RUNTIME_options_finalize(&options, morse);
MORSE_TASK_dataflush_all();
#if defined(CHAMELEON_USE_MAGMA)
morse_desc_mat_free(DIAG);
free(DIAG);
#endif
}
......@@ -32,11 +32,8 @@
#define A(m,n) A, m, n
#define T(m,n) T, m, n
#if defined(CHAMELEON_USE_MAGMA)
#define DIAG(k) DIAG, k, 0
#else
#define DIAG(k) A, k, k
#endif
/***************************************************************************//**
* Parallel tile QR factorization - dynamic scheduling
**/
......@@ -72,9 +69,6 @@ void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T,
/* Allocation of temporary (scratch) working space */
#if defined(CHAMELEON_USE_MAGMA)
/* necessary to use UNMQR on GPU */
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc2(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb);
/* Worker space
*
* zgeqrt = max( A->nb * (ib+1), ib * (ib + A->nb) )
......@@ -99,6 +93,10 @@ void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T,
RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
/* necessary to avoid dependencies between tsqrt and unmqr tasks regarding the diag tile */
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc2(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb);
for (k = 0; k < minMNT; k++) {
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
......@@ -108,20 +106,20 @@ void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T,
tempkm, tempkn, ib, T->nb,
A(k, k), ldak,
T(k, k), T->mb);
#if defined(CHAMELEON_USE_MAGMA)
if ( k < (A->nt-1) ) {
MORSE_TASK_zlacpy(
&options,
MorseLower, A->mb, A->nb, A->nb,
A(k, k), ldak,
DIAG(k), ldak );
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlaset(
&options,
MorseUpper, A->mb, A->nb,
0., 1.,
DIAG(k), ldak );
}
#endif
}
for (n = k+1; n < A->nt; n++) {
tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
MORSE_TASK_zunmqr(
......@@ -158,8 +156,6 @@ void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T,
RUNTIME_options_finalize(&options, morse);
MORSE_TASK_dataflush_all();
#if defined(CHAMELEON_USE_MAGMA)
morse_desc_mat_free(DIAG);
free(DIAG);
#endif
}
......@@ -34,11 +34,7 @@
#define A(m,n) A, (m), (n)
#define T(m,n) T, (m), (n)
#define T2(m,n) T, (m), ((n)+A->nt)
#if defined(CHAMELEON_USE_MAGMA)
#define DIAG(m,n) DIAG, ((m)/BS), 0
#else
#define DIAG(m,n) A, (m), (n)
#endif
/***************************************************************************//**
* Parallel tile QR factorization (reduction Householder) - dynamic scheduling
......@@ -57,6 +53,7 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
int ldaM, ldam, ldaMRD;
int tempkmin, tempkn, tempMm, tempnn, tempmm, tempMRDm;
int ib;
int nblk;
morse = morse_context_self();
if (sequence->status != MORSE_SUCCESS)
......@@ -77,12 +74,6 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
/* Allocation of temporary (scratch) working space */
#if defined(CHAMELEON_USE_MAGMA)
{
int nblk = ( A->mt + BS -1 ) / BS;
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb);
}
/* Worker space
*
* zgeqrt = max( A->nb * (ib+1), ib * (ib + A->nb) )
......@@ -107,6 +98,11 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
/* necessary to avoid dependencies between tasks regarding the diag tile */
nblk = ( A->mt + BS -1 ) / BS;
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb);
K = min(A->mt, A->nt);
for (k = 0; k < K; k++) {
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
......@@ -119,20 +115,20 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
tempMm, tempkn, ib, T->nb,
A(M, k), ldaM,
T(M, k), T->mb);
#if defined(CHAMELEON_USE_MAGMA)
if ( k < (A->nt-1) ) {
MORSE_TASK_zlacpy(
&options,
MorseLower, tempMm, A->nb, A->nb,
A(M, k), ldaM,
DIAG(M, k), ldaM );
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlaset(
&options,
MorseUpper, tempMm, A->nb,
0., 1.,
DIAG(M, k), ldaM );
}
#endif
}
for (n = k+1; n < A->nt; n++) {
tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
MORSE_TASK_zunmqr(
......@@ -196,8 +192,6 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
RUNTIME_options_finalize(&options, morse);
MORSE_TASK_dataflush_all();
#if defined(CHAMELEON_USE_MAGMA)
morse_desc_mat_free(DIAG);
free(DIAG);
#endif
}
......@@ -33,11 +33,7 @@
#include "common.h"
#define A(_m_,_n_) A, _m_, _n_
#if defined(CHAMELEON_USE_MAGMA)
#define DIAG(_k_) DIAG, _k_, 0
#else
#define DIAG(_k_) A, _k_, _k_
#endif
#define L(_m_,_n_) L, _m_, _n_
#define IPIV(_m_,_n_) &(IPIV[(int64_t)A->mb*((int64_t)(_m_)+(int64_t)A->mt*(int64_t)(_n_))])
......@@ -65,9 +61,6 @@ void morse_pzgetrf_incpiv(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV,
ib = MORSE_IB;
#if defined(CHAMELEON_USE_MAGMA)
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc2(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb);
h_work_size = sizeof(MORSE_Complex64_t)*( 2*ib + 2*L->nb )*2*A->mb;
d_work_size = sizeof(MORSE_Complex64_t)*( ib )*2*A->mb;
#else
......@@ -76,6 +69,10 @@ void morse_pzgetrf_incpiv(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV,
#endif
RUNTIME_options_ws_alloc( &options, h_work_size, d_work_size );
/* necessary to avoid dependencies between tasks regarding the diag tile */
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc2(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb);
for (k = 0; k < minMNT; k++) {
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
......@@ -88,7 +85,6 @@ void morse_pzgetrf_incpiv(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV,
IPIV(k, k),
k == A->mt-1, A->nb*k);
#if defined(CHAMELEON_USE_MAGMA)
if ( k < (minMNT-1) ) {
MORSE_TASK_zlacpy(
&options,
......@@ -96,7 +92,6 @@ void morse_pzgetrf_incpiv(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV,
A(k, k), ldak,
DIAG(k), ldak);
}
#endif
for (n = k+1; n < A->nt; n++) {
tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
......@@ -137,8 +132,6 @@ void morse_pzgetrf_incpiv(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV,
RUNTIME_options_finalize(&options, morse);
MORSE_TASK_dataflush_all();
#if defined(CHAMELEON_USE_MAGMA)
morse_desc_mat_free(DIAG);
free(DIAG);
#endif
}
......@@ -33,11 +33,8 @@
#define A(m,n) A, m, n
#define Q(m,n) Q, m, n
#define T(m,n) T, m, n
#if defined(CHAMELEON_USE_MAGMA)
#define DIAG(k) DIAG, k, 0
#else
#define DIAG(k) A, k, k
#endif
/***************************************************************************//**
* Parallel construction of Q using tile V (application to identity) - dynamic scheduling
**/
......@@ -77,8 +74,6 @@ void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
/* Allocation of temporary (scratch) working space */
#if defined(CHAMELEON_USE_MAGMA)
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc2(*DIAG, A->mb, A->nb, minMT*A->mb, A->nb, 0, 0, minMT*A->mb, A->nb);
/* Worker space
*
* zunmlq = A->nb * ib
......@@ -92,6 +87,10 @@ void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
/* necessary to avoid dependencies between tasks regarding the diag tile */
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc2(*DIAG, A->mb, A->nb, minMT*A->mb, A->nb, 0, 0, minMT*A->mb, A->nb);
for (k = min(A->mt, A->nt)-1; k >= 0; k--) {
tempAkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
tempAkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
......@@ -113,12 +112,12 @@ void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
T(k, n), T->mb);
}
}
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlacpy(
&options,
MorseUpper, tempkmin, tempkn, A->nb,
A(k, k), ldak,
DIAG(k), A->mb );
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlaset(
&options,
MorseLower, tempkmin, tempkn,
......@@ -141,8 +140,6 @@ void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
RUNTIME_options_finalize(&options, morse);
MORSE_TASK_dataflush_all();
#if defined(CHAMELEON_USE_MAGMA)
morse_desc_mat_free(DIAG);
free(DIAG);
#endif
}
......@@ -33,11 +33,8 @@
#define Q(m,n) Q, (m), (n)
#define T(m,n) T, (m), (n)
#define T2(m,n) T, (m), (n)+(A->nt)
#if defined(CHAMELEON_USE_MAGMA)
#define DIAG(m,n) DIAG, ((n)/BS), 0
#else
#define DIAG(m,n) A, (m), (n)
#endif
/**
* Parallel construction of Q using tile V (application to identity;
* reduction Householder) - dynamic scheduling
......@@ -58,6 +55,7 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q,
int ldqm;
int tempkm, tempkmin, tempNn, tempnn, tempmm, tempNRDn;
int ib;
int nblk;
morse = morse_context_self();
if (sequence->status != MORSE_SUCCESS)
......@@ -74,12 +72,6 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q,
ws_worker = A->nb * ib;
#if defined(CHAMELEON_USE_MAGMA)
{
/* necessary to use UNMLQ on GPU */
int nblk = ( A->nt + BS -1 ) / BS;
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb);
}
/* Worker space
*
* zunmqr = A->nb * ib
......@@ -93,6 +85,11 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q,
RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
/* necessary to avoid dependencies between tasks regarding the diag tile */
nblk = ( A->nt + BS -1 ) / BS;
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb);
K = min(A->mt, A->nt);
for (k = K-1; k >= 0; k--) {
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
......@@ -138,12 +135,12 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q,
T(k, n), T->mb);
}
}
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlacpy(
&options,
MorseUpper, tempkmin, tempNn, A->nb,
A(k, N), ldak,
DIAG(k, N), ldak );
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlaset(
&options,
MorseLower, tempkmin, tempNn,
......@@ -168,8 +165,6 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q,
RUNTIME_options_finalize(&options, morse);
MORSE_TASK_dataflush_all();
#if defined(CHAMELEON_USE_MAGMA)
morse_desc_mat_free(DIAG);
free(DIAG);
#endif
}
......@@ -33,11 +33,8 @@
#define A(m,n) A, m, n
#define Q(m,n) Q, m, n
#define T(m,n) T, m, n
#if defined(CHAMELEON_USE_MAGMA)
#define DIAG(k) DIAG, k, 0
#else
#define DIAG(k) A, k, k
#endif
/***************************************************************************//**
* Parallel construction of Q using tile V (application to identity) - dynamic scheduling
**/
......@@ -71,8 +68,6 @@ void morse_pzungqr(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
/* Allocation of temporary (scratch) working space */
#if defined(CHAMELEON_USE_MAGMA)
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc2(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb);
/* Worker space
*
* zunmqr = A->nb * ib
......@@ -86,6 +81,10 @@ void morse_pzungqr(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
/* necessary to avoid dependencies between tasks regarding the diag tile */
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc2(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb);
for (k = min(A->mt, A->nt)-1; k >= 0; k--) {
tempAkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
tempAkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
......@@ -109,12 +108,12 @@ void morse_pzungqr(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
T(m, k), T->mb);
}
}
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlacpy(
&options,
MorseLower, tempkm, tempkmin, A->nb,
A(k, k), ldak,
DIAG(k), ldak );
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlaset(
&options,
MorseUpper, tempkm, tempkmin,
......@@ -136,8 +135,6 @@ void morse_pzungqr(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
RUNTIME_options_finalize(&options, morse);
MORSE_TASK_dataflush_all();
#if defined(CHAMELEON_USE_MAGMA)
morse_desc_mat_free(DIAG);
free(DIAG);
#endif
}
......@@ -35,11 +35,7 @@
#define Q(m,n) Q, (m), (n)
#define T(m,n) T, (m), (n)
#define T2(m,n) T, (m), (n)+(A->nt)
#if defined(CHAMELEON_USE_MAGMA)
#define DIAG(m,n) DIAG, ((m)/BS), 0
#else
#define DIAG(m,n) A, (m), (n)
#endif
/**
* Parallel construction of Q using tile V (application to identity;
......@@ -61,6 +57,7 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q,
int ldbM, ldbm, ldbMRD;
int tempkn, tempMm, tempnn, tempmm, tempMRDm, tempkmin;
int ib;
int nblk;
morse = morse_context_self();
if (sequence->status != MORSE_SUCCESS)
......@@ -77,12 +74,6 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q,
ws_worker = A->nb * ib;
#if defined(CHAMELEON_USE_MAGMA)
{
int nblk = ( A->mt + BS -1 ) / BS;
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb);
}
/* Worker space
*
* zunmqr = A->nb * ib
......@@ -96,6 +87,11 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q,
RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
/* necessary to avoid dependencies between tasks regarding the diag tile */
nblk = ( A->mt + BS -1 ) / BS;
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb);
K = min(A->mt, A->nt);
for (k = K-1; k >= 0; k--) {
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
......@@ -145,12 +141,12 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q,
T(m, k), T->mb);
}
}
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlacpy(
&options,
MorseLower, tempMm, tempkmin, A->nb,
A(M, k), ldaM,
DIAG(M, k), ldaM );
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlaset(
&options,
MorseUpper, tempMm, tempkmin,
......@@ -174,8 +170,6 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q,
RUNTIME_options_finalize(&options, morse);
MORSE_TASK_dataflush_all();
#if defined(CHAMELEON_USE_MAGMA)
morse_desc_mat_free(DIAG);
free(DIAG);
#endif
}
......@@ -34,11 +34,8 @@
#define A(m,n) A, m, n
#define B(m,n) B, m, n
#define T(m,n) T, m, n
#if defined(CHAMELEON_USE_MAGMA)
#define DIAG(k) DIAG, k, 0
#else
#define DIAG(k) A, k, k
#endif
/***************************************************************************//**
* Parallel application of Q using tile V - LQ factorization - dynamic scheduling
**/
......@@ -79,9 +76,6 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
ws_worker = A->nb * ib;
#if defined(CHAMELEON_USE_MAGMA)
/* necessary to use UNMLQ on GPU */
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc2(*DIAG, A->mb, A->nb, minMT*A->mb, A->nb, 0, 0, minMT*A->mb, A->nb);
/* Worker space
*
* zunmlq = A->nb * ib
......@@ -95,6 +89,10 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
/* necessary to avoid dependencies between tasks regarding the diag tile */
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc2(*DIAG, A->mb, A->nb, minMT*A->mb, A->nb, 0, 0, minMT*A->mb, A->nb);
if (side == MorseLeft ) {
if (trans == MorseNoTrans) {
/*
......@@ -105,12 +103,12 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb;
ldak = BLKLDD(A, k);
ldbk = BLKLDD(B, k);
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlacpy(
&options,
MorseUpper, tempkmin, tempkm, A->nb,
A(k, k), ldak,
DIAG(k), A->mb );
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlaset(
&options,
MorseLower, tempkmin, tempkm,
......@@ -168,12 +166,12 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
T(k, m), T->mb);
}
}
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlacpy(
&options,
MorseUpper, tempkmin, tempkm, A->nb,
A(k, k), ldak,
DIAG(k), A->mb );
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlaset(
&options,
MorseLower, tempkmin, tempkm,
......@@ -217,12 +215,12 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
T(k, n), T->mb);
}
}
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlacpy(
&options,
MorseUpper, tempkmin, tempkn, A->nb,
A(k, k), ldak,
DIAG(k), A->mb );
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlaset(
&options,
MorseLower, tempkmin, tempkn,
......@@ -250,12 +248,12 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
tempkn = k == B->nt -1 ? B->n -k*B->nb : B->nb;
tempkmin = k == minMT-1 ? minM-k*A->mb : A->mb;
ldak = BLKLDD(A, k);
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlacpy(
&options,
MorseUpper, tempkmin, tempkn, A->nb,
A(k, k), ldak,
DIAG(k), A->mb );
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlaset(
&options,
MorseLower, tempkmin, tempkn,
......@@ -295,8 +293,6 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
RUNTIME_options_finalize(&options, morse);
MORSE_TASK_dataflush_all();
#if defined(CHAMELEON_USE_MAGMA)
morse_desc_mat_free(DIAG);
free(DIAG);
#endif
}
......@@ -35,11 +35,8 @@
#define B(m,n) B, (m), (n)
#define T(m,n) T, (m), (n)
#define T2(m,n) T, (m), (n)+A->nt
#if defined(CHAMELEON_USE_MAGMA)
#define DIAG(m,n) DIAG, ((n)/BS), 0
#else
#define DIAG(m,n) A, (m), (n)
#endif
/***************************************************************************//**
* Parallel application of Q using tile V - LQ factorization (reduction
* Householder) - dynamic scheduling
......@@ -60,6 +57,7 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
int ldbN, ldbm, ldbNRD;
int tempNn, tempkm, tempnn, tempmm, tempNRDn, tempkmin;
int ib;
int nblk;
morse = morse_context_self();
if (sequence->status != MORSE_SUCCESS)
......@@ -76,12 +74,6 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
ws_worker = A->nb * ib;
#if defined(CHAMELEON_USE_MAGMA)
{
/* necessary to use UNMLQ on GPU */
int nblk = ( A->nt + BS -1 ) / BS;
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb);
}
/* Worker space
*
* zunmlq = A->nb * ib
......@@ -95,6 +87,11 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
/* necessary to avoid dependencies between tasks regarding the diag tile */
nblk = ( A->nt + BS -1 ) / BS;
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb);
K = min(A->mt, A->nt);
if (side == MorseLeft ) {
if (trans == MorseNoTrans) {
......@@ -109,12 +106,12 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
tempkmin = min(tempkm,tempNn);
ldaN = BLKLDD(A, N);
ldbN = BLKLDD(B, N);
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlacpy(
&options,
MorseUpper, tempkmin, tempNn, A->nb,
A(k, N), ldak,
DIAG(k, N), ldak );
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlaset(
&options,
MorseLower, tempkmin, tempNn,
......@@ -219,12 +216,12 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
T(k, m), T->mb);
}
}
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlacpy(
&options,
MorseUpper, tempkmin, tempNn, A->nb,
A(k, N), ldak,
DIAG(k, N), ldak );
#if defined(CHAMELEON_USE_MAGMA)
MORSE_TASK_zlaset(
&options,