Commit 79436c1a authored by Mathieu Faverge

Cleanup of QR/LQ algorithms to reuse the D matrix if possible

parent f19a75ed
...@@ -20,8 +20,8 @@ ...@@ -20,8 +20,8 @@
*/ */
#include "control/common.h" #include "control/common.h"
void chameleon_pzgebrd_ge2gb(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, void chameleon_pzgebrd_ge2gb( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request) RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{ {
int k; int k;
int tempkm, tempkn; int tempkm, tempkn;
...@@ -38,12 +38,12 @@ void chameleon_pzgebrd_ge2gb(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, ...@@ -38,12 +38,12 @@ void chameleon_pzgebrd_ge2gb(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
D1 = chameleon_desc_submatrix(D, k*D->mb, k*D->nb, D->m-k*D->mb, tempkn); D1 = chameleon_desc_submatrix(D, k*D->mb, k*D->nb, D->m-k*D->mb, tempkn);
} }
chameleon_pzgeqrf( A1, T1, D1, chameleon_pzgeqrf( genD, A1, T1, D1,
sequence, request); sequence, request);
chameleon_pzunmqr( ChamLeft, ChamConjTrans, chameleon_pzunmqr( 0, ChamLeft, ChamConjTrans,
A1, A2, T1, D1, A1, A2, T1, D1,
sequence, request); sequence, request);
if (k+1 < A->nt){ if (k+1 < A->nt){
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
...@@ -55,12 +55,12 @@ void chameleon_pzgebrd_ge2gb(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, ...@@ -55,12 +55,12 @@ void chameleon_pzgebrd_ge2gb(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
D1 = chameleon_desc_submatrix(D, k*D->mb, (k+1)*D->nb, tempkm, D->n-(k+1)*D->nb); D1 = chameleon_desc_submatrix(D, k*D->mb, (k+1)*D->nb, tempkm, D->n-(k+1)*D->nb);
} }
chameleon_pzgelqf( A1, T1, D1, chameleon_pzgelqf( genD, A1, T1, D1,
sequence, request); sequence, request);
chameleon_pzunmlq( ChamRight, ChamConjTrans, chameleon_pzunmlq( 0, ChamRight, ChamConjTrans,
A1, A2, T1, D1, A1, A2, T1, D1,
sequence, request); sequence, request);
} }
} }
} }
...@@ -74,12 +74,12 @@ void chameleon_pzgebrd_ge2gb(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, ...@@ -74,12 +74,12 @@ void chameleon_pzgebrd_ge2gb(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
if ( D != NULL ) { if ( D != NULL ) {
D1 = chameleon_desc_submatrix(D, k*D->mb, k*D->nb, tempkm, D->n-k*D->nb); D1 = chameleon_desc_submatrix(D, k*D->mb, k*D->nb, tempkm, D->n-k*D->nb);
} }
chameleon_pzgelqf( A1, T1, D1, chameleon_pzgelqf( genD, A1, T1, D1,
sequence, request); sequence, request);
chameleon_pzunmlq( ChamRight, ChamConjTrans, chameleon_pzunmlq( 0, ChamRight, ChamConjTrans,
A1, A2, T1, D1, A1, A2, T1, D1,
sequence, request); sequence, request);
if (k+1 < A->mt){ if (k+1 < A->mt){
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
...@@ -91,12 +91,12 @@ void chameleon_pzgebrd_ge2gb(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, ...@@ -91,12 +91,12 @@ void chameleon_pzgebrd_ge2gb(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
D1 = chameleon_desc_submatrix(D, (k+1)*D->mb, k*D->nb, D->m-(k+1)*D->mb, tempkn); D1 = chameleon_desc_submatrix(D, (k+1)*D->mb, k*D->nb, D->m-(k+1)*D->mb, tempkn);
} }
chameleon_pzgeqrf( A1, T1, D1, chameleon_pzgeqrf( genD, A1, T1, D1,
sequence, request); sequence, request);
chameleon_pzunmqr( ChamLeft, ChamConjTrans, chameleon_pzunmqr( 0, ChamLeft, ChamConjTrans,
A1, A2, T1, D1, A1, A2, T1, D1,
sequence, request); sequence, request);
} }
} }
} }
......
...@@ -27,17 +27,13 @@ ...@@ -27,17 +27,13 @@
#define A(m,n) A, m, n #define A(m,n) A, m, n
#define T(m,n) T, m, n #define T(m,n) T, m, n
#if defined(CHAMELEON_COPY_DIAG) #define D(k) D, k, k
#define D(k) D, k, 0
#else
#define D(k) D, k, k
#endif
/** /**
* Parallel tile LQ factorization - dynamic scheduling * Parallel tile LQ factorization - dynamic scheduling
*/ */
void chameleon_pzgelqf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request) RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{ {
CHAM_context_t *chamctxt; CHAM_context_t *chamctxt;
RUNTIME_option_t options; RUNTIME_option_t options;
...@@ -63,7 +59,8 @@ void chameleon_pzgelqf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, ...@@ -63,7 +59,8 @@ void chameleon_pzgelqf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
} }
if ( D == NULL ) { if ( D == NULL ) {
D = A; D = A;
genD = 0;
} }
/* /*
...@@ -100,8 +97,7 @@ void chameleon_pzgelqf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, ...@@ -100,8 +97,7 @@ void chameleon_pzgelqf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
tempkm, tempkn, ib, T->nb, tempkm, tempkn, ib, T->nb,
A(k, k), ldak, A(k, k), ldak,
T(k, k), T->mb); T(k, k), T->mb);
if ( k < (A->mt-1) ) { if ( genD ) {
#if defined(CHAMELEON_COPY_DIAG)
INSERT_TASK_zlacpy( INSERT_TASK_zlacpy(
&options, &options,
ChamUpper, A->mb, A->nb, A->nb, ChamUpper, A->mb, A->nb, A->nb,
...@@ -113,7 +109,6 @@ void chameleon_pzgelqf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, ...@@ -113,7 +109,6 @@ void chameleon_pzgelqf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
ChamLower, A->mb, A->nb, ChamLower, A->mb, A->nb,
0., 1., 0., 1.,
D(k), ldak ); D(k), ldak );
#endif
#endif #endif
} }
for (m = k+1; m < A->mt; m++) { for (m = k+1; m < A->mt; m++) {
......
...@@ -30,9 +30,9 @@ ...@@ -30,9 +30,9 @@
/* /*
* Parallel tile LQ factorization (reduction Householder) - dynamic scheduling * Parallel tile LQ factorization (reduction Householder) - dynamic scheduling
*/ */
void chameleon_pzgelqf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A, void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t *A,
CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D, CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request) RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{ {
CHAM_context_t *chamctxt; CHAM_context_t *chamctxt;
RUNTIME_option_t options; RUNTIME_option_t options;
...@@ -55,7 +55,8 @@ void chameleon_pzgelqf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A, ...@@ -55,7 +55,8 @@ void chameleon_pzgelqf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
ib = CHAMELEON_IB; ib = CHAMELEON_IB;
if ( D == NULL ) { if ( D == NULL ) {
D = A; D = A;
genD = 0;
} }
/* /*
...@@ -108,8 +109,7 @@ void chameleon_pzgelqf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A, ...@@ -108,8 +109,7 @@ void chameleon_pzgelqf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
tempkm, temppn, ib, T->nb, tempkm, temppn, ib, T->nb,
A( k, p), ldak, A( k, p), ldak,
T(k, p), T->mb); T(k, p), T->mb);
if ( k < (A->mt-1) ) { if ( genD ) {
#if defined(CHAMELEON_COPY_DIAG)
INSERT_TASK_zlacpy( INSERT_TASK_zlacpy(
&options, &options,
ChamUpper, tempkm, temppn, A->nb, ChamUpper, tempkm, temppn, A->nb,
...@@ -121,7 +121,6 @@ void chameleon_pzgelqf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A, ...@@ -121,7 +121,6 @@ void chameleon_pzgelqf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
ChamLower, tempkm, temppn, ChamLower, tempkm, temppn,
0., 1., 0., 1.,
D(k, p), ldak ); D(k, p), ldak );
#endif
#endif #endif
} }
for (m = k+1; m < A->mt; m++) { for (m = k+1; m < A->mt; m++) {
......
...@@ -29,17 +29,13 @@ ...@@ -29,17 +29,13 @@
#define A(m,n) A, (m), (n) #define A(m,n) A, (m), (n)
#define T(m,n) T, (m), (n) #define T(m,n) T, (m), (n)
#define T2(m,n) T, (m), (n)+A->nt #define T2(m,n) T, (m), (n)+A->nt
#if defined(CHAMELEON_COPY_DIAG) #define D(m,n) D, (m), (n)
#define D(m,n) D, ((n)/BS), 0
#else
#define D(m,n) A, (m), (n)
#endif
/* /*
* Parallel tile LQ factorization (reduction Householder) - dynamic scheduling * Parallel tile LQ factorization (reduction Householder) - dynamic scheduling
*/ */
void chameleon_pzgelqfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS, void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request) RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{ {
CHAM_context_t *chamctxt; CHAM_context_t *chamctxt;
RUNTIME_option_t options; RUNTIME_option_t options;
...@@ -59,13 +55,16 @@ void chameleon_pzgelqfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS, ...@@ -59,13 +55,16 @@ void chameleon_pzgelqfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
ib = CHAMELEON_IB; ib = CHAMELEON_IB;
/* if ( D == NULL ) {
D = A;
genD = 0;
}
/*
* zgelqt = A->nb * (ib+1) * zgelqt = A->nb * (ib+1)
* zunmlq = A->nb * ib * zunmlq = A->nb * ib
* ztslqt = A->nb * (ib+1) * ztplqt = A->nb * (ib+1)
* zttlqt = A->nb * (ib+1) * ztpmlq = A->nb * ib
* ztsmlq = A->nb * ib
* zttmlq = A->nb * ib
*/ */
ws_worker = A->nb * (ib+1); ws_worker = A->nb * (ib+1);
...@@ -74,7 +73,7 @@ void chameleon_pzgelqfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS, ...@@ -74,7 +73,7 @@ void chameleon_pzgelqfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
/* Worker space /* Worker space
* *
* zunmqr = A->nb * ib * zunmqr = A->nb * ib
* ztsmqr = 2 * A->nb * ib * ztpmqr = 2 * A->nb * ib
*/ */
ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
#endif #endif
...@@ -100,8 +99,7 @@ void chameleon_pzgelqfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS, ...@@ -100,8 +99,7 @@ void chameleon_pzgelqfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
tempkm, tempNn, ib, T->nb, tempkm, tempNn, ib, T->nb,
A(k, N), ldak, A(k, N), ldak,
T(k, N), T->mb); T(k, N), T->mb);
if ( k < (A->mt-1) ) { if ( genD ) {
#if defined(CHAMELEON_COPY_DIAG)
INSERT_TASK_zlacpy( INSERT_TASK_zlacpy(
&options, &options,
ChamUpper, tempkm, tempNn, A->nb, ChamUpper, tempkm, tempNn, A->nb,
...@@ -113,7 +111,6 @@ void chameleon_pzgelqfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS, ...@@ -113,7 +111,6 @@ void chameleon_pzgelqfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
ChamLower, tempkm, tempNn, ChamLower, tempkm, tempNn,
0., 1., 0., 1.,
D(k, N), ldak ); D(k, N), ldak );
#endif
#endif #endif
} }
for (m = k+1; m < A->mt; m++) { for (m = k+1; m < A->mt; m++) {
......
...@@ -27,17 +27,13 @@ ...@@ -27,17 +27,13 @@
#define A(m,n) A, m, n #define A(m,n) A, m, n
#define T(m,n) T, m, n #define T(m,n) T, m, n
#if defined(CHAMELEON_COPY_DIAG)
#define D(k) D, k, 0
#else
#define D(k) D, k, k #define D(k) D, k, k
#endif
/** /**
* Parallel tile QR factorization - dynamic scheduling * Parallel tile QR factorization - dynamic scheduling
*/ */
void chameleon_pzgeqrf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request) RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{ {
CHAM_context_t *chamctxt; CHAM_context_t *chamctxt;
RUNTIME_option_t options; RUNTIME_option_t options;
...@@ -58,7 +54,8 @@ void chameleon_pzgeqrf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, ...@@ -58,7 +54,8 @@ void chameleon_pzgeqrf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
ib = CHAMELEON_IB; ib = CHAMELEON_IB;
if ( D == NULL ) { if ( D == NULL ) {
D = A; D = A;
genD = 0;
} }
/* /*
...@@ -95,8 +92,7 @@ void chameleon_pzgeqrf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, ...@@ -95,8 +92,7 @@ void chameleon_pzgeqrf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
tempkm, tempkn, ib, T->nb, tempkm, tempkn, ib, T->nb,
A(k, k), ldak, A(k, k), ldak,
T(k, k), T->mb); T(k, k), T->mb);
if ( k < (A->nt-1) ) { if ( genD ) {
#if defined(CHAMELEON_COPY_DIAG)
INSERT_TASK_zlacpy( INSERT_TASK_zlacpy(
&options, &options,
ChamLower, A->mb, A->nb, A->nb, ChamLower, A->mb, A->nb, A->nb,
...@@ -108,7 +104,6 @@ void chameleon_pzgeqrf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, ...@@ -108,7 +104,6 @@ void chameleon_pzgeqrf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
ChamUpper, A->mb, A->nb, ChamUpper, A->mb, A->nb,
0., 1., 0., 1.,
D(k), ldak ); D(k), ldak );
#endif
#endif #endif
} }
for (n = k+1; n < A->nt; n++) { for (n = k+1; n < A->nt; n++) {
......
...@@ -29,9 +29,9 @@ ...@@ -29,9 +29,9 @@
/** /**
* Parallel tile QR factorization (reduction Householder) - dynamic scheduling * Parallel tile QR factorization (reduction Householder) - dynamic scheduling
*/ */
void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A, void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t *A,
CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D, CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request) RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
{ {
CHAM_context_t *chamctxt; CHAM_context_t *chamctxt;
RUNTIME_option_t options; RUNTIME_option_t options;
...@@ -40,7 +40,7 @@ void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A, ...@@ -40,7 +40,7 @@ void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
size_t ws_host = 0; size_t ws_host = 0;
int k, m, n, i, p; int k, m, n, i, p;
int K, L; int K, L, nbgeqrt;
int ldap, ldam; int ldap, ldam;
int tempkmin, tempkn, tempnn, tempmm; int tempkmin, tempkn, tempnn, tempmm;
int ib; int ib;
...@@ -54,25 +54,23 @@ void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A, ...@@ -54,25 +54,23 @@ void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
ib = CHAMELEON_IB; ib = CHAMELEON_IB;
if ( D == NULL ) { if ( D == NULL ) {
D = A; D = A;
genD = 0;
} }
/* /*
* zgeqrt = A->nb * (ib+1) * zgeqrt = A->nb * (ib+1)
* zunmqr = A->nb * ib * zunmqr = A->nb * ib
* ztsqrt = A->nb * (ib+1) * ztpqrt = A->nb * (ib+1)
* zttqrt = A->nb * (ib+1) * ztpmqrt = A->nb * ib
* ztsmqr = A->nb * ib
* zttmqr = A->nb * ib
*/ */
ws_worker = A->nb * (ib+1); ws_worker = A->nb * (ib+1);
/* Allocation of temporary (scratch) working space */ /* Allocation of temporary (scratch) working space */
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
/* Worker space /*
* * zunmqr = A->nb * ib
* zunmqr = A->nb * ib * ztpmqrt = 2 * A->nb * ib
* ztsmqr = 2 * A->nb * ib
*/ */
ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
#endif #endif
...@@ -93,7 +91,8 @@ void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A, ...@@ -93,7 +91,8 @@ void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
/* The number of geqrt to apply */ /* The number of geqrt to apply */
for (i = 0; i < qrtree->getnbgeqrf(qrtree, k); i++) { nbgeqrt = qrtree->getnbgeqrf(qrtree, k);
for (i = 0; i < nbgeqrt; i++) {
m = qrtree->getm(qrtree, k, i); m = qrtree->getm(qrtree, k, i);
tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
tempkmin = chameleon_min(tempmm, tempkn); tempkmin = chameleon_min(tempmm, tempkn);
...@@ -106,8 +105,7 @@ void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A, ...@@ -106,8 +105,7 @@ void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
tempmm, tempkn, ib, T->nb, tempmm, tempkn, ib, T->nb,
A(m, k), ldam, A(m, k), ldam,
T(m, k), T->mb); T(m, k), T->mb);
if ( k < (A->nt-1) ) { if ( genD ) {
#if defined(CHAMELEON_COPY_DIAG)
INSERT_TASK_zlacpy( INSERT_TASK_zlacpy(
&options, &options,
ChamLower, tempmm, A->nb, A->nb, ChamLower, tempmm, A->nb, A->nb,
...@@ -119,7 +117,6 @@ void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A, ...@@ -119,7 +117,6 @@ void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
ChamUpper, tempmm, A->nb, ChamUpper, tempmm, A->nb,
0., 1., 0., 1.,
D(m, k), ldam ); D(m, k), ldam );
#endif
#endif #endif
} }
for (n = k+1; n < A->nt; n++) { for (n = k+1; n < A->nt; n++) {
......
...@@ -29,17 +29,13 @@ ...@@ -29,17 +29,13 @@
#define A(m,n) A, (m), (n) #define A(m,n) A, (m), (n)
#define T(m,n) T, (m), (n) #define T(m,n) T, (m), (n)
#define T2(m,n) T, (m), ((n)+A->nt) #define T2(m,n) T, (m), ((n)+A->nt)
#if defined(CHAMELEON_COPY_DIAG) #define D(m,n) D, (m), (n)
#define D(m,n) D, ((m)/BS), 0
#else
#define D(m,n) A, (m), (n)
#endif
/** /**
* Parallel tile QR factorization (reduction Householder) - dynamic scheduling * Parallel tile QR factorization (reduction Householder) - dynamic scheduling
*/ */
void chameleon_pzgeqrfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS, void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request) RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{ {
CHAM_context_t *chamctxt; CHAM_context_t *chamctxt;
RUNTIME_option_t options; RUNTIME_option_t options;
...@@ -59,13 +55,16 @@ void chameleon_pzgeqrfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS, ...@@ -59,13 +55,16 @@ void chameleon_pzgeqrfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
ib = CHAMELEON_IB; ib = CHAMELEON_IB;
if ( D == NULL ) {
D = A;
genD = 0;
}
/* /*
* zgeqrt = A->nb * (ib+1) * zgeqrt = A->nb * (ib+1)
* zunmqr = A->nb * ib * zunmqr = A->nb * ib
* ztsqrt = A->nb * (ib+1) * ztpqrt = A->nb * (ib+1)
* zttqrt = A->nb * (ib+1) * ztpmqr = A->nb * ib
* ztsmqr = A->nb * ib
* zttmqr = A->nb * ib
*/ */
ws_worker = A->nb * (ib+1); ws_worker = A->nb * (ib+1);
...@@ -74,7 +73,7 @@ void chameleon_pzgeqrfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS, ...@@ -74,7 +73,7 @@ void chameleon_pzgeqrfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
/* Worker space /* Worker space
* *
* zunmqr = A->nb * ib * zunmqr = A->nb * ib
* ztsmqr = 2 * A->nb * ib * ztpmqr = 2 * A->nb * ib
*/ */
ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
#endif #endif
...@@ -99,20 +98,18 @@ void chameleon_pzgeqrfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS, ...@@ -99,20 +98,18 @@ void chameleon_pzgeqrfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
tempMm, tempkn, ib, T->nb, tempMm, tempkn, ib, T->nb,
A(M, k), ldaM, A(M, k), ldaM,
T(M, k), T->mb); T(M, k), T->mb);
if ( k < (A->nt-1) ) { if ( genD ) {
#if defined(CHAMELEON_COPY_DIAG) INSERT_TASK_zlacpy(
INSERT_TASK_zlacpy( &options,
&options, ChamLower, tempMm, A->nb, A->nb,
ChamLower, tempMm, A->nb, A->nb, A(M, k), ldaM,
A(M, k), ldaM, D(M, k), ldaM );
D(M, k), ldaM );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamUpper, tempMm, A->nb, ChamUpper, tempMm, A->nb,
0., 1., 0., 1.,
D(M, k), ldaM ); D(M, k), ldaM );
#endif
#endif #endif
} }
for (n = k+1; n < A->nt; n++) { for (n = k+1; n < A->nt; n++) {
......
...@@ -26,21 +26,17 @@ ...@@ -26,21 +26,17 @@
#define T2(m,n) T2, m, n #define T2(m,n) T2, m, n
#define Q1(m,n) Q1, m, n #define Q1(m,n) Q1, m, n
#define Q2(m,n) Q2, m, n #define Q2(m,n) Q2, m, n
#if defined(CHAMELEON_COPY_DIAG)