Commit 79436c1a authored by Mathieu Faverge

Clean up the QR/LQ algorithms to reuse the D matrix when possible

parent f19a75ed
......@@ -20,8 +20,8 @@
*/
#include "control/common.h"
void chameleon_pzgebrd_ge2gb(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
void chameleon_pzgebrd_ge2gb( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
int k;
int tempkm, tempkn;
......@@ -38,10 +38,10 @@ void chameleon_pzgebrd_ge2gb(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
D1 = chameleon_desc_submatrix(D, k*D->mb, k*D->nb, D->m-k*D->mb, tempkn);
}
chameleon_pzgeqrf( A1, T1, D1,
chameleon_pzgeqrf( genD, A1, T1, D1,
sequence, request);
chameleon_pzunmqr( ChamLeft, ChamConjTrans,
chameleon_pzunmqr( 0, ChamLeft, ChamConjTrans,
A1, A2, T1, D1,
sequence, request);
......@@ -55,10 +55,10 @@ void chameleon_pzgebrd_ge2gb(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
D1 = chameleon_desc_submatrix(D, k*D->mb, (k+1)*D->nb, tempkm, D->n-(k+1)*D->nb);
}
chameleon_pzgelqf( A1, T1, D1,
chameleon_pzgelqf( genD, A1, T1, D1,
sequence, request);
chameleon_pzunmlq( ChamRight, ChamConjTrans,
chameleon_pzunmlq( 0, ChamRight, ChamConjTrans,
A1, A2, T1, D1,
sequence, request);
}
......@@ -74,10 +74,10 @@ void chameleon_pzgebrd_ge2gb(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
if ( D != NULL ) {
D1 = chameleon_desc_submatrix(D, k*D->mb, k*D->nb, tempkm, D->n-k*D->nb);
}
chameleon_pzgelqf( A1, T1, D1,
chameleon_pzgelqf( genD, A1, T1, D1,
sequence, request);
chameleon_pzunmlq( ChamRight, ChamConjTrans,
chameleon_pzunmlq( 0, ChamRight, ChamConjTrans,
A1, A2, T1, D1,
sequence, request);
......@@ -91,10 +91,10 @@ void chameleon_pzgebrd_ge2gb(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
D1 = chameleon_desc_submatrix(D, (k+1)*D->mb, k*D->nb, D->m-(k+1)*D->mb, tempkn);
}
chameleon_pzgeqrf( A1, T1, D1,
chameleon_pzgeqrf( genD, A1, T1, D1,
sequence, request);
chameleon_pzunmqr( ChamLeft, ChamConjTrans,
chameleon_pzunmqr( 0, ChamLeft, ChamConjTrans,
A1, A2, T1, D1,
sequence, request);
}
......
......@@ -27,17 +27,13 @@
#define A(m,n) A, m, n
#define T(m,n) T, m, n
#if defined(CHAMELEON_COPY_DIAG)
#define D(k) D, k, 0
#else
#define D(k) D, k, k
#endif
/**
* Parallel tile LQ factorization - dynamic scheduling
*/
void chameleon_pzgelqf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
......@@ -64,6 +60,7 @@ void chameleon_pzgelqf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
if ( D == NULL ) {
D = A;
genD = 0;
}
/*
......@@ -100,8 +97,7 @@ void chameleon_pzgelqf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
tempkm, tempkn, ib, T->nb,
A(k, k), ldak,
T(k, k), T->mb);
if ( k < (A->mt-1) ) {
#if defined(CHAMELEON_COPY_DIAG)
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamUpper, A->mb, A->nb, A->nb,
......@@ -113,7 +109,6 @@ void chameleon_pzgelqf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
ChamLower, A->mb, A->nb,
0., 1.,
D(k), ldak );
#endif
#endif
}
for (m = k+1; m < A->mt; m++) {
......
......@@ -30,9 +30,9 @@
/*
* Parallel tile LQ factorization (reduction Householder) - dynamic scheduling
*/
void chameleon_pzgelqf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t *A,
CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
......@@ -56,6 +56,7 @@ void chameleon_pzgelqf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
if ( D == NULL ) {
D = A;
genD = 0;
}
/*
......@@ -108,8 +109,7 @@ void chameleon_pzgelqf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
tempkm, temppn, ib, T->nb,
A( k, p), ldak,
T(k, p), T->mb);
if ( k < (A->mt-1) ) {
#if defined(CHAMELEON_COPY_DIAG)
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamUpper, tempkm, temppn, A->nb,
......@@ -121,7 +121,6 @@ void chameleon_pzgelqf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
ChamLower, tempkm, temppn,
0., 1.,
D(k, p), ldak );
#endif
#endif
}
for (m = k+1; m < A->mt; m++) {
......
......@@ -29,17 +29,13 @@
#define A(m,n) A, (m), (n)
#define T(m,n) T, (m), (n)
#define T2(m,n) T, (m), (n)+A->nt
#if defined(CHAMELEON_COPY_DIAG)
#define D(m,n) D, ((n)/BS), 0
#else
#define D(m,n) A, (m), (n)
#endif
#define D(m,n) D, (m), (n)
/*
* Parallel tile LQ factorization (reduction Householder) - dynamic scheduling
*/
void chameleon_pzgelqfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
......@@ -59,13 +55,16 @@ void chameleon_pzgelqfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
ib = CHAMELEON_IB;
if ( D == NULL ) {
D = A;
genD = 0;
}
/*
* zgelqt = A->nb * (ib+1)
* zunmlq = A->nb * ib
* ztslqt = A->nb * (ib+1)
* zttlqt = A->nb * (ib+1)
* ztsmlq = A->nb * ib
* zttmlq = A->nb * ib
* ztplqt = A->nb * (ib+1)
* ztpmlq = A->nb * ib
*/
ws_worker = A->nb * (ib+1);
......@@ -74,7 +73,7 @@ void chameleon_pzgelqfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
/* Worker space
*
* zunmqr = A->nb * ib
* ztsmqr = 2 * A->nb * ib
* ztpmqr = 2 * A->nb * ib
*/
ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
#endif
......@@ -100,8 +99,7 @@ void chameleon_pzgelqfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
tempkm, tempNn, ib, T->nb,
A(k, N), ldak,
T(k, N), T->mb);
if ( k < (A->mt-1) ) {
#if defined(CHAMELEON_COPY_DIAG)
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamUpper, tempkm, tempNn, A->nb,
......@@ -113,7 +111,6 @@ void chameleon_pzgelqfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
ChamLower, tempkm, tempNn,
0., 1.,
D(k, N), ldak );
#endif
#endif
}
for (m = k+1; m < A->mt; m++) {
......
......@@ -27,17 +27,13 @@
#define A(m,n) A, m, n
#define T(m,n) T, m, n
#if defined(CHAMELEON_COPY_DIAG)
#define D(k) D, k, 0
#else
#define D(k) D, k, k
#endif
/**
* Parallel tile QR factorization - dynamic scheduling
*/
void chameleon_pzgeqrf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
......@@ -59,6 +55,7 @@ void chameleon_pzgeqrf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
if ( D == NULL ) {
D = A;
genD = 0;
}
/*
......@@ -95,8 +92,7 @@ void chameleon_pzgeqrf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
tempkm, tempkn, ib, T->nb,
A(k, k), ldak,
T(k, k), T->mb);
if ( k < (A->nt-1) ) {
#if defined(CHAMELEON_COPY_DIAG)
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamLower, A->mb, A->nb, A->nb,
......@@ -108,7 +104,6 @@ void chameleon_pzgeqrf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
ChamUpper, A->mb, A->nb,
0., 1.,
D(k), ldak );
#endif
#endif
}
for (n = k+1; n < A->nt; n++) {
......
......@@ -29,7 +29,7 @@
/**
* Parallel tile QR factorization (reduction Householder) - dynamic scheduling
*/
void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t *A,
CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
{
......@@ -40,7 +40,7 @@ void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
size_t ws_host = 0;
int k, m, n, i, p;
int K, L;
int K, L, nbgeqrt;
int ldap, ldam;
int tempkmin, tempkn, tempnn, tempmm;
int ib;
......@@ -55,24 +55,22 @@ void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
if ( D == NULL ) {
D = A;
genD = 0;
}
/*
* zgeqrt = A->nb * (ib+1)
* zunmqr = A->nb * ib
* ztsqrt = A->nb * (ib+1)
* zttqrt = A->nb * (ib+1)
* ztsmqr = A->nb * ib
* zttmqr = A->nb * ib
* ztpqrt = A->nb * (ib+1)
* ztpmqrt = A->nb * ib
*/
ws_worker = A->nb * (ib+1);
/* Allocation of temporary (scratch) working space */
#if defined(CHAMELEON_USE_CUDA)
/* Worker space
*
/*
* zunmqr = A->nb * ib
* ztsmqr = 2 * A->nb * ib
* ztpmqrt = 2 * A->nb * ib
*/
ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
#endif
......@@ -93,7 +91,8 @@ void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
/* The number of geqrt to apply */
for (i = 0; i < qrtree->getnbgeqrf(qrtree, k); i++) {
nbgeqrt = qrtree->getnbgeqrf(qrtree, k);
for (i = 0; i < nbgeqrt; i++) {
m = qrtree->getm(qrtree, k, i);
tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
tempkmin = chameleon_min(tempmm, tempkn);
......@@ -106,8 +105,7 @@ void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
tempmm, tempkn, ib, T->nb,
A(m, k), ldam,
T(m, k), T->mb);
if ( k < (A->nt-1) ) {
#if defined(CHAMELEON_COPY_DIAG)
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamLower, tempmm, A->nb, A->nb,
......@@ -119,7 +117,6 @@ void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
ChamUpper, tempmm, A->nb,
0., 1.,
D(m, k), ldam );
#endif
#endif
}
for (n = k+1; n < A->nt; n++) {
......
......@@ -29,17 +29,13 @@
#define A(m,n) A, (m), (n)
#define T(m,n) T, (m), (n)
#define T2(m,n) T, (m), ((n)+A->nt)
#if defined(CHAMELEON_COPY_DIAG)
#define D(m,n) D, ((m)/BS), 0
#else
#define D(m,n) A, (m), (n)
#endif
#define D(m,n) D, (m), (n)
/**
* Parallel tile QR factorization (reduction Householder) - dynamic scheduling
*/
void chameleon_pzgeqrfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
......@@ -59,13 +55,16 @@ void chameleon_pzgeqrfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
ib = CHAMELEON_IB;
if ( D == NULL ) {
D = A;
genD = 0;
}
/*
* zgeqrt = A->nb * (ib+1)
* zunmqr = A->nb * ib
* ztsqrt = A->nb * (ib+1)
* zttqrt = A->nb * (ib+1)
* ztsmqr = A->nb * ib
* zttmqr = A->nb * ib
* ztpqrt = A->nb * (ib+1)
* ztpmqr = A->nb * ib
*/
ws_worker = A->nb * (ib+1);
......@@ -74,7 +73,7 @@ void chameleon_pzgeqrfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
/* Worker space
*
* zunmqr = A->nb * ib
* ztsmqr = 2 * A->nb * ib
* ztpmqr = 2 * A->nb * ib
*/
ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
#endif
......@@ -99,8 +98,7 @@ void chameleon_pzgeqrfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
tempMm, tempkn, ib, T->nb,
A(M, k), ldaM,
T(M, k), T->mb);
if ( k < (A->nt-1) ) {
#if defined(CHAMELEON_COPY_DIAG)
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamLower, tempMm, A->nb, A->nb,
......@@ -112,7 +110,6 @@ void chameleon_pzgeqrfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
ChamUpper, tempMm, A->nb,
0., 1.,
D(M, k), ldaM );
#endif
#endif
}
for (n = k+1; n < A->nt; n++) {
......
......@@ -26,16 +26,12 @@
#define T2(m,n) T2, m, n
#define Q1(m,n) Q1, m, n
#define Q2(m,n) Q2, m, n
#if defined(CHAMELEON_COPY_DIAG)
#define D(k) D, k, 0
#else
#define D(k) V1, k, k
#endif
#define D(k) D, k, k
/**
* Parallel tile QR factorization - dynamic scheduling
*/
void chameleon_pztpgqrt( int L,
void chameleon_pztpgqrt( int genD, int L,
CHAM_desc_t *V1, CHAM_desc_t *T1,
CHAM_desc_t *V2, CHAM_desc_t *T2,
CHAM_desc_t *Q1, CHAM_desc_t *Q2,
......@@ -64,6 +60,12 @@ void chameleon_pztpgqrt( int L,
RUNTIME_options_init(&options, chamctxt, sequence, request);
ib = CHAMELEON_IB;
if ( D == NULL ) {
D = V1;
genD = 0;
}
/*
* ztpmqrt = Q1->nb * ib
*/
......@@ -132,7 +134,7 @@ void chameleon_pztpgqrt( int L,
}
}
#if defined(CHAMELEON_COPY_DIAG)
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamLower, tempkm, tempkk, V1->nb,
......@@ -145,7 +147,7 @@ void chameleon_pztpgqrt( int L,
0., 1.,
D(k), ldvk );
#endif
#endif
}
for (n = k; n < Q1->nt; n++) {
tempnn = n == Q1->nt-1 ? Q1->n-n*Q1->nb : Q1->nb;
INSERT_TASK_zunmqr(
......
......@@ -45,8 +45,9 @@ void chameleon_pztpqrt( int L, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *T,
int maxmt = (maxm % B->mb == 0) ? (maxm / B->mb) : (maxm / B->mb + 1);
chamctxt = chameleon_context_self();
if (sequence->status != CHAMELEON_SUCCESS)
if (sequence->status != CHAMELEON_SUCCESS) {
return;
}
RUNTIME_options_init(&options, chamctxt, sequence, request);
ib = CHAMELEON_IB;
......
......@@ -28,17 +28,13 @@
#define A(m,n) A, m, n
#define Q(m,n) Q, m, n
#define T(m,n) T, m, n
#if defined(CHAMELEON_COPY_DIAG)
#define D(k) D, k, 0
#else
#define D(k) D, k, k
#endif
/**
* Parallel construction of Q using tile V (application to identity) - dynamic scheduling
*/
void chameleon_pzunglq(CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
......@@ -64,13 +60,14 @@ void chameleon_pzunglq(CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T, CHAM_desc
minMT = A->mt;
}
if (D == NULL) {
if ( D == NULL ) {
D = A;
genD = 0;
}
/*
* zunmlq = A->nb * ib
* ztsmlq = A->nb * ib
* ztpmlq = A->nb * ib
*/
ws_worker = A->nb * ib;
......@@ -79,7 +76,7 @@ void chameleon_pzunglq(CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T, CHAM_desc
/* Worker space
*
* zunmlq = A->nb * ib
* ztsmlq = 2 * A->nb * ib
* ztpmlq = 2 * A->nb * ib
*/
ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
#endif
......@@ -119,7 +116,9 @@ void chameleon_pzunglq(CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T, CHAM_desc
RUNTIME_data_flush( sequence, A(k, n) );
RUNTIME_data_flush( sequence, T(k, n) );
}
#if defined(CHAMELEON_COPY_DIAG)
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamUpper, tempkmin, tempkn, A->nb,
......@@ -132,7 +131,7 @@ void chameleon_pzunglq(CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T, CHAM_desc
0., 1.,
D(k), ldak );
#endif
#endif
}
for (m = k; m < Q->mt; m++) {
tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb;
ldqm = BLKLDD(Q, m);
......
......@@ -29,9 +29,9 @@
/**
* Parallel construction of Q using tile V - dynamic scheduling
*/
void chameleon_pzunglq_param(const libhqr_tree_t *qrtree, CHAM_desc_t *A, CHAM_desc_t *Q,
void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t *A, CHAM_desc_t *Q,
CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
......@@ -57,10 +57,14 @@ void chameleon_pzunglq_param(const libhqr_tree_t *qrtree, CHAM_desc_t *A, CHAM_d
D = A;
}
if ( D == NULL ) {
D = A;
genD = 0;
}
/*
* zunmqr = A->nb * ib
* ztsmqr = A->nb * ib
* zttmqr = A->nb * ib
* ztpmqr = A->nb * ib
*/
ws_worker = A->nb * ib;
......@@ -68,7 +72,7 @@ void chameleon_pzunglq_param(const libhqr_tree_t *qrtree, CHAM_desc_t *A, CHAM_d
/* Worker space
*
* zunmqr = A->nb * ib
* ztsmqr = 2 * A->nb * ib
* ztpmqr = 2 * A->nb * ib
*/
ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
#endif
......@@ -138,7 +142,7 @@ void chameleon_pzunglq_param(const libhqr_tree_t *qrtree, CHAM_desc_t *A, CHAM_d
temppn = p == A->nt-1 ? A->n-p*A->nb : A->nb;
tempkmin = chameleon_min(tempkm, temppn);
#if defined(CHAMELEON_COPY_DIAG)
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamUpper, tempkmin, temppn, A->nb,
......@@ -151,7 +155,7 @@ void chameleon_pzunglq_param(const libhqr_tree_t *qrtree, CHAM_desc_t *A, CHAM_d
0., 1.,
D(k, p), ldak );
#endif
#endif
}
for (m = k; m < Q->mt; m++) {
tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb;
ldqm = BLKLDD(Q, m);
......
......@@ -28,19 +28,16 @@
#define Q(m,n) Q, (m), (n)
#define T(m,n) T, (m), (n)
#define T2(m,n) T, (m), (n)+(A->nt)
#if defined(CHAMELEON_COPY_DIAG)
#define D(m,n) D, ((n)/BS), 0
#else
#define D(m,n) A, (m), (n)
#endif
#define D(m,n) D, (m), (n)
/**
* Parallel construction of Q using tile V (application to identity;
* reduction Householder) - dynamic scheduling
*/
void chameleon_pzunglqrh(CHAM_desc_t *A, CHAM_desc_t *Q,
CHAM_desc_t *T, CHAM_desc_t *D, int BS,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
void chameleon_pzunglqrh( int genD, int BS,
CHAM_desc_t *A, CHAM_desc_t *Q,
CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
......@@ -61,10 +58,14 @@ void chameleon_pzunglqrh(CHAM_desc_t *A, CHAM_desc_t *Q,
ib = CHAMELEON_IB;
if ( D == NULL ) {
D = A;
genD = 0;
}
/*
* zunmqr = A->nb * ib
* ztsmqr = A->nb * ib
* zttmqr = A->nb * ib
* ztpmqr = A->nb * ib
*/
ws_worker = A->nb * ib;
......@@ -72,7 +73,7 @@ void chameleon_pzunglqrh(CHAM_desc_t *A, CHAM_desc_t *Q,
/* Worker space
*
* zunmqr = A->nb * ib
* ztsmqr = 2 * A->nb * ib
* ztpmqr = 2 * A->nb * ib
*/
ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
#endif
......@@ -147,7 +148,8 @@ void chameleon_pzunglqrh(CHAM_desc_t *A, CHAM_desc_t *Q,
RUNTIME_data_flush( sequence, A(k, n) );
RUNTIME_data_flush( sequence, T(k, n) );
}
#if defined(CHAMELEON_COPY_DIAG)
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamUpper, tempkmin, tempNn, A->nb,
......@@ -160,7 +162,7 @@ void chameleon_pzunglqrh(CHAM_desc_t *A, CHAM_desc_t *Q,
0., 1.,
D(k, N), ldak );
#endif
#endif