Commit 79436c1a authored by Mathieu Faverge

Cleanup on QR/LQ algorithms to reuse D matrix if possible

parent f19a75ed
......@@ -20,8 +20,8 @@
*/
#include "control/common.h"
void chameleon_pzgebrd_ge2gb(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
void chameleon_pzgebrd_ge2gb( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
int k;
int tempkm, tempkn;
......@@ -38,12 +38,12 @@ void chameleon_pzgebrd_ge2gb(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
D1 = chameleon_desc_submatrix(D, k*D->mb, k*D->nb, D->m-k*D->mb, tempkn);
}
chameleon_pzgeqrf( A1, T1, D1,
sequence, request);
chameleon_pzgeqrf( genD, A1, T1, D1,
sequence, request);
chameleon_pzunmqr( ChamLeft, ChamConjTrans,
A1, A2, T1, D1,
sequence, request);
chameleon_pzunmqr( 0, ChamLeft, ChamConjTrans,
A1, A2, T1, D1,
sequence, request);
if (k+1 < A->nt){
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
......@@ -55,12 +55,12 @@ void chameleon_pzgebrd_ge2gb(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
D1 = chameleon_desc_submatrix(D, k*D->mb, (k+1)*D->nb, tempkm, D->n-(k+1)*D->nb);
}
chameleon_pzgelqf( A1, T1, D1,
sequence, request);
chameleon_pzgelqf( genD, A1, T1, D1,
sequence, request);
chameleon_pzunmlq( ChamRight, ChamConjTrans,
A1, A2, T1, D1,
sequence, request);
chameleon_pzunmlq( 0, ChamRight, ChamConjTrans,
A1, A2, T1, D1,
sequence, request);
}
}
}
......@@ -74,12 +74,12 @@ void chameleon_pzgebrd_ge2gb(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
if ( D != NULL ) {
D1 = chameleon_desc_submatrix(D, k*D->mb, k*D->nb, tempkm, D->n-k*D->nb);
}
chameleon_pzgelqf( A1, T1, D1,
sequence, request);
chameleon_pzgelqf( genD, A1, T1, D1,
sequence, request);
chameleon_pzunmlq( ChamRight, ChamConjTrans,
A1, A2, T1, D1,
sequence, request);
chameleon_pzunmlq( 0, ChamRight, ChamConjTrans,
A1, A2, T1, D1,
sequence, request);
if (k+1 < A->mt){
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
......@@ -91,12 +91,12 @@ void chameleon_pzgebrd_ge2gb(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
D1 = chameleon_desc_submatrix(D, (k+1)*D->mb, k*D->nb, D->m-(k+1)*D->mb, tempkn);
}
chameleon_pzgeqrf( A1, T1, D1,
sequence, request);
chameleon_pzgeqrf( genD, A1, T1, D1,
sequence, request);
chameleon_pzunmqr( ChamLeft, ChamConjTrans,
A1, A2, T1, D1,
sequence, request);
chameleon_pzunmqr( 0, ChamLeft, ChamConjTrans,
A1, A2, T1, D1,
sequence, request);
}
}
}
......
......@@ -27,17 +27,13 @@
#define A(m,n) A, m, n
#define T(m,n) T, m, n
#if defined(CHAMELEON_COPY_DIAG)
#define D(k) D, k, 0
#else
#define D(k) D, k, k
#endif
#define D(k) D, k, k
/**
* Parallel tile LQ factorization - dynamic scheduling
*/
void chameleon_pzgelqf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
......@@ -63,7 +59,8 @@ void chameleon_pzgelqf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
}
if ( D == NULL ) {
D = A;
D = A;
genD = 0;
}
/*
......@@ -100,8 +97,7 @@ void chameleon_pzgelqf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
tempkm, tempkn, ib, T->nb,
A(k, k), ldak,
T(k, k), T->mb);
if ( k < (A->mt-1) ) {
#if defined(CHAMELEON_COPY_DIAG)
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamUpper, A->mb, A->nb, A->nb,
......@@ -113,7 +109,6 @@ void chameleon_pzgelqf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
ChamLower, A->mb, A->nb,
0., 1.,
D(k), ldak );
#endif
#endif
}
for (m = k+1; m < A->mt; m++) {
......
......@@ -30,9 +30,9 @@
/*
* Parallel tile LQ factorization (reduction Householder) - dynamic scheduling
*/
void chameleon_pzgelqf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t *A,
CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
......@@ -55,7 +55,8 @@ void chameleon_pzgelqf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
ib = CHAMELEON_IB;
if ( D == NULL ) {
D = A;
D = A;
genD = 0;
}
/*
......@@ -108,8 +109,7 @@ void chameleon_pzgelqf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
tempkm, temppn, ib, T->nb,
A( k, p), ldak,
T(k, p), T->mb);
if ( k < (A->mt-1) ) {
#if defined(CHAMELEON_COPY_DIAG)
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamUpper, tempkm, temppn, A->nb,
......@@ -121,7 +121,6 @@ void chameleon_pzgelqf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
ChamLower, tempkm, temppn,
0., 1.,
D(k, p), ldak );
#endif
#endif
}
for (m = k+1; m < A->mt; m++) {
......
......@@ -29,17 +29,13 @@
#define A(m,n) A, (m), (n)
#define T(m,n) T, (m), (n)
#define T2(m,n) T, (m), (n)+A->nt
#if defined(CHAMELEON_COPY_DIAG)
#define D(m,n) D, ((n)/BS), 0
#else
#define D(m,n) A, (m), (n)
#endif
#define D(m,n) D, (m), (n)
/*
* Parallel tile LQ factorization (reduction Householder) - dynamic scheduling
*/
void chameleon_pzgelqfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
......@@ -59,13 +55,16 @@ void chameleon_pzgelqfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
ib = CHAMELEON_IB;
/*
if ( D == NULL ) {
D = A;
genD = 0;
}
/*
* zgelqt = A->nb * (ib+1)
* zunmlq = A->nb * ib
* ztslqt = A->nb * (ib+1)
* zttlqt = A->nb * (ib+1)
* ztsmlq = A->nb * ib
* zttmlq = A->nb * ib
* ztplqt = A->nb * (ib+1)
* ztpmlq = A->nb * ib
*/
ws_worker = A->nb * (ib+1);
......@@ -74,7 +73,7 @@ void chameleon_pzgelqfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
/* Worker space
*
* zunmqr = A->nb * ib
* ztsmqr = 2 * A->nb * ib
* ztpmqr = 2 * A->nb * ib
*/
ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
#endif
......@@ -100,8 +99,7 @@ void chameleon_pzgelqfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
tempkm, tempNn, ib, T->nb,
A(k, N), ldak,
T(k, N), T->mb);
if ( k < (A->mt-1) ) {
#if defined(CHAMELEON_COPY_DIAG)
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamUpper, tempkm, tempNn, A->nb,
......@@ -113,7 +111,6 @@ void chameleon_pzgelqfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
ChamLower, tempkm, tempNn,
0., 1.,
D(k, N), ldak );
#endif
#endif
}
for (m = k+1; m < A->mt; m++) {
......
......@@ -27,17 +27,13 @@
#define A(m,n) A, m, n
#define T(m,n) T, m, n
#if defined(CHAMELEON_COPY_DIAG)
#define D(k) D, k, 0
#else
#define D(k) D, k, k
#endif
/**
* Parallel tile QR factorization - dynamic scheduling
*/
void chameleon_pzgeqrf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
......@@ -58,7 +54,8 @@ void chameleon_pzgeqrf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
ib = CHAMELEON_IB;
if ( D == NULL ) {
D = A;
D = A;
genD = 0;
}
/*
......@@ -95,8 +92,7 @@ void chameleon_pzgeqrf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
tempkm, tempkn, ib, T->nb,
A(k, k), ldak,
T(k, k), T->mb);
if ( k < (A->nt-1) ) {
#if defined(CHAMELEON_COPY_DIAG)
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamLower, A->mb, A->nb, A->nb,
......@@ -108,7 +104,6 @@ void chameleon_pzgeqrf(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
ChamUpper, A->mb, A->nb,
0., 1.,
D(k), ldak );
#endif
#endif
}
for (n = k+1; n < A->nt; n++) {
......
......@@ -29,9 +29,9 @@
/**
* Parallel tile QR factorization (reduction Householder) - dynamic scheduling
*/
void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t *A,
CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
......@@ -40,7 +40,7 @@ void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
size_t ws_host = 0;
int k, m, n, i, p;
int K, L;
int K, L, nbgeqrt;
int ldap, ldam;
int tempkmin, tempkn, tempnn, tempmm;
int ib;
......@@ -54,25 +54,23 @@ void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
ib = CHAMELEON_IB;
if ( D == NULL ) {
D = A;
D = A;
genD = 0;
}
/*
* zgeqrt = A->nb * (ib+1)
* zunmqr = A->nb * ib
* ztsqrt = A->nb * (ib+1)
* zttqrt = A->nb * (ib+1)
* ztsmqr = A->nb * ib
* zttmqr = A->nb * ib
* zgeqrt = A->nb * (ib+1)
* zunmqr = A->nb * ib
* ztpqrt = A->nb * (ib+1)
* ztpmqrt = A->nb * ib
*/
ws_worker = A->nb * (ib+1);
/* Allocation of temporary (scratch) working space */
#if defined(CHAMELEON_USE_CUDA)
/* Worker space
*
* zunmqr = A->nb * ib
* ztsmqr = 2 * A->nb * ib
/*
* zunmqr = A->nb * ib
* ztpmqrt = 2 * A->nb * ib
*/
ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
#endif
......@@ -93,7 +91,8 @@ void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
/* The number of geqrt to apply */
for (i = 0; i < qrtree->getnbgeqrf(qrtree, k); i++) {
nbgeqrt = qrtree->getnbgeqrf(qrtree, k);
for (i = 0; i < nbgeqrt; i++) {
m = qrtree->getm(qrtree, k, i);
tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
tempkmin = chameleon_min(tempmm, tempkn);
......@@ -106,8 +105,7 @@ void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
tempmm, tempkn, ib, T->nb,
A(m, k), ldam,
T(m, k), T->mb);
if ( k < (A->nt-1) ) {
#if defined(CHAMELEON_COPY_DIAG)
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamLower, tempmm, A->nb, A->nb,
......@@ -119,7 +117,6 @@ void chameleon_pzgeqrf_param( const libhqr_tree_t *qrtree, CHAM_desc_t *A,
ChamUpper, tempmm, A->nb,
0., 1.,
D(m, k), ldam );
#endif
#endif
}
for (n = k+1; n < A->nt; n++) {
......
......@@ -29,17 +29,13 @@
#define A(m,n) A, (m), (n)
#define T(m,n) T, (m), (n)
#define T2(m,n) T, (m), ((n)+A->nt)
#if defined(CHAMELEON_COPY_DIAG)
#define D(m,n) D, ((m)/BS), 0
#else
#define D(m,n) A, (m), (n)
#endif
#define D(m,n) D, (m), (n)
/**
* Parallel tile QR factorization (reduction Householder) - dynamic scheduling
*/
void chameleon_pzgeqrfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
......@@ -59,13 +55,16 @@ void chameleon_pzgeqrfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
ib = CHAMELEON_IB;
if ( D == NULL ) {
D = A;
genD = 0;
}
/*
* zgeqrt = A->nb * (ib+1)
* zunmqr = A->nb * ib
* ztsqrt = A->nb * (ib+1)
* zttqrt = A->nb * (ib+1)
* ztsmqr = A->nb * ib
* zttmqr = A->nb * ib
* ztpqrt = A->nb * (ib+1)
* ztpmqr = A->nb * ib
*/
ws_worker = A->nb * (ib+1);
......@@ -74,7 +73,7 @@ void chameleon_pzgeqrfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
/* Worker space
*
* zunmqr = A->nb * ib
* ztsmqr = 2 * A->nb * ib
* ztpmqr = 2 * A->nb * ib
*/
ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
#endif
......@@ -99,20 +98,18 @@ void chameleon_pzgeqrfrh(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, int BS,
tempMm, tempkn, ib, T->nb,
A(M, k), ldaM,
T(M, k), T->mb);
if ( k < (A->nt-1) ) {
#if defined(CHAMELEON_COPY_DIAG)
INSERT_TASK_zlacpy(
&options,
ChamLower, tempMm, A->nb, A->nb,
A(M, k), ldaM,
D(M, k), ldaM );
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamLower, tempMm, A->nb, A->nb,
A(M, k), ldaM,
D(M, k), ldaM );
#if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset(
&options,
ChamUpper, tempMm, A->nb,
0., 1.,
D(M, k), ldaM );
#endif
#endif
}
for (n = k+1; n < A->nt; n++) {
......
......@@ -26,21 +26,17 @@
#define T2(m,n) T2, m, n
#define Q1(m,n) Q1, m, n
#define Q2(m,n) Q2, m, n
#if defined(CHAMELEON_COPY_DIAG)
#define D(k) D, k, 0
#else
#define D(k) V1, k, k
#endif
#define D(k) D, k, k
/**
* Parallel tile QR factorization - dynamic scheduling
*/
void chameleon_pztpgqrt( int L,
CHAM_desc_t *V1, CHAM_desc_t *T1,
CHAM_desc_t *V2, CHAM_desc_t *T2,
CHAM_desc_t *Q1, CHAM_desc_t *Q2,
CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
void chameleon_pztpgqrt( int genD, int L,
CHAM_desc_t *V1, CHAM_desc_t *T1,
CHAM_desc_t *V2, CHAM_desc_t *T2,
CHAM_desc_t *Q1, CHAM_desc_t *Q2,
CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
......@@ -64,6 +60,12 @@ void chameleon_pztpgqrt( int L,
RUNTIME_options_init(&options, chamctxt, sequence, request);
ib = CHAMELEON_IB;
if ( D == NULL ) {
D = V1;
genD = 0;
}
/*
* ztpmqrt = Q1->nb * ib
*/
......@@ -132,27 +134,27 @@ void chameleon_pztpgqrt( int L,
}
}
#if defined(CHAMELEON_COPY_DIAG)
INSERT_TASK_zlacpy(
&options,
ChamLower, tempkm, tempkk, V1->nb,
V1(k, k), ldvk,
D(k), ldvk );
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamLower, tempkm, tempkk, V1->nb,
V1(k, k), ldvk,
D(k), ldvk );
#if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset(
&options,
ChamUpper, tempkm, tempkk,
0., 1.,
D(k), ldvk );
#endif
INSERT_TASK_zlaset(
&options,
ChamUpper, tempkm, tempkk,
0., 1.,
D(k), ldvk );
#endif
}
for (n = k; n < Q1->nt; n++) {
tempnn = n == Q1->nt-1 ? Q1->n-n*Q1->nb : Q1->nb;
INSERT_TASK_zunmqr(
&options,
ChamLeft, ChamNoTrans,
tempkm, tempnn, tempkk, ib, T1->nb,
D(k), ldvk,
D(k), ldvk,
T1(k, k), T1->mb,
Q1(k, n), ldqk);
}
......
......@@ -28,7 +28,7 @@
* Parallel tile QR factorization - dynamic scheduling
*/
void chameleon_pztpqrt( int L, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *T,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
......@@ -45,8 +45,9 @@ void chameleon_pztpqrt( int L, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *T,
int maxmt = (maxm % B->mb == 0) ? (maxm / B->mb) : (maxm / B->mb + 1);
chamctxt = chameleon_context_self();
if (sequence->status != CHAMELEON_SUCCESS)
if (sequence->status != CHAMELEON_SUCCESS) {
return;
}
RUNTIME_options_init(&options, chamctxt, sequence, request);
ib = CHAMELEON_IB;
......
......@@ -28,17 +28,13 @@
#define A(m,n) A, m, n
#define Q(m,n) Q, m, n
#define T(m,n) T, m, n
#if defined(CHAMELEON_COPY_DIAG)
#define D(k) D, k, 0
#else
#define D(k) D, k, k
#endif
/**
* Parallel construction of Q using tile V (application to identity) - dynamic scheduling
*/
void chameleon_pzunglq(CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
......@@ -64,13 +60,14 @@ void chameleon_pzunglq(CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T, CHAM_desc
minMT = A->mt;
}
if (D == NULL) {
D = A;
if ( D == NULL ) {
D = A;
genD = 0;
}
/*
* zunmlq = A->nb * ib
* ztsmlq = A->nb * ib
* ztpmlq = A->nb * ib
*/
ws_worker = A->nb * ib;
......@@ -79,7 +76,7 @@ void chameleon_pzunglq(CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T, CHAM_desc
/* Worker space
*
* zunmlq = A->nb * ib
* ztsmlq = 2 * A->nb * ib
* ztpmlq = 2 * A->nb * ib
*/
ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
#endif
......@@ -119,20 +116,22 @@ void chameleon_pzunglq(CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T, CHAM_desc
RUNTIME_data_flush( sequence, A(k, n) );
RUNTIME_data_flush( sequence, T(k, n) );
}
#if defined(CHAMELEON_COPY_DIAG)
INSERT_TASK_zlacpy(
&options,
ChamUpper, tempkmin, tempkn, A->nb,
A(k, k), ldak,
D(k), ldak );
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamUpper, tempkmin, tempkn, A->nb,
A(k, k), ldak,
D(k), ldak );
#if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset(
&options,
ChamLower, tempkmin, tempkn,
0., 1.,
D(k), ldak );
#endif
INSERT_TASK_zlaset(
&options,
ChamLower, tempkmin, tempkn,
0., 1.,
D(k), ldak );
#endif
}
for (m = k; m < Q->mt; m++) {
tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb;
ldqm = BLKLDD(Q, m);
......
......@@ -29,9 +29,9 @@
/**
* Parallel construction of Q using tile V - dynamic scheduling
*/
void chameleon_pzunglq_param(const libhqr_tree_t *qrtree, CHAM_desc_t *A, CHAM_desc_t *Q,
CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t *A, CHAM_desc_t *Q,
CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
......@@ -57,10 +57,14 @@ void chameleon_pzunglq_param(const libhqr_tree_t *qrtree, CHAM_desc_t *A, CHAM_d
D = A;
}
if ( D == NULL ) {
D = A;
genD = 0;
}
/*
* zunmqr = A->nb * ib
* ztsmqr = A->nb * ib
* zttmqr = A->nb * ib
* ztpmqr = A->nb * ib
*/
ws_worker = A->nb * ib;
......@@ -68,7 +72,7 @@ void chameleon_pzunglq_param(const libhqr_tree_t *qrtree, CHAM_desc_t *A, CHAM_d
/* Worker space
*
* zunmqr = A->nb * ib
* ztsmqr = 2 * A->nb * ib
* ztpmqr = 2 * A->nb * ib
*/
ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
#endif
......@@ -138,20 +142,20 @@ void chameleon_pzunglq_param(const libhqr_tree_t *qrtree, CHAM_desc_t *A, CHAM_d
temppn = p == A->nt-1 ? A->n-p*A->nb : A->nb;
tempkmin = chameleon_min(tempkm, temppn);
#if defined(CHAMELEON_COPY_DIAG)
INSERT_TASK_zlacpy(
&options,
ChamUpper, tempkmin, temppn, A->nb,
A(k, p), ldak,
D(k, p), ldak );
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamUpper, tempkmin, temppn, A->nb,
A(k, p), ldak,
D(k, p), ldak );
#if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset(
&options,
ChamLower, tempkmin, temppn,
0., 1.,
D(k, p), ldak );
#endif
INSERT_TASK_zlaset(
&options,
ChamLower, tempkmin, temppn,
0., 1.,
D(k, p), ldak );
#endif
}
for (m = k; m < Q->mt; m++) {
tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb;
ldqm = BLKLDD(Q, m);
......
......@@ -24,23 +24,20 @@
*/
#include "control/common.h"
#define A(m,n) A, (m), (n)
#define Q(m,n) Q, (m), (n)
#define T(m,n) T, (m), (n)
#define A(m,n) A, (m), (n)
#define Q(m,n) Q, (m), (n)
#define T(m,n) T, (m), (n)
#define T2(m,n) T, (m), (n)+(A->nt)
#if defined(CHAMELEON_COPY_DIAG)
#define D(m,n) D, ((n)/BS), 0
#else
#define D(m,n) A, (m), (n)
#endif
#define D(m,n) D, (m), (n)
/**
* Parallel construction of Q using tile V (application to identity;
* reduction Householder) - dynamic scheduling
*/
void chameleon_pzunglqrh(CHAM_desc_t *A, CHAM_desc_t *Q,