Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 1d966aa5 authored by Mathieu Faverge
Browse files

Merge branch 'hotfix/qr' into 'master'

Fix diagonal tile size problem

See merge request !118
parents ddfe2c36 ec70a1f6
Branches
Tags
1 merge request: !118 Fix diagonal tile size pb
Showing with 355 additions and 317 deletions
...@@ -49,7 +49,7 @@ if(NOT BUILDNAME) ...@@ -49,7 +49,7 @@ if(NOT BUILDNAME)
if(CHAMELEON_SCHED_PARSEC) if(CHAMELEON_SCHED_PARSEC)
set(BUILDNAME "${BUILDNAME}-PaRSEC") set(BUILDNAME "${BUILDNAME}-PaRSEC")
endif(CHAMELEON_SCHED_STARPU) endif(CHAMELEON_SCHED_PARSEC)
if(CHAMELEON_SIMULATION) if(CHAMELEON_SIMULATION)
set(BUILDNAME "${BUILDNAME}-SimGrid") set(BUILDNAME "${BUILDNAME}-SimGrid")
......
...@@ -41,7 +41,7 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D ...@@ -41,7 +41,7 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
size_t ws_host = 0; size_t ws_host = 0;
int k, m, n; int k, m, n;
int ldak, ldam; int ldak, ldam, lddk;
int tempkm, tempkn, tempmm, tempnn; int tempkm, tempkn, tempmm, tempnn;
int ib, minMNT; int ib, minMNT;
...@@ -92,6 +92,7 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D ...@@ -92,6 +92,7 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
ldak = BLKLDD(A, k); ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
INSERT_TASK_zgelqt( INSERT_TASK_zgelqt(
&options, &options,
tempkm, tempkn, ib, T->nb, tempkm, tempkn, ib, T->nb,
...@@ -100,15 +101,15 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D ...@@ -100,15 +101,15 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
if ( genD ) { if ( genD ) {
INSERT_TASK_zlacpy( INSERT_TASK_zlacpy(
  • Developer

    This `D` parameter changes the main legacy API considerably. I guess the intention is to let the TS and UNM[QR/LQ] kernels run in parallel, right?

  • Author Owner

    It doesn't change anything; it has been there since the beginning of the GPU version :P

  • Author Owner

    And yes, the idea was to remove the upper/lower dependency by creating a copy. With GPUs, it is mandatory to use GEMM instead of TRMM in the unm[qr/lq] kernels.

  • Please register or sign in to reply
&options, &options,
ChamUpper, A->mb, A->nb, A->nb, ChamUpper, tempkm, tempkn, A->nb,
A(k, k), ldak, A(k, k), ldak,
D(k), ldak ); D(k), lddk );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamLower, A->mb, A->nb, ChamLower, tempkm, tempkn,
0., 1., 0., 1.,
D(k), ldak ); D(k), lddk );
#endif #endif
} }
for (m = k+1; m < A->mt; m++) { for (m = k+1; m < A->mt; m++) {
...@@ -118,7 +119,7 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D ...@@ -118,7 +119,7 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
&options, &options,
ChamRight, ChamConjTrans, ChamRight, ChamConjTrans,
tempmm, tempkn, tempkn, ib, T->nb, tempmm, tempkn, tempkn, ib, T->nb,
D(k), ldak, D(k), lddk,
T(k, k), T->mb, T(k, k), T->mb,
A(m, k), ldam); A(m, k), ldam);
} }
......
...@@ -42,7 +42,7 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t ...@@ -42,7 +42,7 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
int k, m, n, i, p; int k, m, n, i, p;
int K, L; int K, L;
int ldak, ldam; int ldak, ldam, lddk;
int tempkmin, tempkm, tempnn, tempmm, temppn; int tempkmin, tempkm, tempnn, tempmm, temppn;
int ib; int ib;
int *tiles; int *tiles;
...@@ -96,6 +96,7 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t ...@@ -96,6 +96,7 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
ldak = BLKLDD(A, k); ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
T = TS; T = TS;
/* The number of geqrt to apply */ /* The number of geqrt to apply */
...@@ -114,13 +115,13 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t ...@@ -114,13 +115,13 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
&options, &options,
ChamUpper, tempkm, temppn, A->nb, ChamUpper, tempkm, temppn, A->nb,
A(k, p), ldak, A(k, p), ldak,
D(k, p), ldak ); D(k, p), lddk );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamLower, tempkm, temppn, ChamLower, tempkm, temppn,
0., 1., 0., 1.,
D(k, p), ldak ); D(k, p), lddk );
#endif #endif
} }
for (m = k+1; m < A->mt; m++) { for (m = k+1; m < A->mt; m++) {
...@@ -130,7 +131,7 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t ...@@ -130,7 +131,7 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
&options, &options,
ChamRight, ChamConjTrans, ChamRight, ChamConjTrans,
tempmm, temppn, tempkmin, ib, T->nb, tempmm, temppn, tempkmin, ib, T->nb,
D(k, p), ldak, D(k, p), lddk,
T(k, p), T->mb, T(k, p), T->mb,
A(m, p), ldam); A(m, p), ldam);
} }
......
...@@ -44,7 +44,7 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM ...@@ -44,7 +44,7 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
int k, m, n; int k, m, n;
int K, N, RD; int K, N, RD;
int ldak, ldam; int ldak, ldam, lddk;
int tempkmin, tempkm, tempNn, tempnn, tempmm, tempNRDn; int tempkmin, tempkm, tempNn, tempnn, tempmm, tempNRDn;
int ib; int ib;
...@@ -91,6 +91,8 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM ...@@ -91,6 +91,8 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
ldak = BLKLDD(A, k); ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
for (N = k; N < A->nt; N += BS) { for (N = k; N < A->nt; N += BS) {
tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb; tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb;
tempkmin = chameleon_min(tempkm, tempNn); tempkmin = chameleon_min(tempkm, tempNn);
...@@ -104,13 +106,13 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM ...@@ -104,13 +106,13 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
&options, &options,
ChamUpper, tempkm, tempNn, A->nb, ChamUpper, tempkm, tempNn, A->nb,
A(k, N), ldak, A(k, N), ldak,
D(k, N), ldak ); D(k, N), lddk );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamLower, tempkm, tempNn, ChamLower, tempkm, tempNn,
0., 1., 0., 1.,
D(k, N), ldak ); D(k, N), lddk );
#endif #endif
} }
for (m = k+1; m < A->mt; m++) { for (m = k+1; m < A->mt; m++) {
...@@ -120,7 +122,7 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM ...@@ -120,7 +122,7 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
&options, &options,
ChamRight, ChamConjTrans, ChamRight, ChamConjTrans,
tempmm, tempNn, tempkmin, ib, T->nb, tempmm, tempNn, tempkmin, ib, T->nb,
D(k, N), ldak, D(k, N), lddk,
T(k, N), T->mb, T(k, N), T->mb,
A(m, N), ldam); A(m, N), ldam);
} }
......
...@@ -41,7 +41,7 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D ...@@ -41,7 +41,7 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
size_t ws_host = 0; size_t ws_host = 0;
int k, m, n; int k, m, n;
int ldak, ldam; int ldak, ldam, lddk;
int tempkm, tempkn, tempnn, tempmm; int tempkm, tempkn, tempnn, tempmm;
int ib; int ib;
int minMNT = chameleon_min(A->mt, A->nt); int minMNT = chameleon_min(A->mt, A->nt);
...@@ -87,6 +87,7 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D ...@@ -87,6 +87,7 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
ldak = BLKLDD(A, k); ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
INSERT_TASK_zgeqrt( INSERT_TASK_zgeqrt(
&options, &options,
tempkm, tempkn, ib, T->nb, tempkm, tempkn, ib, T->nb,
...@@ -95,15 +96,15 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D ...@@ -95,15 +96,15 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
if ( genD ) { if ( genD ) {
INSERT_TASK_zlacpy( INSERT_TASK_zlacpy(
&options, &options,
ChamLower, A->mb, A->nb, A->nb, ChamLower, tempkm, tempkn, A->nb,
A(k, k), ldak, A(k, k), ldak,
D(k), ldak ); D(k), lddk );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamUpper, A->mb, A->nb, ChamUpper, tempkm, tempkn,
0., 1., 0., 1.,
D(k), ldak ); D(k), lddk );
#endif #endif
} }
for (n = k+1; n < A->nt; n++) { for (n = k+1; n < A->nt; n++) {
...@@ -112,7 +113,7 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D ...@@ -112,7 +113,7 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
&options, &options,
ChamLeft, ChamConjTrans, ChamLeft, ChamConjTrans,
tempkm, tempnn, tempkm, ib, T->nb, tempkm, tempnn, tempkm, ib, T->nb,
D(k), ldak, D(k), lddk,
T(k, k), T->mb, T(k, k), T->mb,
A(k, n), ldak); A(k, n), ldak);
} }
......
...@@ -41,7 +41,7 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t ...@@ -41,7 +41,7 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
int k, m, n, i, p; int k, m, n, i, p;
int K, L, nbgeqrt; int K, L, nbgeqrt;
int ldap, ldam; int ldap, ldam, lddm;
int tempkmin, tempkn, tempnn, tempmm; int tempkmin, tempkn, tempnn, tempmm;
int ib; int ib;
int *tiles; int *tiles;
...@@ -97,6 +97,7 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t ...@@ -97,6 +97,7 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
tempkmin = chameleon_min(tempmm, tempkn); tempkmin = chameleon_min(tempmm, tempkn);
ldam = BLKLDD(A, m); ldam = BLKLDD(A, m);
lddm = BLKLDD(D, m);
T = TS; T = TS;
...@@ -108,15 +109,15 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t ...@@ -108,15 +109,15 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
if ( genD ) { if ( genD ) {
INSERT_TASK_zlacpy( INSERT_TASK_zlacpy(
&options, &options,
ChamLower, tempmm, A->nb, A->nb, ChamLower, tempmm, tempkn, A->nb,
A(m, k), ldam, A(m, k), ldam,
D(m, k), ldam ); D(m, k), lddm );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamUpper, tempmm, A->nb, ChamUpper, tempmm, tempkn,
0., 1., 0., 1.,
D(m, k), ldam ); D(m, k), lddm );
#endif #endif
} }
for (n = k+1; n < A->nt; n++) { for (n = k+1; n < A->nt; n++) {
...@@ -125,7 +126,7 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t ...@@ -125,7 +126,7 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
&options, &options,
ChamLeft, ChamConjTrans, ChamLeft, ChamConjTrans,
tempmm, tempnn, tempkmin, ib, T->nb, tempmm, tempnn, tempkmin, ib, T->nb,
D(m, k), ldam, D(m, k), lddm,
T(m, k), T->mb, T(m, k), T->mb,
A(m, n), ldam); A(m, n), ldam);
} }
......
...@@ -44,7 +44,7 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM ...@@ -44,7 +44,7 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
int k, m, n; int k, m, n;
int K, M, RD; int K, M, RD;
int ldaM, ldam, ldaMRD; int ldaM, ldam, ldaMRD, lddM;
int tempkmin, tempkn, tempMm, tempnn, tempmm, tempMRDm; int tempkmin, tempkn, tempMm, tempnn, tempmm, tempMRDm;
int ib; int ib;
...@@ -92,6 +92,7 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM ...@@ -92,6 +92,7 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
tempMm = M == A->mt-1 ? A->m-M*A->mb : A->mb; tempMm = M == A->mt-1 ? A->m-M*A->mb : A->mb;
tempkmin = chameleon_min(tempMm, tempkn); tempkmin = chameleon_min(tempMm, tempkn);
ldaM = BLKLDD(A, M); ldaM = BLKLDD(A, M);
lddM = BLKLDD(D, M);
INSERT_TASK_zgeqrt( INSERT_TASK_zgeqrt(
&options, &options,
...@@ -101,15 +102,15 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM ...@@ -101,15 +102,15 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
if ( genD ) { if ( genD ) {
INSERT_TASK_zlacpy( INSERT_TASK_zlacpy(
&options, &options,
ChamLower, tempMm, A->nb, A->nb, ChamLower, tempMm, tempkn, A->nb,
A(M, k), ldaM, A(M, k), ldaM,
D(M, k), ldaM ); D(M, k), lddM );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamUpper, tempMm, A->nb, ChamUpper, tempMm, tempkn,
0., 1., 0., 1.,
D(M, k), ldaM ); D(M, k), lddM );
#endif #endif
} }
for (n = k+1; n < A->nt; n++) { for (n = k+1; n < A->nt; n++) {
...@@ -118,7 +119,7 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM ...@@ -118,7 +119,7 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
&options, &options,
ChamLeft, ChamConjTrans, ChamLeft, ChamConjTrans,
tempMm, tempnn, tempkmin, ib, T->nb, tempMm, tempnn, tempkmin, ib, T->nb,
D(M, k), ldaM, D(M, k), lddM,
T(M, k), T->mb, T(M, k), T->mb,
A(M, n), ldaM); A(M, n), ldaM);
} }
......
This diff is collapsed.
...@@ -44,7 +44,7 @@ void chameleon_pztpgqrt( int genD, int L, ...@@ -44,7 +44,7 @@ void chameleon_pztpgqrt( int genD, int L,
size_t ws_host = 0; size_t ws_host = 0;
int k, m, n; int k, m, n;
int ldvk, ldvm; int ldvk, ldvm, lddk;
int ldqk, ldqm; int ldqk, ldqm;
int tempkm, tempkn, tempkk, tempnn, tempmm, templm; int tempkm, tempkn, tempkk, tempnn, tempmm, templm;
int ib; int ib;
...@@ -92,6 +92,7 @@ void chameleon_pztpgqrt( int genD, int L, ...@@ -92,6 +92,7 @@ void chameleon_pztpgqrt( int genD, int L,
tempkk = k == V1->nt-1 ? V1->n-k*V1->nb : V1->nb; tempkk = k == V1->nt-1 ? V1->n-k*V1->nb : V1->nb;
tempkn = k == Q1->nt-1 ? Q1->n-k*Q1->nb : Q1->nb; tempkn = k == Q1->nt-1 ? Q1->n-k*Q1->nb : Q1->nb;
ldvk = BLKLDD(V1, k); ldvk = BLKLDD(V1, k);
lddk = BLKLDD(D, k);
ldqk = BLKLDD(Q1, k); ldqk = BLKLDD(Q1, k);
/* Equivalent to the tsmqr step on Q1,Q2 */ /* Equivalent to the tsmqr step on Q1,Q2 */
...@@ -139,13 +140,13 @@ void chameleon_pztpgqrt( int genD, int L, ...@@ -139,13 +140,13 @@ void chameleon_pztpgqrt( int genD, int L,
&options, &options,
ChamLower, tempkm, tempkk, V1->nb, ChamLower, tempkm, tempkk, V1->nb,
V1(k, k), ldvk, V1(k, k), ldvk,
D(k), ldvk ); D(k), lddk );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamUpper, tempkm, tempkk, ChamUpper, tempkm, tempkk,
0., 1., 0., 1.,
D(k), ldvk ); D(k), lddk );
#endif #endif
} }
for (n = k; n < Q1->nt; n++) { for (n = k; n < Q1->nt; n++) {
...@@ -154,7 +155,7 @@ void chameleon_pztpgqrt( int genD, int L, ...@@ -154,7 +155,7 @@ void chameleon_pztpgqrt( int genD, int L,
&options, &options,
ChamLeft, ChamNoTrans, ChamLeft, ChamNoTrans,
tempkm, tempnn, tempkk, ib, T1->nb, tempkm, tempnn, tempkk, ib, T1->nb,
D(k), ldvk, D(k), lddk,
T1(k, k), T1->mb, T1(k, k), T1->mb,
Q1(k, n), ldqk); Q1(k, n), ldqk);
} }
......
...@@ -42,7 +42,7 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T ...@@ -42,7 +42,7 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T
size_t ws_host = 0; size_t ws_host = 0;
int k, m, n; int k, m, n;
int ldak, ldqm; int ldak, ldqm, lddk;
int tempnn, tempmm, tempkmin, tempkn; int tempnn, tempmm, tempkmin, tempkn;
int tempAkm, tempAkn; int tempAkm, tempAkn;
int ib, minMT; int ib, minMT;
...@@ -94,6 +94,8 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T ...@@ -94,6 +94,8 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T
tempkmin = chameleon_min( tempAkn, tempAkm ); tempkmin = chameleon_min( tempAkn, tempAkm );
tempkn = k == Q->nt-1 ? Q->n-k*Q->nb : Q->nb; tempkn = k == Q->nt-1 ? Q->n-k*Q->nb : Q->nb;
ldak = BLKLDD(A, k); ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
for (n = Q->nt-1; n > k; n--) { for (n = Q->nt-1; n > k; n--) {
tempnn = n == Q->nt-1 ? Q->n-n*Q->nb : Q->nb; tempnn = n == Q->nt-1 ? Q->n-n*Q->nb : Q->nb;
for (m = 0; m < Q->mt; m++) { for (m = 0; m < Q->mt; m++) {
...@@ -117,19 +119,18 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T ...@@ -117,19 +119,18 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T
RUNTIME_data_flush( sequence, T(k, n) ); RUNTIME_data_flush( sequence, T(k, n) );
} }
if ( genD ) { if ( genD ) {
INSERT_TASK_zlacpy( INSERT_TASK_zlacpy(
&options, &options,
ChamUpper, tempkmin, tempkn, A->nb, ChamUpper, tempkmin, tempkn, A->nb,
A(k, k), ldak, A(k, k), ldak,
D(k), ldak ); D(k), lddk );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamLower, tempkmin, tempkn, ChamLower, tempkmin, tempkn,
0., 1., 0., 1.,
D(k), ldak ); D(k), lddk );
#endif #endif
} }
for (m = k; m < Q->mt; m++) { for (m = k; m < Q->mt; m++) {
...@@ -143,7 +144,7 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T ...@@ -143,7 +144,7 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T
&options, &options,
ChamRight, ChamNoTrans, ChamRight, ChamNoTrans,
tempmm, tempkn, tempkmin, ib, T->nb, tempmm, tempkn, tempkmin, ib, T->nb,
D(k), ldak, D(k), lddk,
T(k, k), T->mb, T(k, k), T->mb,
Q(m, k), ldqm); Q(m, k), ldqm);
} }
......
...@@ -41,7 +41,7 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t ...@@ -41,7 +41,7 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
int k, m, n, i, p; int k, m, n, i, p;
int K, L; int K, L;
int ldak, ldqm; int ldak, ldqm, lddk;
int tempkm, tempkmin, temppn, tempnn, tempmm; int tempkm, tempkmin, temppn, tempnn, tempmm;
int ib; int ib;
int *tiles; int *tiles;
...@@ -93,6 +93,7 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t ...@@ -93,6 +93,7 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
ldak = BLKLDD(A, k); ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
/* Setting the order of the tiles*/ /* Setting the order of the tiles*/
libhqr_walk_stepk(qrtree, k, tiles + (k+1)); libhqr_walk_stepk(qrtree, k, tiles + (k+1));
...@@ -147,13 +148,13 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t ...@@ -147,13 +148,13 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
&options, &options,
ChamUpper, tempkmin, temppn, A->nb, ChamUpper, tempkmin, temppn, A->nb,
A(k, p), ldak, A(k, p), ldak,
D(k, p), ldak ); D(k, p), lddk );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamLower, tempkmin, temppn, ChamLower, tempkmin, temppn,
0., 1., 0., 1.,
D(k, p), ldak ); D(k, p), lddk );
#endif #endif
} }
for (m = k; m < Q->mt; m++) { for (m = k; m < Q->mt; m++) {
...@@ -167,7 +168,7 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t ...@@ -167,7 +168,7 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
&options, &options,
ChamRight, ChamNoTrans, ChamRight, ChamNoTrans,
tempmm, temppn, tempkmin, ib, T->nb, tempmm, temppn, tempkmin, ib, T->nb,
D(k, p), ldak, D(k, p), lddk,
T(k, p), T->mb, T(k, p), T->mb,
Q(m, p), ldqm); Q(m, p), ldqm);
} }
......
...@@ -46,8 +46,7 @@ void chameleon_pzunglqrh( int genD, int BS, ...@@ -46,8 +46,7 @@ void chameleon_pzunglqrh( int genD, int BS,
int k, m, n; int k, m, n;
int K, N, RD, lastRD; int K, N, RD, lastRD;
int ldak; int ldak, lddk, ldqm;
int ldqm;
int tempkm, tempkmin, tempNn, tempnn, tempmm, tempNRDn; int tempkm, tempkmin, tempNn, tempnn, tempmm, tempNRDn;
int ib; int ib;
...@@ -89,6 +88,7 @@ void chameleon_pzunglqrh( int genD, int BS, ...@@ -89,6 +88,7 @@ void chameleon_pzunglqrh( int genD, int BS,
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
ldak = BLKLDD(A, k); ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
lastRD = 0; lastRD = 0;
for (RD = BS; RD < A->nt-k; RD *= 2) for (RD = BS; RD < A->nt-k; RD *= 2)
lastRD = RD; lastRD = RD;
...@@ -154,13 +154,13 @@ void chameleon_pzunglqrh( int genD, int BS, ...@@ -154,13 +154,13 @@ void chameleon_pzunglqrh( int genD, int BS,
&options, &options,
ChamUpper, tempkmin, tempNn, A->nb, ChamUpper, tempkmin, tempNn, A->nb,
A(k, N), ldak, A(k, N), ldak,
D(k, N), ldak ); D(k, N), lddk );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamLower, tempkmin, tempNn, ChamLower, tempkmin, tempNn,
0., 1., 0., 1.,
D(k, N), ldak ); D(k, N), lddk );
#endif #endif
} }
for (m = k; m < Q->mt; m++) { for (m = k; m < Q->mt; m++) {
...@@ -175,7 +175,7 @@ void chameleon_pzunglqrh( int genD, int BS, ...@@ -175,7 +175,7 @@ void chameleon_pzunglqrh( int genD, int BS,
ChamRight, ChamNoTrans, ChamRight, ChamNoTrans,
tempmm, tempNn, tempmm, tempNn,
tempkmin, ib, T->nb, tempkmin, ib, T->nb,
D(k, N), ldak, D(k, N), lddk,
T(k, N), T->mb, T(k, N), T->mb,
Q(m, N), ldqm); Q(m, N), ldqm);
} }
......
...@@ -43,7 +43,7 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, ...@@ -43,7 +43,7 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q,
size_t ws_host = 0; size_t ws_host = 0;
int k, m, n; int k, m, n;
int ldak, ldqk, ldam, ldqm; int ldak, ldqk, ldam, ldqm, lddk;
int tempmm, tempnn, tempkmin, tempkm; int tempmm, tempnn, tempkmin, tempkm;
int tempAkm, tempAkn; int tempAkm, tempAkn;
int ib, minMT; int ib, minMT;
...@@ -95,6 +95,7 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, ...@@ -95,6 +95,7 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q,
tempkmin = chameleon_min( tempAkn, tempAkm ); tempkmin = chameleon_min( tempAkn, tempAkm );
tempkm = k == Q->mt-1 ? Q->m-k*Q->mb : Q->mb; tempkm = k == Q->mt-1 ? Q->m-k*Q->mb : Q->mb;
ldak = BLKLDD(A, k); ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
ldqk = BLKLDD(Q, k); ldqk = BLKLDD(Q, k);
for (m = Q->mt - 1; m > k; m--) { for (m = Q->mt - 1; m > k; m--) {
tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb; tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb;
...@@ -125,13 +126,13 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, ...@@ -125,13 +126,13 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q,
&options, &options,
ChamLower, tempkm, tempkmin, A->nb, ChamLower, tempkm, tempkmin, A->nb,
A(k, k), ldak, A(k, k), ldak,
D(k), ldak ); D(k), lddk );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamUpper, tempkm, tempkmin, ChamUpper, tempkm, tempkmin,
0., 1., 0., 1.,
D(k), ldak ); D(k), lddk );
#endif #endif
} }
for (n = k; n < Q->nt; n++) { for (n = k; n < Q->nt; n++) {
...@@ -145,7 +146,7 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, ...@@ -145,7 +146,7 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q,
&options, &options,
ChamLeft, ChamNoTrans, ChamLeft, ChamNoTrans,
tempkm, tempnn, tempkmin, ib, T->nb, tempkm, tempnn, tempkmin, ib, T->nb,
D(k), ldak, D(k), lddk,
T(k, k), T->mb, T(k, k), T->mb,
Q(k, n), ldqk); Q(k, n), ldqk);
} }
......
...@@ -41,7 +41,7 @@ void chameleon_pzungqr_param( int genD, const libhqr_tree_t *qrtree, ...@@ -41,7 +41,7 @@ void chameleon_pzungqr_param( int genD, const libhqr_tree_t *qrtree,
size_t ws_host = 0; size_t ws_host = 0;
int k, m, n, i, p, L; int k, m, n, i, p, L;
int ldam, ldqm, ldqp; int ldam, ldqm, ldqp, lddm;
int tempmm, tempnn, tempkmin, tempkn; int tempmm, tempnn, tempkmin, tempkn;
int ib, minMT; int ib, minMT;
int *tiles; int *tiles;
...@@ -145,6 +145,7 @@ void chameleon_pzungqr_param( int genD, const libhqr_tree_t *qrtree, ...@@ -145,6 +145,7 @@ void chameleon_pzungqr_param( int genD, const libhqr_tree_t *qrtree,
tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
tempkmin = chameleon_min(tempmm, tempkn); tempkmin = chameleon_min(tempmm, tempkn);
ldam = BLKLDD(A, m); ldam = BLKLDD(A, m);
lddm = BLKLDD(D, m);
ldqm = BLKLDD(Q, m); ldqm = BLKLDD(Q, m);
if ( genD ) { if ( genD ) {
...@@ -152,13 +153,13 @@ void chameleon_pzungqr_param( int genD, const libhqr_tree_t *qrtree, ...@@ -152,13 +153,13 @@ void chameleon_pzungqr_param( int genD, const libhqr_tree_t *qrtree,
&options, &options,
ChamLower, tempmm, tempkmin, A->nb, ChamLower, tempmm, tempkmin, A->nb,
A(m, k), ldam, A(m, k), ldam,
D(m, k), ldam ); D(m, k), lddm );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamUpper, tempmm, tempkmin, ChamUpper, tempmm, tempkmin,
0., 1., 0., 1.,
D(m, k), ldam ); D(m, k), lddm );
#endif #endif
} }
...@@ -173,7 +174,7 @@ void chameleon_pzungqr_param( int genD, const libhqr_tree_t *qrtree, ...@@ -173,7 +174,7 @@ void chameleon_pzungqr_param( int genD, const libhqr_tree_t *qrtree,
&options, &options,
ChamLeft, ChamNoTrans, ChamLeft, ChamNoTrans,
tempmm, tempnn, tempkmin, ib, T->nb, tempmm, tempnn, tempkmin, ib, T->nb,
D(m, k), ldam, D(m, k), lddm,
T(m, k), T->mb, T(m, k), T->mb,
Q(m, n), ldqm); Q(m, n), ldqm);
} }
......
...@@ -48,7 +48,7 @@ void chameleon_pzungqrrh( int genD, int BS, ...@@ -48,7 +48,7 @@ void chameleon_pzungqrrh( int genD, int BS,
int k, m, n; int k, m, n;
int K, M, RD, lastRD; int K, M, RD, lastRD;
int ldaM, ldam, ldaMRD; int ldaM, ldam, ldaMRD, lddM;
int ldqM, ldqm, ldqMRD; int ldqM, ldqm, ldqMRD;
int tempkn, tempMm, tempnn, tempmm, tempMRDm, tempkmin; int tempkn, tempMm, tempnn, tempmm, tempMRDm, tempkmin;
int ib; int ib;
...@@ -127,6 +127,7 @@ void chameleon_pzungqrrh( int genD, int BS, ...@@ -127,6 +127,7 @@ void chameleon_pzungqrrh( int genD, int BS,
tempMm = M == A->mt-1 ? A->m-M*A->mb : A->mb; tempMm = M == A->mt-1 ? A->m-M*A->mb : A->mb;
tempkmin = chameleon_min(tempMm, tempkn); tempkmin = chameleon_min(tempMm, tempkn);
ldaM = BLKLDD(A, M); ldaM = BLKLDD(A, M);
lddM = BLKLDD(D, M);
ldqM = BLKLDD(Q, M); ldqM = BLKLDD(Q, M);
for (m = chameleon_min(M+BS, A->mt)-1; m > M; m--) { for (m = chameleon_min(M+BS, A->mt)-1; m > M; m--) {
tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
...@@ -160,13 +161,13 @@ void chameleon_pzungqrrh( int genD, int BS, ...@@ -160,13 +161,13 @@ void chameleon_pzungqrrh( int genD, int BS,
&options, &options,
ChamLower, tempMm, tempkmin, A->nb, ChamLower, tempMm, tempkmin, A->nb,
A(M, k), ldaM, A(M, k), ldaM,
D(M, k), ldaM ); D(M, k), lddM );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamUpper, tempMm, tempkmin, ChamUpper, tempMm, tempkmin,
0., 1., 0., 1.,
D(M, k), ldaM ); D(M, k), lddM );
#endif #endif
} }
for (n = k; n < Q->nt; n++) { for (n = k; n < Q->nt; n++) {
...@@ -181,7 +182,7 @@ void chameleon_pzungqrrh( int genD, int BS, ...@@ -181,7 +182,7 @@ void chameleon_pzungqrrh( int genD, int BS,
ChamLeft, ChamNoTrans, ChamLeft, ChamNoTrans,
tempMm, tempnn, tempMm, tempnn,
tempkmin, ib, T->nb, tempkmin, ib, T->nb,
D(M, k), ldaM, D(M, k), lddM,
T(M, k), T->mb, T(M, k), T->mb,
Q(M, n), ldqM); Q(M, n), ldqM);
} }
......
...@@ -44,7 +44,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, ...@@ -44,7 +44,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
size_t ws_host = 0; size_t ws_host = 0;
int k, m, n; int k, m, n;
int ldak, ldbk, ldbm; int ldak, ldbk, ldbm, lddk;
int tempmm, tempnn, tempkn, tempkm, tempkmin; int tempmm, tempnn, tempkn, tempkm, tempkmin;
int ib, minMT, minM; int ib, minMT, minM;
...@@ -97,21 +97,24 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, ...@@ -97,21 +97,24 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
RUNTIME_iteration_push(chamctxt, k); RUNTIME_iteration_push(chamctxt, k);
tempkm = k == B->mt-1 ? B->m-k*B->mb : B->mb; tempkm = k == B->mt-1 ? B->m-k*B->mb : B->mb;
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb; tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb;
ldak = BLKLDD(A, k); ldak = BLKLDD(A, k);
ldbk = BLKLDD(B, k); ldbk = BLKLDD(B, k);
lddk = BLKLDD(D, k);
if ( genD ) { if ( genD ) {
INSERT_TASK_zlacpy( INSERT_TASK_zlacpy(
&options, &options,
ChamUpper, tempkmin, tempkm, A->nb, ChamUpper, tempkmin, tempkn, A->nb,
A(k, k), ldak, A(k, k), ldak,
D(k), ldak ); D(k), lddk );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamLower, tempkmin, tempkm, ChamLower, tempkmin, tempkn,
0., 1., 0., 1.,
D(k), ldak ); D(k), lddk );
#endif #endif
} }
for (n = 0; n < B->nt; n++) { for (n = 0; n < B->nt; n++) {
...@@ -120,7 +123,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, ...@@ -120,7 +123,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
&options, &options,
side, trans, side, trans,
tempkm, tempnn, tempkmin, ib, T->nb, tempkm, tempnn, tempkmin, ib, T->nb,
D(k), ldak, D(k), lddk,
T(k, k), T->mb, T(k, k), T->mb,
B(k, n), ldbk); B(k, n), ldbk);
} }
...@@ -168,10 +171,13 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, ...@@ -168,10 +171,13 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
for (k = minMT-1; k >= 0; k--) { for (k = minMT-1; k >= 0; k--) {
RUNTIME_iteration_push(chamctxt, k); RUNTIME_iteration_push(chamctxt, k);
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
tempkm = k == B->mt-1 ? B->m-k*B->mb : B->mb; tempkm = k == B->mt-1 ? B->m-k*B->mb : B->mb;
tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb; tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb;
ldak = BLKLDD(A, k); ldak = BLKLDD(A, k);
ldbk = BLKLDD(B, k); ldbk = BLKLDD(B, k);
lddk = BLKLDD(D, k);
for (m = B->mt-1; m > k; m--) { for (m = B->mt-1; m > k; m--) {
tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb;
ldbm = BLKLDD(B, m); ldbm = BLKLDD(B, m);
...@@ -198,15 +204,15 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, ...@@ -198,15 +204,15 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
if ( genD ) { if ( genD ) {
INSERT_TASK_zlacpy( INSERT_TASK_zlacpy(
&options, &options,
ChamUpper, tempkmin, tempkm, A->nb, ChamUpper, tempkmin, tempkn, A->nb,
A(k, k), ldak, A(k, k), ldak,
D(k), ldak ); D(k), lddk );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamLower, tempkmin, tempkm, ChamLower, tempkmin, tempkn,
0., 1., 0., 1.,
D(k), ldak ); D(k), lddk );
#endif #endif
} }
for (n = 0; n < B->nt; n++) { for (n = 0; n < B->nt; n++) {
...@@ -219,7 +225,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, ...@@ -219,7 +225,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
&options, &options,
side, trans, side, trans,
tempkm, tempnn, tempkmin, ib, T->nb, tempkm, tempnn, tempkmin, ib, T->nb,
D(k), ldak, D(k), lddk,
T(k, k), T->mb, T(k, k), T->mb,
B(k, n), ldbk); B(k, n), ldbk);
} }
...@@ -240,6 +246,8 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, ...@@ -240,6 +246,8 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
tempkn = k == B->nt - 1 ? B->n - k * B->nb : B->nb; tempkn = k == B->nt - 1 ? B->n - k * B->nb : B->nb;
tempkmin = k == minMT - 1 ? minM - k * A->nb : A->nb; tempkmin = k == minMT - 1 ? minM - k * A->nb : A->nb;
ldak = BLKLDD(A, k); ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
for (n = B->nt-1; n > k; n--) { for (n = B->nt-1; n > k; n--) {
tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb;
for (m = 0; m < B->mt; m++) { for (m = 0; m < B->mt; m++) {
...@@ -268,13 +276,13 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, ...@@ -268,13 +276,13 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
&options, &options,
ChamUpper, tempkmin, tempkn, A->nb, ChamUpper, tempkmin, tempkn, A->nb,
A(k, k), ldak, A(k, k), ldak,
D(k), ldak ); D(k), lddk );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamLower, tempkmin, tempkn, ChamLower, tempkmin, tempkn,
0., 1., 0., 1.,
D(k), ldak ); D(k), lddk );
#endif #endif
} }
for (m = 0; m < B->mt; m++) { for (m = 0; m < B->mt; m++) {
...@@ -288,7 +296,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, ...@@ -288,7 +296,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
&options, &options,
side, trans, side, trans,
tempmm, tempkn, tempkmin, ib, T->nb, tempmm, tempkn, tempkmin, ib, T->nb,
D(k), ldak, D(k), lddk,
T(k, k), T->mb, T(k, k), T->mb,
B(m, k), ldbm); B(m, k), ldbm);
} }
...@@ -309,18 +317,20 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, ...@@ -309,18 +317,20 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
tempkn = k == B->nt-1 ? B->n-k*B->nb : B->nb; tempkn = k == B->nt-1 ? B->n-k*B->nb : B->nb;
tempkmin = k == minMT-1 ? minM-k*A->mb : A->mb; tempkmin = k == minMT-1 ? minM-k*A->mb : A->mb;
ldak = BLKLDD(A, k); ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
if ( genD ) { if ( genD ) {
INSERT_TASK_zlacpy( INSERT_TASK_zlacpy(
&options, &options,
ChamUpper, tempkmin, tempkn, A->nb, ChamUpper, tempkmin, tempkn, A->nb,
A(k, k), ldak, A(k, k), ldak,
D(k), ldak ); D(k), lddk );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamLower, tempkmin, tempkn, ChamLower, tempkmin, tempkn,
0., 1., 0., 1.,
D(k), ldak ); D(k), lddk );
#endif #endif
} }
for (m = 0; m < B->mt; m++) { for (m = 0; m < B->mt; m++) {
...@@ -330,7 +340,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, ...@@ -330,7 +340,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
&options, &options,
side, trans, side, trans,
tempmm, tempkn, tempkmin, ib, T->nb, tempmm, tempkn, tempkmin, ib, T->nb,
D(k), ldak, D(k), lddk,
T(k, k), T->mb, T(k, k), T->mb,
B(m, k), ldbm); B(m, k), ldbm);
} }
......
...@@ -42,7 +42,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, ...@@ -42,7 +42,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
size_t ws_host = 0; size_t ws_host = 0;
int k, m, n, i, p; int k, m, n, i, p;
int ldbm, ldak, ldbp; int ldbm, ldak, ldbp, lddk;
int tempnn, temppn, tempkmin, tempmm, tempkm; int tempnn, temppn, tempkmin, tempmm, tempkm;
int ib, K, L; int ib, K, L;
int *tiles; int *tiles;
...@@ -95,6 +95,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, ...@@ -95,6 +95,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
ldak = BLKLDD(A, k); ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
T = TS; T = TS;
for (i = 0; i < qrtree->getnbgeqrf(qrtree, k); i++) { for (i = 0; i < qrtree->getnbgeqrf(qrtree, k); i++) {
...@@ -109,13 +110,13 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, ...@@ -109,13 +110,13 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
&options, &options,
ChamUpper, tempkmin, temppn, A->nb, ChamUpper, tempkmin, temppn, A->nb,
A(k, p), ldak, A(k, p), ldak,
D(k, p), ldak ); D(k, p), lddk );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamLower, tempkmin, temppn, ChamLower, tempkmin, temppn,
0., 1., 0., 1.,
D(k, p), ldak ); D(k, p), lddk );
#endif #endif
} }
for (n = 0; n < B->nt; n++) { for (n = 0; n < B->nt; n++) {
...@@ -124,7 +125,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, ...@@ -124,7 +125,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
&options, &options,
side, trans, side, trans,
temppn, tempnn, tempkmin, ib, T->nb, temppn, tempnn, tempkmin, ib, T->nb,
D(k, p), ldak, D(k, p), lddk,
T(k, p), T->mb, T(k, p), T->mb,
B(p, n), ldbp); B(p, n), ldbp);
} }
...@@ -193,6 +194,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, ...@@ -193,6 +194,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
ldak = BLKLDD(A, k); ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
/* Setting the order of the tiles*/ /* Setting the order of the tiles*/
libhqr_walk_stepk(qrtree, k, tiles + (k+1)); libhqr_walk_stepk(qrtree, k, tiles + (k+1));
...@@ -249,13 +251,13 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, ...@@ -249,13 +251,13 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
&options, &options,
ChamUpper, tempkmin, temppn, A->nb, ChamUpper, tempkmin, temppn, A->nb,
A(k, p), ldak, A(k, p), ldak,
D(k, p), ldak ); D(k, p), lddk );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamLower, tempkmin, temppn, ChamLower, tempkmin, temppn,
0., 1., 0., 1.,
D(k, p), ldak ); D(k, p), lddk );
#endif #endif
} }
for (n = 0; n < B->nt; n++) { for (n = 0; n < B->nt; n++) {
...@@ -268,7 +270,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, ...@@ -268,7 +270,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
&options, &options,
side, trans, side, trans,
temppn, tempnn, tempkmin, ib, T->nb, temppn, tempnn, tempkmin, ib, T->nb,
D(k, p), ldak, D(k, p), lddk,
T(k, p), T->mb, T(k, p), T->mb,
B(p, n), ldbp); B(p, n), ldbp);
} }
...@@ -291,6 +293,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, ...@@ -291,6 +293,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
ldak = BLKLDD(A, k); ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
/* Setting the order of the tiles*/ /* Setting the order of the tiles*/
libhqr_walk_stepk(qrtree, k, tiles + (k+1)); libhqr_walk_stepk(qrtree, k, tiles + (k+1));
...@@ -345,13 +348,13 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, ...@@ -345,13 +348,13 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
&options, &options,
ChamUpper, tempkmin, temppn, A->nb, ChamUpper, tempkmin, temppn, A->nb,
A(k, p), ldak, A(k, p), ldak,
D(k, p), ldak ); D(k, p), lddk );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamLower, tempkmin, temppn, ChamLower, tempkmin, temppn,
0., 1., 0., 1.,
D(k, p), ldak ); D(k, p), lddk );
#endif #endif
} }
for (m = 0; m < B->mt; m++) { for (m = 0; m < B->mt; m++) {
...@@ -365,7 +368,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, ...@@ -365,7 +368,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
&options, &options,
side, trans, side, trans,
tempmm, temppn, tempkmin, ib, T->nb, tempmm, temppn, tempkmin, ib, T->nb,
D(k, p), ldak, D(k, p), lddk,
T(k, p), T->mb, T(k, p), T->mb,
B(m, p), ldbm); B(m, p), ldbm);
} }
...@@ -386,6 +389,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, ...@@ -386,6 +389,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
ldak = BLKLDD(A, k); ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
T = TS; T = TS;
for (i = 0; i < qrtree->getnbgeqrf(qrtree, k); i++) { for (i = 0; i < qrtree->getnbgeqrf(qrtree, k); i++) {
...@@ -399,13 +403,13 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, ...@@ -399,13 +403,13 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
&options, &options,
ChamUpper, tempkmin, temppn, A->nb, ChamUpper, tempkmin, temppn, A->nb,
A(k, p), ldak, A(k, p), ldak,
D(k, p), ldak ); D(k, p), lddk );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamLower, tempkmin, temppn, ChamLower, tempkmin, temppn,
0., 1., 0., 1.,
D(k, p), ldak ); D(k, p), lddk );
#endif #endif
} }
for (m = 0; m < B->mt; m++) { for (m = 0; m < B->mt; m++) {
...@@ -415,7 +419,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, ...@@ -415,7 +419,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
&options, &options,
side, trans, side, trans,
tempmm, temppn, tempkmin, ib, T->nb, tempmm, temppn, tempkmin, ib, T->nb,
D(k, p), ldak, D(k, p), lddk,
T(k, p), TS->mb, T(k, p), TS->mb,
B(m, p), ldbm); B(m, p), ldbm);
} }
......
...@@ -47,7 +47,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans ...@@ -47,7 +47,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
int k, m, n; int k, m, n;
int K, N, RD, lastRD; int K, N, RD, lastRD;
int ldak, ldbN, ldbm, ldbNRD; int ldak, lddk, ldbN, ldbm, ldbNRD;
int tempNn, tempkm, tempnn, tempmm, tempNRDn, tempkmin; int tempNn, tempkm, tempnn, tempmm, tempNRDn, tempkmin;
int ib; int ib;
...@@ -95,6 +95,8 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans ...@@ -95,6 +95,8 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
ldak = BLKLDD(A, k); ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
for (N = k; N < A->nt; N += BS) { for (N = k; N < A->nt; N += BS) {
tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb; tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb;
tempkmin = chameleon_min(tempkm,tempNn); tempkmin = chameleon_min(tempkm,tempNn);
...@@ -104,13 +106,13 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans ...@@ -104,13 +106,13 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
&options, &options,
ChamUpper, tempkmin, tempNn, A->nb, ChamUpper, tempkmin, tempNn, A->nb,
A(k, N), ldak, A(k, N), ldak,
D(k, N), ldak ); D(k, N), lddk );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamLower, tempkmin, tempNn, ChamLower, tempkmin, tempNn,
0., 1., 0., 1.,
D(k, N), ldak ); D(k, N), lddk );
#endif #endif
} }
for (n = 0; n < B->nt; n++) { for (n = 0; n < B->nt; n++) {
...@@ -120,7 +122,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans ...@@ -120,7 +122,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
side, trans, side, trans,
tempNn, tempnn, tempNn, tempnn,
tempkmin, ib, T->nb, tempkmin, ib, T->nb,
D(k, N), ldak, D(k, N), lddk,
T(k, N), T->mb, T(k, N), T->mb,
B(N, n), ldbN); B(N, n), ldbN);
} }
...@@ -196,6 +198,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans ...@@ -196,6 +198,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
ldak = BLKLDD(A, k); ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
lastRD = 0; lastRD = 0;
for (RD = BS; RD < A->nt-k; RD *= 2) for (RD = BS; RD < A->nt-k; RD *= 2)
lastRD = RD; lastRD = RD;
...@@ -259,13 +262,13 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans ...@@ -259,13 +262,13 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
&options, &options,
ChamUpper, tempkmin, tempNn, A->nb, ChamUpper, tempkmin, tempNn, A->nb,
A(k, N), ldak, A(k, N), ldak,
D(k, N), ldak ); D(k, N), lddk );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamLower, tempkmin, tempNn, ChamLower, tempkmin, tempNn,
0., 1., 0., 1.,
D(k, N), ldak ); D(k, N), lddk );
#endif #endif
} }
for (n = 0; n < B->nt; n++) { for (n = 0; n < B->nt; n++) {
...@@ -279,7 +282,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans ...@@ -279,7 +282,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
side, trans, side, trans,
tempNn, tempnn, tempNn, tempnn,
tempkmin, ib, T->nb, tempkmin, ib, T->nb,
D(k, N), ldak, D(k, N), lddk,
T(k, N), T->mb, T(k, N), T->mb,
B(N, n), ldbN); B(N, n), ldbN);
} }
...@@ -300,6 +303,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans ...@@ -300,6 +303,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
ldak = BLKLDD(A, k); ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
lastRD = 0; lastRD = 0;
for (RD = BS; RD < A->nt-k; RD *= 2) for (RD = BS; RD < A->nt-k; RD *= 2)
lastRD = RD; lastRD = RD;
...@@ -361,13 +365,13 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans ...@@ -361,13 +365,13 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
&options, &options,
ChamUpper, tempkmin, tempNn, A->nb, ChamUpper, tempkmin, tempNn, A->nb,
A(k, N), ldak, A(k, N), ldak,
D(k, N), ldak ); D(k, N), lddk );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamLower, tempkmin, tempNn, ChamLower, tempkmin, tempNn,
0., 1., 0., 1.,
D(k, N), ldak ); D(k, N), lddk );
#endif #endif
} }
for (m = 0; m < B->mt; m++) { for (m = 0; m < B->mt; m++) {
...@@ -382,7 +386,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans ...@@ -382,7 +386,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
side, trans, side, trans,
tempmm, tempNn, tempmm, tempNn,
tempkmin, ib, T->nb, tempkmin, ib, T->nb,
D(k, N), ldak, D(k, N), lddk,
T(k, N), T->mb, T(k, N), T->mb,
B(m, N), ldbm); B(m, N), ldbm);
} }
...@@ -401,6 +405,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans ...@@ -401,6 +405,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
ldak = BLKLDD(A, k); ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
for (N = k; N < A->nt; N += BS) { for (N = k; N < A->nt; N += BS) {
tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb; tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb;
tempkmin = chameleon_min(tempkm,tempNn); tempkmin = chameleon_min(tempkm,tempNn);
...@@ -409,13 +414,13 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans ...@@ -409,13 +414,13 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
&options, &options,
ChamUpper, tempkmin, tempNn, A->nb, ChamUpper, tempkmin, tempNn, A->nb,
A(k, N), ldak, A(k, N), ldak,
D(k, N), ldak ); D(k, N), lddk );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamLower, tempkmin, tempNn, ChamLower, tempkmin, tempNn,
0., 1., 0., 1.,
D(k, N), ldak ); D(k, N), lddk );
#endif #endif
} }
for (m = 0; m < B->mt; m++) { for (m = 0; m < B->mt; m++) {
...@@ -426,7 +431,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans ...@@ -426,7 +431,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
side, trans, side, trans,
tempmm, tempNn, tempmm, tempNn,
tempkmin, ib, T->nb, tempkmin, ib, T->nb,
D(k, N), ldak, D(k, N), lddk,
T(k, N), T->mb, T(k, N), T->mb,
B(m, N), ldbm); B(m, N), ldbm);
} }
......
...@@ -44,7 +44,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, ...@@ -44,7 +44,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans,
size_t ws_host = 0; size_t ws_host = 0;
int k, m, n; int k, m, n;
int ldak, ldbk, ldam, ldan, ldbm; int ldak, ldbk, ldam, ldan, ldbm, lddk;
int tempkm, tempnn, tempkmin, tempmm, tempkn; int tempkm, tempnn, tempkmin, tempmm, tempkn;
int ib, minMT, minM; int ib, minMT, minM;
...@@ -99,19 +99,20 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, ...@@ -99,19 +99,20 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans,
tempkm = k == B->mt-1 ? B->m-k*B->mb : B->mb; tempkm = k == B->mt-1 ? B->m-k*B->mb : B->mb;
tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb; tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb;
ldak = BLKLDD(A, k); ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
ldbk = BLKLDD(B, k); ldbk = BLKLDD(B, k);
if ( genD ) { if ( genD ) {
INSERT_TASK_zlacpy( INSERT_TASK_zlacpy(
&options, &options,
ChamLower, tempkm, tempkmin, A->nb, ChamLower, tempkm, tempkmin, A->nb,
A(k, k), ldak, A(k, k), ldak,
D(k), ldak ); D(k), lddk );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamUpper, tempkm, tempkmin, ChamUpper, tempkm, tempkmin,
0., 1., 0., 1.,
D(k), ldak ); D(k), lddk );
#endif #endif
} }
for (n = 0; n < B->nt; n++) { for (n = 0; n < B->nt; n++) {
...@@ -120,7 +121,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, ...@@ -120,7 +121,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans,
&options, &options,
side, trans, side, trans,
tempkm, tempnn, tempkmin, ib, T->nb, tempkm, tempnn, tempkmin, ib, T->nb,
D(k), ldak, D(k), lddk,
T(k, k), T->mb, T(k, k), T->mb,
B(k, n), ldbk); B(k, n), ldbk);
} }
......
...@@ -42,7 +42,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, ...@@ -42,7 +42,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
size_t ws_host = 0; size_t ws_host = 0;
int k, m, n, i, p; int k, m, n, i, p;
int ldam, ldan, ldbm, ldbp; int ldam, ldan, ldbm, ldbp, lddn, lddm;
int tempnn, tempkmin, tempmm, tempkn; int tempnn, tempkmin, tempmm, tempkn;
int ib, K, L; int ib, K, L;
int *tiles; int *tiles;
...@@ -102,6 +102,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, ...@@ -102,6 +102,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
tempkmin = chameleon_min(tempmm, tempkn); tempkmin = chameleon_min(tempmm, tempkn);
ldam = BLKLDD(A, m); ldam = BLKLDD(A, m);
lddm = BLKLDD(D, m);
ldbm = BLKLDD(B, m); ldbm = BLKLDD(B, m);
if ( genD ) { if ( genD ) {
...@@ -109,13 +110,13 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, ...@@ -109,13 +110,13 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
&options, &options,
ChamLower, tempmm, tempkmin, A->nb, ChamLower, tempmm, tempkmin, A->nb,
A(m, k), ldam, A(m, k), ldam,
D(m, k), ldam ); D(m, k), lddm );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamUpper, tempmm, tempkmin, ChamUpper, tempmm, tempkmin,
0., 1., 0., 1.,
D(m, k), ldam ); D(m, k), lddm );
#endif #endif
} }
for (n = 0; n < B->nt; n++) { for (n = 0; n < B->nt; n++) {
...@@ -124,7 +125,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, ...@@ -124,7 +125,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
&options, &options,
side, trans, side, trans,
tempmm, tempnn, tempkmin, ib, T->nb, tempmm, tempnn, tempkmin, ib, T->nb,
D(m, k), ldam, D(m, k), lddm,
T(m, k), T->mb, T(m, k), T->mb,
B(m, n), ldbm); B(m, n), ldbm);
} }
...@@ -243,6 +244,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, ...@@ -243,6 +244,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
tempkmin = chameleon_min(tempmm, tempkn); tempkmin = chameleon_min(tempmm, tempkn);
ldam = BLKLDD(A, m); ldam = BLKLDD(A, m);
lddm = BLKLDD(D, m);
ldbm = BLKLDD(B, m); ldbm = BLKLDD(B, m);
if ( genD ) { if ( genD ) {
...@@ -250,13 +252,13 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, ...@@ -250,13 +252,13 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
&options, &options,
ChamLower, tempmm, tempkmin, A->nb, ChamLower, tempmm, tempkmin, A->nb,
A(m, k), ldam, A(m, k), ldam,
D(m, k), ldam ); D(m, k), lddm );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamUpper, tempmm, tempkmin, ChamUpper, tempmm, tempkmin,
0., 1., 0., 1.,
D(m, k), ldam ); D(m, k), lddm );
#endif #endif
} }
for (n = 0; n < B->nt; n++) { for (n = 0; n < B->nt; n++) {
...@@ -269,7 +271,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, ...@@ -269,7 +271,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
&options, &options,
side, trans, side, trans,
tempmm, tempnn, tempkmin, ib, T->nb, tempmm, tempnn, tempkmin, ib, T->nb,
D(m, k), ldam, D(m, k), lddm,
T(m, k), T->mb, T(m, k), T->mb,
B(m, n), ldbm); B(m, n), ldbm);
} }
...@@ -342,19 +344,20 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, ...@@ -342,19 +344,20 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb;
tempkmin = chameleon_min(tempnn, tempkn); tempkmin = chameleon_min(tempnn, tempkn);
ldan = BLKLDD(A, n); ldan = BLKLDD(A, n);
lddn = BLKLDD(D, n);
if ( genD ) { if ( genD ) {
INSERT_TASK_zlacpy( INSERT_TASK_zlacpy(
&options, &options,
ChamLower, tempnn, tempkmin, A->nb, ChamLower, tempnn, tempkmin, A->nb,
A(n, k), ldan, A(n, k), ldan,
D(n, k), ldan ); D(n, k), lddn );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamUpper, tempnn, tempkmin, ChamUpper, tempnn, tempkmin,
0., 1., 0., 1.,
D(n, k), ldan ); D(n, k), lddn );
#endif #endif
} }
for (m = 0; m < B->mt; m++) { for (m = 0; m < B->mt; m++) {
...@@ -368,7 +371,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, ...@@ -368,7 +371,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
&options, &options,
side, trans, side, trans,
tempmm, tempnn, tempkmin, ib, T->nb, tempmm, tempnn, tempkmin, ib, T->nb,
D(n, k), ldan, D(n, k), lddn,
T(n, k), T->mb, T(n, k), T->mb,
B(m, n), ldbm); B(m, n), ldbm);
} }
...@@ -394,19 +397,20 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, ...@@ -394,19 +397,20 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb;
tempkmin = chameleon_min(tempnn, tempkn); tempkmin = chameleon_min(tempnn, tempkn);
ldan = BLKLDD(A, n); ldan = BLKLDD(A, n);
lddn = BLKLDD(D, n);
if ( genD ) { if ( genD ) {
INSERT_TASK_zlacpy( INSERT_TASK_zlacpy(
&options, &options,
ChamLower, tempnn, tempkmin, A->nb, ChamLower, tempnn, tempkmin, A->nb,
A(n, k), ldan, A(n, k), ldan,
D(n, k), ldan ); D(n, k), lddn );
#if defined(CHAMELEON_USE_CUDA) #if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset( INSERT_TASK_zlaset(
&options, &options,
ChamUpper, tempnn, tempkmin, ChamUpper, tempnn, tempkmin,
0., 1., 0., 1.,
D(n, k), ldan ); D(n, k), lddn );
#endif #endif
} }
for (m = 0; m < B->mt; m++) { for (m = 0; m < B->mt; m++) {
...@@ -416,7 +420,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, ...@@ -416,7 +420,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
&options, &options,
side, trans, side, trans,
tempmm, tempnn, tempkmin, ib, T->nb, tempmm, tempnn, tempkmin, ib, T->nb,
D(n, k), ldan, D(n, k), lddn,
T(n, k), T->mb, T(n, k), T->mb,
B(m, n), ldbm); B(m, n), ldbm);
} }
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment