Commit ec70a1f6 authored by Mathieu Faverge

Fix diagonal tile size problem: use the leading dimension of D and the actual tile sizes when copying tiles into D

parent ddfe2c36
......@@ -49,7 +49,7 @@ if(NOT BUILDNAME)
if(CHAMELEON_SCHED_PARSEC)
set(BUILDNAME "${BUILDNAME}-PaRSEC")
endif(CHAMELEON_SCHED_STARPU)
endif(CHAMELEON_SCHED_PARSEC)
if(CHAMELEON_SIMULATION)
set(BUILDNAME "${BUILDNAME}-SimGrid")
......
......@@ -41,7 +41,7 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
size_t ws_host = 0;
int k, m, n;
int ldak, ldam;
int ldak, ldam, lddk;
int tempkm, tempkn, tempmm, tempnn;
int ib, minMNT;
......@@ -92,6 +92,7 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
INSERT_TASK_zgelqt(
&options,
tempkm, tempkn, ib, T->nb,
......@@ -100,15 +101,15 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamUpper, A->mb, A->nb, A->nb,
ChamUpper, tempkm, tempkn, A->nb,
A(k, k), ldak,
D(k), ldak );
D(k), lddk );
#if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset(
&options,
ChamLower, A->mb, A->nb,
ChamLower, tempkm, tempkn,
0., 1.,
D(k), ldak );
D(k), lddk );
#endif
}
for (m = k+1; m < A->mt; m++) {
......@@ -118,7 +119,7 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
&options,
ChamRight, ChamConjTrans,
tempmm, tempkn, tempkn, ib, T->nb,
D(k), ldak,
D(k), lddk,
T(k, k), T->mb,
A(m, k), ldam);
}
......
......@@ -42,7 +42,7 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
int k, m, n, i, p;
int K, L;
int ldak, ldam;
int ldak, ldam, lddk;
int tempkmin, tempkm, tempnn, tempmm, temppn;
int ib;
int *tiles;
......@@ -96,6 +96,7 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
T = TS;
/* The number of geqrt to apply */
......@@ -114,13 +115,13 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
&options,
ChamUpper, tempkm, temppn, A->nb,
A(k, p), ldak,
D(k, p), ldak );
D(k, p), lddk );
#if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset(
&options,
ChamLower, tempkm, temppn,
0., 1.,
D(k, p), ldak );
D(k, p), lddk );
#endif
}
for (m = k+1; m < A->mt; m++) {
......@@ -130,7 +131,7 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
&options,
ChamRight, ChamConjTrans,
tempmm, temppn, tempkmin, ib, T->nb,
D(k, p), ldak,
D(k, p), lddk,
T(k, p), T->mb,
A(m, p), ldam);
}
......
......@@ -44,7 +44,7 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
int k, m, n;
int K, N, RD;
int ldak, ldam;
int ldak, ldam, lddk;
int tempkmin, tempkm, tempNn, tempnn, tempmm, tempNRDn;
int ib;
......@@ -91,6 +91,8 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
for (N = k; N < A->nt; N += BS) {
tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb;
tempkmin = chameleon_min(tempkm, tempNn);
......@@ -104,13 +106,13 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
&options,
ChamUpper, tempkm, tempNn, A->nb,
A(k, N), ldak,
D(k, N), ldak );
D(k, N), lddk );
#if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset(
&options,
ChamLower, tempkm, tempNn,
0., 1.,
D(k, N), ldak );
D(k, N), lddk );
#endif
}
for (m = k+1; m < A->mt; m++) {
......@@ -120,7 +122,7 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
&options,
ChamRight, ChamConjTrans,
tempmm, tempNn, tempkmin, ib, T->nb,
D(k, N), ldak,
D(k, N), lddk,
T(k, N), T->mb,
A(m, N), ldam);
}
......
......@@ -41,7 +41,7 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
size_t ws_host = 0;
int k, m, n;
int ldak, ldam;
int ldak, ldam, lddk;
int tempkm, tempkn, tempnn, tempmm;
int ib;
int minMNT = chameleon_min(A->mt, A->nt);
......@@ -87,6 +87,7 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
INSERT_TASK_zgeqrt(
&options,
tempkm, tempkn, ib, T->nb,
......@@ -95,15 +96,15 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamLower, A->mb, A->nb, A->nb,
ChamLower, tempkm, tempkn, A->nb,
A(k, k), ldak,
D(k), ldak );
D(k), lddk );
#if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset(
&options,
ChamUpper, A->mb, A->nb,
ChamUpper, tempkm, tempkn,
0., 1.,
D(k), ldak );
D(k), lddk );
#endif
}
for (n = k+1; n < A->nt; n++) {
......@@ -112,7 +113,7 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
&options,
ChamLeft, ChamConjTrans,
tempkm, tempnn, tempkm, ib, T->nb,
D(k), ldak,
D(k), lddk,
T(k, k), T->mb,
A(k, n), ldak);
}
......
......@@ -41,7 +41,7 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
int k, m, n, i, p;
int K, L, nbgeqrt;
int ldap, ldam;
int ldap, ldam, lddm;
int tempkmin, tempkn, tempnn, tempmm;
int ib;
int *tiles;
......@@ -97,6 +97,7 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
tempkmin = chameleon_min(tempmm, tempkn);
ldam = BLKLDD(A, m);
lddm = BLKLDD(D, m);
T = TS;
......@@ -108,15 +109,15 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamLower, tempmm, A->nb, A->nb,
ChamLower, tempmm, tempkn, A->nb,
A(m, k), ldam,
D(m, k), ldam );
D(m, k), lddm );
#if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset(
&options,
ChamUpper, tempmm, A->nb,
ChamUpper, tempmm, tempkn,
0., 1.,
D(m, k), ldam );
D(m, k), lddm );
#endif
}
for (n = k+1; n < A->nt; n++) {
......@@ -125,7 +126,7 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
&options,
ChamLeft, ChamConjTrans,
tempmm, tempnn, tempkmin, ib, T->nb,
D(m, k), ldam,
D(m, k), lddm,
T(m, k), T->mb,
A(m, n), ldam);
}
......
......@@ -44,7 +44,7 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
int k, m, n;
int K, M, RD;
int ldaM, ldam, ldaMRD;
int ldaM, ldam, ldaMRD, lddM;
int tempkmin, tempkn, tempMm, tempnn, tempmm, tempMRDm;
int ib;
......@@ -92,6 +92,7 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
tempMm = M == A->mt-1 ? A->m-M*A->mb : A->mb;
tempkmin = chameleon_min(tempMm, tempkn);
ldaM = BLKLDD(A, M);
lddM = BLKLDD(D, M);
INSERT_TASK_zgeqrt(
&options,
......@@ -101,15 +102,15 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamLower, tempMm, A->nb, A->nb,
ChamLower, tempMm, tempkn, A->nb,
A(M, k), ldaM,
D(M, k), ldaM );
D(M, k), lddM );
#if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset(
&options,
ChamUpper, tempMm, A->nb,
ChamUpper, tempMm, tempkn,
0., 1.,
D(M, k), ldaM );
D(M, k), lddM );
#endif
}
for (n = k+1; n < A->nt; n++) {
......@@ -118,7 +119,7 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
&options,
ChamLeft, ChamConjTrans,
tempMm, tempnn, tempkmin, ib, T->nb,
D(M, k), ldaM,
D(M, k), lddM,
T(M, k), T->mb,
A(M, n), ldaM);
}
......
This diff is collapsed.
......@@ -44,7 +44,7 @@ void chameleon_pztpgqrt( int genD, int L,
size_t ws_host = 0;
int k, m, n;
int ldvk, ldvm;
int ldvk, ldvm, lddk;
int ldqk, ldqm;
int tempkm, tempkn, tempkk, tempnn, tempmm, templm;
int ib;
......@@ -92,6 +92,7 @@ void chameleon_pztpgqrt( int genD, int L,
tempkk = k == V1->nt-1 ? V1->n-k*V1->nb : V1->nb;
tempkn = k == Q1->nt-1 ? Q1->n-k*Q1->nb : Q1->nb;
ldvk = BLKLDD(V1, k);
lddk = BLKLDD(D, k);
ldqk = BLKLDD(Q1, k);
/* Equivalent to the tsmqr step on Q1,Q2 */
......@@ -139,13 +140,13 @@ void chameleon_pztpgqrt( int genD, int L,
&options,
ChamLower, tempkm, tempkk, V1->nb,
V1(k, k), ldvk,
D(k), ldvk );
D(k), lddk );
#if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset(
&options,
ChamUpper, tempkm, tempkk,
0., 1.,
D(k), ldvk );
D(k), lddk );
#endif
}
for (n = k; n < Q1->nt; n++) {
......@@ -154,7 +155,7 @@ void chameleon_pztpgqrt( int genD, int L,
&options,
ChamLeft, ChamNoTrans,
tempkm, tempnn, tempkk, ib, T1->nb,
D(k), ldvk,
D(k), lddk,
T1(k, k), T1->mb,
Q1(k, n), ldqk);
}
......
......@@ -42,7 +42,7 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T
size_t ws_host = 0;
int k, m, n;
int ldak, ldqm;
int ldak, ldqm, lddk;
int tempnn, tempmm, tempkmin, tempkn;
int tempAkm, tempAkn;
int ib, minMT;
......@@ -94,6 +94,8 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T
tempkmin = chameleon_min( tempAkn, tempAkm );
tempkn = k == Q->nt-1 ? Q->n-k*Q->nb : Q->nb;
ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
for (n = Q->nt-1; n > k; n--) {
tempnn = n == Q->nt-1 ? Q->n-n*Q->nb : Q->nb;
for (m = 0; m < Q->mt; m++) {
......@@ -117,19 +119,18 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T
RUNTIME_data_flush( sequence, T(k, n) );
}
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamUpper, tempkmin, tempkn, A->nb,
A(k, k), ldak,
D(k), ldak );
D(k), lddk );
#if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset(
&options,
ChamLower, tempkmin, tempkn,
0., 1.,
D(k), ldak );
D(k), lddk );
#endif
}
for (m = k; m < Q->mt; m++) {
......@@ -143,7 +144,7 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T
&options,
ChamRight, ChamNoTrans,
tempmm, tempkn, tempkmin, ib, T->nb,
D(k), ldak,
D(k), lddk,
T(k, k), T->mb,
Q(m, k), ldqm);
}
......
......@@ -41,7 +41,7 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
int k, m, n, i, p;
int K, L;
int ldak, ldqm;
int ldak, ldqm, lddk;
int tempkm, tempkmin, temppn, tempnn, tempmm;
int ib;
int *tiles;
......@@ -93,6 +93,7 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
/* Setting the order of the tiles*/
libhqr_walk_stepk(qrtree, k, tiles + (k+1));
......@@ -147,13 +148,13 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
&options,
ChamUpper, tempkmin, temppn, A->nb,
A(k, p), ldak,
D(k, p), ldak );
D(k, p), lddk );
#if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset(
&options,
ChamLower, tempkmin, temppn,
0., 1.,
D(k, p), ldak );
D(k, p), lddk );
#endif
}
for (m = k; m < Q->mt; m++) {
......@@ -167,7 +168,7 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
&options,
ChamRight, ChamNoTrans,
tempmm, temppn, tempkmin, ib, T->nb,
D(k, p), ldak,
D(k, p), lddk,
T(k, p), T->mb,
Q(m, p), ldqm);
}
......
......@@ -46,8 +46,7 @@ void chameleon_pzunglqrh( int genD, int BS,
int k, m, n;
int K, N, RD, lastRD;
int ldak;
int ldqm;
int ldak, lddk, ldqm;
int tempkm, tempkmin, tempNn, tempnn, tempmm, tempNRDn;
int ib;
......@@ -89,6 +88,7 @@ void chameleon_pzunglqrh( int genD, int BS,
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
lastRD = 0;
for (RD = BS; RD < A->nt-k; RD *= 2)
lastRD = RD;
......@@ -154,13 +154,13 @@ void chameleon_pzunglqrh( int genD, int BS,
&options,
ChamUpper, tempkmin, tempNn, A->nb,
A(k, N), ldak,
D(k, N), ldak );
D(k, N), lddk );
#if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset(
&options,
ChamLower, tempkmin, tempNn,
0., 1.,
D(k, N), ldak );
D(k, N), lddk );
#endif
}
for (m = k; m < Q->mt; m++) {
......@@ -175,7 +175,7 @@ void chameleon_pzunglqrh( int genD, int BS,
ChamRight, ChamNoTrans,
tempmm, tempNn,
tempkmin, ib, T->nb,
D(k, N), ldak,
D(k, N), lddk,
T(k, N), T->mb,
Q(m, N), ldqm);
}
......
......@@ -43,7 +43,7 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q,
size_t ws_host = 0;
int k, m, n;
int ldak, ldqk, ldam, ldqm;
int ldak, ldqk, ldam, ldqm, lddk;
int tempmm, tempnn, tempkmin, tempkm;
int tempAkm, tempAkn;
int ib, minMT;
......@@ -95,6 +95,7 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q,
tempkmin = chameleon_min( tempAkn, tempAkm );
tempkm = k == Q->mt-1 ? Q->m-k*Q->mb : Q->mb;
ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
ldqk = BLKLDD(Q, k);
for (m = Q->mt - 1; m > k; m--) {
tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb;
......@@ -125,13 +126,13 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q,
&options,
ChamLower, tempkm, tempkmin, A->nb,
A(k, k), ldak,
D(k), ldak );
D(k), lddk );
#if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset(
&options,
ChamUpper, tempkm, tempkmin,
0., 1.,
D(k), ldak );
D(k), lddk );
#endif
}
for (n = k; n < Q->nt; n++) {
......@@ -145,7 +146,7 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q,
&options,
ChamLeft, ChamNoTrans,
tempkm, tempnn, tempkmin, ib, T->nb,
D(k), ldak,
D(k), lddk,
T(k, k), T->mb,
Q(k, n), ldqk);
}
......
......@@ -41,7 +41,7 @@ void chameleon_pzungqr_param( int genD, const libhqr_tree_t *qrtree,
size_t ws_host = 0;
int k, m, n, i, p, L;
int ldam, ldqm, ldqp;
int ldam, ldqm, ldqp, lddm;
int tempmm, tempnn, tempkmin, tempkn;
int ib, minMT;
int *tiles;
......@@ -145,6 +145,7 @@ void chameleon_pzungqr_param( int genD, const libhqr_tree_t *qrtree,
tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
tempkmin = chameleon_min(tempmm, tempkn);
ldam = BLKLDD(A, m);
lddm = BLKLDD(D, m);
ldqm = BLKLDD(Q, m);
if ( genD ) {
......@@ -152,13 +153,13 @@ void chameleon_pzungqr_param( int genD, const libhqr_tree_t *qrtree,
&options,
ChamLower, tempmm, tempkmin, A->nb,
A(m, k), ldam,
D(m, k), ldam );
D(m, k), lddm );
#if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset(
&options,
ChamUpper, tempmm, tempkmin,
0., 1.,
D(m, k), ldam );
D(m, k), lddm );
#endif
}
......@@ -173,7 +174,7 @@ void chameleon_pzungqr_param( int genD, const libhqr_tree_t *qrtree,
&options,
ChamLeft, ChamNoTrans,
tempmm, tempnn, tempkmin, ib, T->nb,
D(m, k), ldam,
D(m, k), lddm,
T(m, k), T->mb,
Q(m, n), ldqm);
}
......
......@@ -48,7 +48,7 @@ void chameleon_pzungqrrh( int genD, int BS,
int k, m, n;
int K, M, RD, lastRD;
int ldaM, ldam, ldaMRD;
int ldaM, ldam, ldaMRD, lddM;
int ldqM, ldqm, ldqMRD;
int tempkn, tempMm, tempnn, tempmm, tempMRDm, tempkmin;
int ib;
......@@ -127,6 +127,7 @@ void chameleon_pzungqrrh( int genD, int BS,
tempMm = M == A->mt-1 ? A->m-M*A->mb : A->mb;
tempkmin = chameleon_min(tempMm, tempkn);
ldaM = BLKLDD(A, M);
lddM = BLKLDD(D, M);
ldqM = BLKLDD(Q, M);
for (m = chameleon_min(M+BS, A->mt)-1; m > M; m--) {
tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
......@@ -160,13 +161,13 @@ void chameleon_pzungqrrh( int genD, int BS,
&options,
ChamLower, tempMm, tempkmin, A->nb,
A(M, k), ldaM,
D(M, k), ldaM );
D(M, k), lddM );
#if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset(
&options,
ChamUpper, tempMm, tempkmin,
0., 1.,
D(M, k), ldaM );
D(M, k), lddM );
#endif
}
for (n = k; n < Q->nt; n++) {
......@@ -181,7 +182,7 @@ void chameleon_pzungqrrh( int genD, int BS,
ChamLeft, ChamNoTrans,
tempMm, tempnn,
tempkmin, ib, T->nb,
D(M, k), ldaM,
D(M, k), lddM,
T(M, k), T->mb,
Q(M, n), ldqM);
}
......
......@@ -44,7 +44,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
size_t ws_host = 0;
int k, m, n;
int ldak, ldbk, ldbm;
int ldak, ldbk, ldbm, lddk;
int tempmm, tempnn, tempkn, tempkm, tempkmin;
int ib, minMT, minM;
......@@ -97,21 +97,24 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
RUNTIME_iteration_push(chamctxt, k);
tempkm = k == B->mt-1 ? B->m-k*B->mb : B->mb;
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb;
ldak = BLKLDD(A, k);
ldbk = BLKLDD(B, k);
lddk = BLKLDD(D, k);
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamUpper, tempkmin, tempkm, A->nb,
ChamUpper, tempkmin, tempkn, A->nb,
A(k, k), ldak,
D(k), ldak );
D(k), lddk );
#if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset(
&options,
ChamLower, tempkmin, tempkm,
ChamLower, tempkmin, tempkn,
0., 1.,
D(k), ldak );
D(k), lddk );
#endif
}
for (n = 0; n < B->nt; n++) {
......@@ -120,7 +123,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
&options,
side, trans,
tempkm, tempnn, tempkmin, ib, T->nb,
D(k), ldak,
D(k), lddk,
T(k, k), T->mb,
B(k, n), ldbk);
}
......@@ -168,10 +171,13 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
for (k = minMT-1; k >= 0; k--) {
RUNTIME_iteration_push(chamctxt, k);
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
tempkm = k == B->mt-1 ? B->m-k*B->mb : B->mb;
tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb;
ldak = BLKLDD(A, k);
ldbk = BLKLDD(B, k);
lddk = BLKLDD(D, k);
for (m = B->mt-1; m > k; m--) {
tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb;
ldbm = BLKLDD(B, m);
......@@ -198,15 +204,15 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamUpper, tempkmin, tempkm, A->nb,
ChamUpper, tempkmin, tempkn, A->nb,
A(k, k), ldak,
D(k), ldak );
D(k), lddk );
#if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset(
&options,
ChamLower, tempkmin, tempkm,
ChamLower, tempkmin, tempkn,
0., 1.,
D(k), ldak );
D(k), lddk );
#endif
}
for (n = 0; n < B->nt; n++) {
......@@ -219,7 +225,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
&options,
side, trans,
tempkm, tempnn, tempkmin, ib, T->nb,
D(k), ldak,
D(k), lddk,
T(k, k), T->mb,
B(k, n), ldbk);
}
......@@ -240,6 +246,8 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
tempkn = k == B->nt - 1 ? B->n - k * B->nb : B->nb;
tempkmin = k == minMT - 1 ? minM - k * A->nb : A->nb;
ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
for (n = B->nt-1; n > k; n--) {
tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb;
for (m = 0; m < B->mt; m++) {
......@@ -268,13 +276,13 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
&options,
ChamUpper, tempkmin, tempkn, A->nb,
A(k, k), ldak,
D(k), ldak );
D(k), lddk );
#if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset(
&options,
ChamLower, tempkmin, tempkn,
0., 1.,
D(k), ldak );
D(k), lddk );
#endif
}
for (m = 0; m < B->mt; m++) {
......@@ -288,7 +296,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
&options,
side, trans,
tempmm, tempkn, tempkmin, ib, T->nb,
D(k), ldak,
D(k), lddk,
T(k, k), T->mb,
B(m, k), ldbm);
}
......@@ -309,18 +317,20 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
tempkn = k == B->nt-1 ? B->n-k*B->nb : B->nb;
tempkmin = k == minMT-1 ? minM-k*A->mb : A->mb;
ldak = BLKLDD(A, k);
lddk = BLKLDD(D, k);
if ( genD ) {
INSERT_TASK_zlacpy(
&options,
ChamUpper, tempkmin, tempkn, A->nb,
A(k, k), ldak,
D(k), ldak );
D(k), lddk );
#if defined(CHAMELEON_USE_CUDA)
INSERT_TASK_zlaset(
&options,
ChamLower, tempkmin, tempkn,
0., 1.,
D(k), ldak );