From 250e7c55dec577869a33c57dece49be722b752d1 Mon Sep 17 00:00:00 2001 From: Mathieu Faverge <mathieu.faverge@inria.fr> Date: Tue, 8 Oct 2019 10:30:11 +0200 Subject: [PATCH] Hotfix - QR/LQ --- compute/pzgelqf.c | 6 +- compute/pzgelqf_param.c | 7 +- compute/pzgelqfrh.c | 7 +- compute/pzgeqrf.c | 6 +- compute/pzgeqrf_param.c | 7 +- compute/pzgeqrfrh.c | 7 +- compute/pzgetrf_incpiv.c | 11 +- compute/pzhetrd_he2hb.c | 51 ++-- compute/pzunglq.c | 8 +- compute/pzunglq_param.c | 9 +- compute/pzunglqrh.c | 6 +- compute/pzungqr.c | 6 +- compute/pzungqr_param.c | 5 +- compute/pzungqrrh.c | 5 +- compute/pzunmlq.c | 22 +- compute/pzunmlq_param.c | 20 +- compute/pzunmlqrh.c | 23 +- compute/pzunmqr.c | 27 ++- compute/pzunmqr_param.c | 28 ++- compute/pzunmqrrh.c | 24 +- compute/zgels.c | 253 +++++++++++++------- compute/zgels_param.c | 226 ++++++++++------- compute/zunmlq.c | 1 - compute/zunmqr.c | 1 - runtime/starpu/control/runtime_descriptor.c | 6 +- 25 files changed, 492 insertions(+), 280 deletions(-) diff --git a/compute/pzgelqf.c b/compute/pzgelqf.c index 64a962bba..a85cad0fd 100644 --- a/compute/pzgelqf.c +++ b/compute/pzgelqf.c @@ -100,15 +100,17 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D A(k, k), ldak, T(k, k), T->mb); if ( genD ) { + int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb; + int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkm, tempkn, A->nb, + ChamUpper, tempDkm, tempDkn, A->nb, A(k, k), ldak, D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkm, tempkn, + ChamLower, tempDkm, tempDkn, 0., 1., D(k), lddk ); #endif diff --git a/compute/pzgelqf_param.c b/compute/pzgelqf_param.c index a5284b7c9..511125da4 100644 --- a/compute/pzgelqf_param.c +++ b/compute/pzgelqf_param.c @@ -108,15 +108,18 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t T(k, p), T->mb); if ( genD ) { + int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb; + int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb; + INSERT_TASK_zlacpy( &options, - ChamUpper, tempkm, temppn, A->nb, + ChamUpper, tempDkm, tempDpn, A->nb, A(k, p), ldak, D(k, p), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkm, temppn, + ChamLower, tempDkm, tempDpn, 0., 1., D(k, p), lddk ); #endif diff --git a/compute/pzgelqfrh.c b/compute/pzgelqfrh.c index e4c50ca6a..8eb69f56a 100644 --- a/compute/pzgelqfrh.c +++ b/compute/pzgelqfrh.c @@ -103,15 +103,18 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM A(k, N), ldak, T(k, N), T->mb); if ( genD ) { + int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb; + int tempDNn = N == D->nt-1 ? D->n-N*D->nb : D->nb; + INSERT_TASK_zlacpy( &options, - ChamUpper, tempkm, tempNn, A->nb, + ChamUpper, tempDkm, tempDNn, A->nb, A(k, N), ldak, D(k, N), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkm, tempNn, + ChamLower, tempDkm, tempDNn, 0., 1., D(k, N), lddk ); #endif diff --git a/compute/pzgeqrf.c b/compute/pzgeqrf.c index 6d6d3c60b..3f92fa392 100644 --- a/compute/pzgeqrf.c +++ b/compute/pzgeqrf.c @@ -95,15 +95,17 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D A(k, k), ldak, T(k, k), T->mb); if ( genD ) { + int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb; + int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamLower, tempkm, tempkn, A->nb, + ChamLower, tempDkm, tempDkn, A->nb, A(k, k), ldak, D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempkm, tempkn, + ChamUpper, tempDkm, tempDkn, 0., 1., D(k), lddk ); #endif diff --git a/compute/pzgeqrf_param.c b/compute/pzgeqrf_param.c index bb7955797..ae417f76e 100644 --- a/compute/pzgeqrf_param.c +++ b/compute/pzgeqrf_param.c @@ -110,15 +110,18 @@ void chameleon_pzgeqrf_param( int genD, int K, T(m, k), T->mb); if ( genD ) { + int tempDmm = m == D->mt-1 ? D->m-m*D->mb : D->mb; + int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; + INSERT_TASK_zlacpy( &options, - ChamLower, tempmm, tempkn, A->nb, + ChamLower, tempDmm, tempDkn, A->nb, A(m, k), ldam, D(m, k), lddm ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempmm, tempkn, + ChamUpper, tempDmm, tempDkn, 0., 1., D(m, k), lddm ); #endif diff --git a/compute/pzgeqrfrh.c b/compute/pzgeqrfrh.c index 7f235365d..1182e5bc1 100644 --- a/compute/pzgeqrfrh.c +++ b/compute/pzgeqrfrh.c @@ -101,15 +101,18 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM A(M, k), ldaM, T(M, k), T->mb); if ( genD ) { + int tempDMm = M == D->mt-1 ? D->m-M*D->mb : D->mb; + int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; + INSERT_TASK_zlacpy( &options, - ChamLower, tempMm, tempkn, A->nb, + ChamLower, tempDMm, tempDkn, A->nb, A(M, k), ldaM, D(M, k), lddM ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempMm, tempkn, + ChamUpper, tempDMm, tempDkn, 0., 1., D(M, k), lddM ); #endif diff --git a/compute/pzgetrf_incpiv.c b/compute/pzgetrf_incpiv.c index e4a494c68..642970b62 100644 --- a/compute/pzgetrf_incpiv.c +++ b/compute/pzgetrf_incpiv.c @@ -49,7 +49,7 @@ void chameleon_pzgetrf_incpiv( CHAM_desc_t *A, CHAM_desc_t *L, CHAM_desc_t *D, i size_t ws_host = 0; int k, m, n; - int ldak, ldam; + int ldak, ldam, lddk; int tempkm, tempkn, tempmm, tempnn; int ib; int minMNT = chameleon_min(A->mt, A->nt); @@ -62,6 +62,10 @@ void chameleon_pzgetrf_incpiv( CHAM_desc_t *A, CHAM_desc_t *L, CHAM_desc_t *D, i ib = CHAMELEON_IB; + if ( D == NULL ) { + D = A; + } + /* * zgetrf_incpiv = 0 * zgessm = 0 @@ -81,6 +85,7 @@ void chameleon_pzgetrf_incpiv( CHAM_desc_t *A, CHAM_desc_t *L, CHAM_desc_t *D, i tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; ldak = BLKLDD(A, k); + lddk = BLKLDD(D, k); INSERT_TASK_zgetrf_incpiv( &options, tempkm, tempkn, ib, L->nb, @@ -95,7 +100,7 @@ void chameleon_pzgetrf_incpiv( CHAM_desc_t *A, CHAM_desc_t *L, CHAM_desc_t *D, i &options, ChamUpperLower, tempkm, tempkn, A->nb, A(k, k), ldak, - D(k), ldak); + D(k), lddk); #endif } @@ -106,7 +111,7 @@ void chameleon_pzgetrf_incpiv( CHAM_desc_t *A, CHAM_desc_t *L, CHAM_desc_t *D, i tempkm, tempnn, tempkm, ib, L->nb, IPIV(k, k), L(k, k), L->mb, - D(k), ldak, + D(k), lddk, A(k, n), ldak); } for (m = k+1; m < A->mt; m++) { diff --git a/compute/pzhetrd_he2hb.c b/compute/pzhetrd_he2hb.c index 1ef361ea7..a5b1aeb3c 100644 --- a/compute/pzhetrd_he2hb.c +++ b/compute/pzhetrd_he2hb.c @@ -23,7 +23,7 @@ #define A(m, n) A, m, n #define T(m, n) T, m, n -#define D(k) &D, (k)-1, 0 +#define D(k) &D, (k)-1, 0 #define AT(k) &AT, k, 0 @@ -49,6 +49,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, int k, m, n, i, j; int ldak, ldak1, ldam, ldan, ldaj, ldai; + int lddk, lddk1, lddm, lddn, ldek, ldek1; int tempkm, tempkn, tempmm, tempnn, tempjj; int ib; @@ -98,12 +99,13 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, for (k = 1; k < A->nt; k++){ tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; ldak = BLKLDD(A, k); + lddk = BLKLDD((&D), k); INSERT_TASK_zhe2ge(&options, uplo, tempkn, tempkn, ldak, A(k, k), ldak, - D(k), ldak); + D(k), lddk); } if (uplo == ChamLower) { @@ -113,6 +115,8 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, tempkm = k+1 == A->mt-1 ? A->m-(k+1)*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n- k *A->nb : A->nb; ldak1 = BLKLDD(A, k+1); + lddk1 = BLKLDD((&D), k+1); + ldek1 = BLKLDD(E, k+1); INSERT_TASK_zgeqrt( &options, @@ -125,13 +129,13 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, &options, ChamLower, tempkm, tempkn, A->nb, A(k+1, k), ldak1, - E(k+1, k), ldak1 ); + E(k+1, k), ldek1 ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempkm, tempkn, 0., 1., - E(k+1, k), ldak1 ); + E(k+1, k), ldek1 ); #endif #endif @@ -142,7 +146,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, tempkm, tempkm, ib, A->nb, E(k+1, k), ldak1, T(k+1, k), T->mb, - D(k+1), ldak1); + D(k+1), lddk1); /* RIGHT on the remaining tiles until the bottom */ for (m = k+2; m < A->mt ; m++) { @@ -152,7 +156,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, &options, ChamRight, ChamNoTrans, tempmm, A->nb, tempkm, ib, A->nb, - E(k+1, k), ldak1, + E(k+1, k), ldek1, T(k+1, k), T->mb, A(m, k+1), ldam); } @@ -160,6 +164,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, for (m = k+2; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; ldam = BLKLDD(A, m); + lddm = BLKLDD((&D), m); options.priority = 1; INSERT_TASK_ztsqrt( @@ -222,7 +227,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, &options, ChamLeft, ChamConjTrans, A->mb, A->nb, tempmm, A->nb, A->nb, ib, A->nb, - D(k+1), ldak1, + D(k+1), lddk1, A(m, k+1), ldam, A(m, k), ldam, T(m, k), T->mb); @@ -233,8 +238,8 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, &options, ChamLeft, ChamConjTrans, A->mb, tempmm, tempmm, tempmm, A->nb, ib, A->nb, - AT(m), ldak1, - D(m) , ldam, + AT(m), ldak1, + D(m), lddm, A(m, k), ldam, T(m, k), T->mb); @@ -243,7 +248,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, &options, ChamRight, ChamNoTrans, A->mb, A->nb, A->mb, tempmm, A->nb, ib, A->nb, - D(k+1), ldak1, + D(k+1), lddk1, AT(m) , ldak1, A(m, k), ldam, T(m, k), T->mb); @@ -254,7 +259,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, ChamRight, ChamNoTrans, tempmm, A->nb, tempmm, tempmm, A->nb, ib, A->nb, A(m, k+1), ldam, - D(m) , ldam, + D(m), lddm, A(m, k), ldam, T(m, k), T->mb); options.priority = 0; @@ -270,7 +275,9 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, tempkn = k+1 == A->nt-1 ? A->n-(k+1)*A->nb : A->nb; tempkm = k == A->mt-1 ? A->m- k *A->mb : A->mb; ldak = BLKLDD(A, k); + ldek = BLKLDD(E, k); ldak1 = BLKLDD(A, k+1); + lddk1 = BLKLDD((&D), k+1); INSERT_TASK_zgelqt( &options, tempkm, tempkn, ib, A->nb, @@ -282,13 +289,13 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, &options, ChamUpper, tempkm, tempkn, A->nb, A(k, k+1), ldak, - E(k, k+1), ldak ); + E(k, k+1), ldek ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkm, tempkn, 0., 1., - E(k, k+1), ldak ); + E(k, k+1), ldek ); #endif #endif @@ -297,9 +304,9 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, &options, ChamUpper, tempkn, tempkn, ib, A->nb, - E(k, k+1), ldak, + E(k, k+1), ldek, T(k, k+1), T->mb, - D(k+1), ldak1); + D(k+1), lddk1); /* LEFT on the remaining tiles until the left side */ for (n = k+2; n < A->nt ; n++) { @@ -308,7 +315,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, &options, ChamLeft, ChamNoTrans, A->mb, tempnn, tempkn, ib, A->nb, - E(k, k+1), ldak, + E(k, k+1), ldek, T(k, k+1), T->mb, A(k+1, n ), ldak1); } @@ -316,6 +323,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, for (n = k+2; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; ldan = BLKLDD(A, n); + lddn = BLKLDD((&D), n); options.priority = 1; INSERT_TASK_ztslqt( &options, @@ -375,7 +383,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, &options, ChamRight, ChamConjTrans, A->mb, A->nb, A->mb, tempnn, A->nb, ib, A->nb, - D(k+1), ldak1, + D(k+1), lddk1, A(k+1, n), ldak1, A(k, n), ldak, T(k, n), T->mb); @@ -386,7 +394,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, ChamRight, ChamConjTrans, tempnn, A->nb, tempnn, tempnn, A->nb, ib, A->nb, AT(n), A->mb, - D(n), ldan, + D(n), lddn, A(k, n), ldak, T(k, n), T->mb); @@ -396,7 +404,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, &options, ChamLeft, ChamNoTrans, A->mb, A->nb, tempnn, A->nb, A->nb, ib, A->nb, - D(k+1), ldak1, + D(k+1), lddk1, AT(n), A->mb, A(k, n), ldak, T(k, n), T->mb); @@ -408,7 +416,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, ChamLeft, ChamNoTrans, A->mb, tempnn, tempnn, tempnn, A->nb, ib, A->nb, A(k+1, n), ldak1, - D(n), ldan, + D(n), lddn, A(k, n), ldak, T(k, n), T->mb); } @@ -422,10 +430,11 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, for (k = 1; k < A->nt; k++){ tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; ldak = BLKLDD(A, k); + lddk = BLKLDD((&D), k); INSERT_TASK_zlacpy(&options, uplo, tempkn, tempkn, ldak, - D(k), ldak, + D(k), lddk, A(k, k), ldak); } diff --git a/compute/pzunglq.c b/compute/pzunglq.c index 4ab201e7f..63c3697f1 100644 --- a/compute/pzunglq.c +++ b/compute/pzunglq.c @@ -99,7 +99,7 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T for (n = Q->nt-1; n > k; n--) { tempnn = n == Q->nt-1 ? Q->n-n*Q->nb : Q->nb; - for (m = 0; m < Q->mt; m++) { + for (m = k; m < Q->mt; m++) { tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb; ldqm = BLKLDD(Q, m); @@ -121,15 +121,16 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T } if ( genD ) { + int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempkn, A->nb, + ChamUpper, tempkmin, tempDkn, A->nb, A(k, k), ldak, D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkmin, tempkn, + ChamLower, tempkmin, tempDkn, 0., 1., D(k), lddk ); #endif @@ -138,6 +139,7 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb; ldqm = BLKLDD(Q, m); + /* Restore the original location of the tiles */ RUNTIME_data_migrate( sequence, Q(m, k), Q->get_rankof( Q, m, k ) ); diff --git a/compute/pzunglq_param.c b/compute/pzunglq_param.c index fbc164624..e90345ec7 100644 --- a/compute/pzunglq_param.c +++ b/compute/pzunglq_param.c @@ -53,10 +53,6 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t ib = CHAMELEON_IB; - if (D == NULL) { - D = A; - } - if ( D == NULL ) { D = A; genD = 0; @@ -142,15 +138,16 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t tempkmin = chameleon_min(tempkm, temppn); if ( genD ) { + int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, temppn, A->nb, + ChamUpper, tempkmin, tempDpn, A->nb, A(k, p), ldak, D(k, p), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkmin, temppn, + ChamLower, tempkmin, tempDpn, 0., 1., D(k, p), lddk ); #endif diff --git a/compute/pzunglqrh.c b/compute/pzunglqrh.c index f38171b7f..6bdfcabea 100644 --- a/compute/pzunglqrh.c +++ b/compute/pzunglqrh.c @@ -149,15 +149,17 @@ void chameleon_pzunglqrh( int genD, int BS, } if ( genD ) { + int tempDNn = N == D->nt-1 ? D->n-N*D->nb : D->nb; + INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempNn, A->nb, + ChamUpper, tempkmin, tempDNn, A->nb, A(k, N), ldak, D(k, N), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkmin, tempNn, + ChamLower, tempkmin, tempDNn, 0., 1., D(k, N), lddk ); #endif diff --git a/compute/pzungqr.c b/compute/pzungqr.c index 4964cbb0b..dda7b25a1 100644 --- a/compute/pzungqr.c +++ b/compute/pzungqr.c @@ -123,15 +123,17 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, } if ( genD ) { + int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb; + INSERT_TASK_zlacpy( &options, - ChamLower, tempkm, tempkmin, A->nb, + ChamLower, tempDkm, tempkmin, A->nb, A(k, k), ldak, D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempkm, tempkmin, + ChamUpper, tempDkm, tempkmin, 0., 1., D(k), lddk ); #endif diff --git a/compute/pzungqr_param.c b/compute/pzungqr_param.c index e70c16deb..346848b6f 100644 --- a/compute/pzungqr_param.c +++ b/compute/pzungqr_param.c @@ -143,15 +143,16 @@ void chameleon_pzungqr_param( int genD, int K, ldqm = BLKLDD(Q, m); if ( genD ) { + int tempDmm = m == D->mt-1 ? D->m-m*D->mb : D->mb; INSERT_TASK_zlacpy( &options, - ChamLower, tempmm, tempkmin, A->nb, + ChamLower, tempDmm, tempkmin, A->nb, A(m, k), ldam, D(m, k), lddm ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempmm, tempkmin, + ChamUpper, tempDmm, tempkmin, 0., 1., D(m, k), lddm ); #endif diff --git a/compute/pzungqrrh.c b/compute/pzungqrrh.c index efec2cadd..8310ae1ae 100644 --- a/compute/pzungqrrh.c +++ b/compute/pzungqrrh.c @@ -155,15 +155,16 @@ void chameleon_pzungqrrh( int genD, int BS, } if ( genD ) { + int tempDMm = M == D->mt-1 ? D->m-M*D->mb : D->mb; INSERT_TASK_zlacpy( &options, - ChamLower, tempMm, tempkmin, A->nb, + ChamLower, tempDMm, tempkmin, A->nb, A(M, k), ldaM, D(M, k), lddM ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempMm, tempkmin, + ChamUpper, tempDMm, tempkmin, 0., 1., D(M, k), lddM ); #endif diff --git a/compute/pzunmlq.c b/compute/pzunmlq.c index 1df5d42b1..0aa1054dc 100644 --- a/compute/pzunmlq.c +++ b/compute/pzunmlq.c @@ -98,22 +98,22 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, RUNTIME_iteration_push(chamctxt, k); tempkm = k == B->mt-1 ? B->m-k*B->mb : B->mb; - tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); lddk = BLKLDD(D, k); if ( genD ) { + int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempkn, A->nb, + ChamUpper, tempkmin, tempDkn, A->nb, A(k, k), ldak, D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkmin, tempkn, + ChamLower, tempkmin, tempDkn, 0., 1., D(k), lddk ); #endif @@ -172,7 +172,6 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, for (k = minMT-1; k >= 0; k--) { RUNTIME_iteration_push(chamctxt, k); - tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; tempkm = k == B->mt-1 ? B->m-k*B->mb : B->mb; tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb; ldak = BLKLDD(A, k); @@ -203,15 +202,16 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, RUNTIME_data_flush( sequence, T(k, m) ); } if ( genD ) { + int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempkn, A->nb, + ChamUpper, tempkmin, tempDkn, A->nb, A(k, k), ldak, D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkmin, tempkn, + ChamLower, tempkmin, tempDkn, 0., 1., D(k), lddk ); #endif @@ -273,15 +273,16 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, RUNTIME_data_flush( sequence, T(k, n) ); } if ( genD ) { + int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempkn, A->nb, + ChamUpper, tempkmin, tempDkn, A->nb, A(k, k), ldak, D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkmin, tempkn, + ChamLower, tempkmin, tempDkn, 0., 1., D(k), lddk ); #endif @@ -321,15 +322,16 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, lddk = BLKLDD(D, k); if ( genD ) { + int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempkn, A->nb, + ChamUpper, tempkmin, tempDkn, A->nb, A(k, k), ldak, D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkmin, tempkn, + ChamLower, tempkmin, tempDkn, 0., 1., D(k), lddk ); #endif diff --git a/compute/pzunmlq_param.c b/compute/pzunmlq_param.c index 087cc0f5c..fe122e0b4 100644 --- a/compute/pzunmlq_param.c +++ b/compute/pzunmlq_param.c @@ -106,15 +106,16 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, ldbp = BLKLDD(B, p); if ( genD ) { + int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, temppn, A->nb, + ChamUpper, tempkmin, tempDpn, A->nb, A(k, p), ldak, D(k, p), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkmin, temppn, + ChamLower, tempkmin, tempDpn, 0., 1., D(k, p), lddk ); #endif @@ -245,15 +246,16 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, ldbp = BLKLDD(B, p); if ( genD ) { + int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, temppn, A->nb, + ChamUpper, tempkmin, tempDpn, A->nb, A(k, p), ldak, D(k, p), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkmin, temppn, + ChamLower, tempkmin, tempDpn, 0., 1., D(k, p), lddk ); #endif @@ -341,15 +343,16 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, tempkmin = chameleon_min(tempkm, temppn); if ( genD ) { + int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, temppn, A->nb, + ChamUpper, tempkmin, tempDpn, A->nb, A(k, p), ldak, D(k, p), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkmin, temppn, + ChamLower, tempkmin, tempDpn, 0., 1., D(k, p), lddk ); #endif @@ -396,15 +399,16 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, tempkmin = chameleon_min(tempkm, temppn); if ( genD ) { + int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, temppn, A->nb, + ChamUpper, tempkmin, tempDpn, A->nb, A(k, p), ldak, D(k, p), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkmin, temppn, + ChamLower, tempkmin, tempDpn, 0., 1., D(k, p), lddk ); #endif diff --git a/compute/pzunmlqrh.c b/compute/pzunmlqrh.c index 243469fcf..c3b7d0c1d 100644 --- a/compute/pzunmlqrh.c +++ b/compute/pzunmlqrh.c @@ -102,15 +102,17 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans tempkmin = chameleon_min(tempkm,tempNn); ldbN = BLKLDD(B, N); if ( genD ) { + int tempDNn = N == D->nt-1 ? D->n-N*D->nb : D->nb; + INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempNn, A->nb, + ChamUpper, tempkmin, tempDNn, A->nb, A(k, N), ldak, D(k, N), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkmin, tempNn, + ChamLower, tempkmin, tempDNn, 0., 1., D(k, N), lddk ); #endif @@ -254,15 +256,16 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans RUNTIME_data_flush( sequence, T(k, m) ); } if ( genD ) { + int tempDNn = N == D->nt-1 ? D->n-N*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempNn, A->nb, + ChamUpper, tempkmin, tempDNn, A->nb, A(k, N), ldak, D(k, N), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkmin, tempNn, + ChamLower, tempkmin, tempDNn, 0., 1., D(k, N), lddk ); #endif @@ -339,7 +342,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans node = B->get_rankof( B, m, n ); RUNTIME_data_migrate( sequence, B(m, N), node ); - RUNTIME_data_migrate( sequence, B(m, m), node ); + RUNTIME_data_migrate( sequence, B(m, n), node ); /* TS kernel */ INSERT_TASK_ztpmlqt( @@ -355,15 +358,16 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans RUNTIME_data_flush( sequence, T(k, n) ); } if ( genD ) { + int tempDNn = N == D->nt-1 ? D->n-N*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempNn, A->nb, + ChamUpper, tempkmin, tempDNn, A->nb, A(k, N), ldak, D(k, N), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkmin, tempNn, + ChamLower, tempkmin, tempDNn, 0., 1., D(k, N), lddk ); #endif @@ -404,15 +408,16 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb; tempkmin = chameleon_min(tempkm,tempNn); if ( genD ) { + int tempDNn = N == D->nt-1 ? D->n-N*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempNn, A->nb, + ChamUpper, tempkmin, tempDNn, A->nb, A(k, N), ldak, D(k, N), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkmin, tempNn, + ChamLower, tempkmin, tempDNn, 0., 1., D(k, N), lddk ); #endif diff --git a/compute/pzunmqr.c b/compute/pzunmqr.c index 0da2a1fd3..c872ea0de 100644 --- a/compute/pzunmqr.c +++ b/compute/pzunmqr.c @@ -103,15 +103,17 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, lddk = BLKLDD(D, k); ldbk = BLKLDD(B, k); if ( genD ) { + int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb; + INSERT_TASK_zlacpy( &options, - ChamLower, tempkm, tempkmin, A->nb, + ChamLower, tempDkm, tempkmin, A->nb, A(k, k), ldak, D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempkm, tempkmin, + ChamUpper, tempDkm, tempkmin, 0., 1., D(k), lddk ); #endif @@ -175,6 +177,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); + lddk = BLKLDD(D, k); for (m = B->mt-1; m > k; m--) { tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; ldam = BLKLDD(A, m); @@ -204,13 +207,13 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, &options, ChamLower, tempkm, tempkmin, A->nb, A(k, k), ldak, - D(k), ldak ); + D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempkm, tempkmin, 0., 1., - D(k), ldak ); + D(k), lddk ); #endif } for (n = 0; n < B->nt; n++) { @@ -223,7 +226,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, &options, side, trans, tempkm, tempnn, tempkmin, ib, T->nb, - D(k), ldak, + D(k), lddk, T(k, k), T->mb, B(k, n), ldbk); } @@ -244,6 +247,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, tempkn = k == B->nt - 1 ? B->n - k * B->nb : B->nb; tempkmin = k == minMT - 1 ? minM - k * A->nb : A->nb; ldak = BLKLDD(A, k); + lddk = BLKLDD(D, k); for (n = B->nt-1; n > k; n--) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; ldan = BLKLDD(A, n); @@ -273,13 +277,13 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, &options, ChamLower, tempkn, tempkmin, A->nb, A(k, k), ldak, - D(k), ldak ); + D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempkn, tempkmin, 0., 1., - D(k), ldak ); + D(k), lddk ); #endif } for (m = 0; m < B->mt; m++) { @@ -293,7 +297,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, &options, side, trans, tempmm, tempkn, tempkmin, ib, T->nb, - D(k), ldak, + D(k), lddk, T(k, k), T->mb, B(m, k), ldbm); } @@ -314,18 +318,19 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, tempkn = k == B->nt-1 ? B->n-k*B->nb : B->nb; tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb; ldak = BLKLDD(A, k); + lddk = BLKLDD(D, k); if ( genD ) { INSERT_TASK_zlacpy( &options, ChamLower, tempkn, tempkmin, A->nb, A(k, k), ldak, - D(k), ldak ); + D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempkn, tempkmin, 0., 1., - D(k), ldak ); + D(k), lddk ); #endif } for (m = 0; m < B->mt; m++) { @@ -335,7 +340,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, &options, side, trans, tempmm, tempkn, tempkmin, ib, T->nb, - D(k), ldak, + D(k), lddk, T(k, k), T->mb, B(m, k), ldbm); } diff --git a/compute/pzunmqr_param.c b/compute/pzunmqr_param.c index a771077fa..4b6696da4 100644 --- a/compute/pzunmqr_param.c +++ b/compute/pzunmqr_param.c @@ -106,15 +106,17 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, ldbm = BLKLDD(B, m); if ( genD ) { + int tempDmm = m == D->mt-1 ? D->m-m*D->mb : D->mb; + INSERT_TASK_zlacpy( &options, - ChamLower, tempmm, tempkmin, A->nb, + ChamLower, tempDmm, tempkmin, A->nb, A(m, k), ldam, D(m, k), lddm ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempmm, tempkmin, + ChamUpper, tempDmm, tempkmin, 0., 1., D(m, k), lddm ); #endif @@ -215,7 +217,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, L = tempmm; T = TT; } - for (n = k; n < B->nt; n++) { + for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; node = B->get_rankof( B, m, n ); @@ -246,15 +248,17 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, ldbm = BLKLDD(B, m); if ( genD ) { + int tempDmm = m == D->mt-1 ? D->m-m*D->mb : D->mb; + INSERT_TASK_zlacpy( &options, - ChamLower, tempmm, tempkmin, A->nb, + ChamLower, tempDmm, tempkmin, A->nb, A(m, k), ldam, D(m, k), lddm ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempmm, tempkmin, + ChamUpper, tempDmm, tempkmin, 0., 1., D(m, k), lddm ); #endif @@ -309,7 +313,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, } else { /* TT kernel */ - L = tempmm; + L = A->mb; T = TT; } @@ -344,15 +348,17 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, lddn = BLKLDD(D, n); if ( genD ) { + int tempDnn = n == D->nt-1 ? D->n-n*D->nb : D->nb; + INSERT_TASK_zlacpy( &options, - ChamLower, tempnn, tempkmin, A->nb, + ChamLower, tempDnn, tempkmin, A->nb, A(n, k), ldan, D(n, k), lddn ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempnn, tempkmin, + ChamUpper, tempDnn, tempkmin, 0., 1., D(n, k), lddn ); #endif @@ -397,15 +403,17 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, lddn = BLKLDD(D, n); if ( genD ) { + int tempDnn = n == D->nt-1 ? D->n-n*D->nb : D->nb; + INSERT_TASK_zlacpy( &options, - ChamLower, tempnn, tempkmin, A->nb, + ChamLower, tempDnn, tempkmin, A->nb, A(n, k), ldan, D(n, k), lddn ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempnn, tempkmin, + ChamUpper, tempDnn, tempkmin, 0., 1., D(n, k), lddn ); #endif diff --git a/compute/pzunmqrrh.c b/compute/pzunmqrrh.c index 02a0f2b8e..7ba65c846 100644 --- a/compute/pzunmqrrh.c +++ b/compute/pzunmqrrh.c @@ -102,15 +102,17 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans lddM = BLKLDD(D, M); ldbM = BLKLDD(B, M); if ( genD ) { + int tempDMm = M == D->mt-1 ? D->m-M*D->mb : D->mb; + INSERT_TASK_zlacpy( &options, - ChamLower, tempMm, tempkmin, A->nb, + ChamLower, tempDMm, tempkmin, A->nb, A(M, k), ldaM, D(M, k), lddM ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempMm, tempkmin, + ChamUpper, tempDMm, tempkmin, 0., 1., D(M, k), lddM ); #endif @@ -254,15 +256,17 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans RUNTIME_data_flush( sequence, T(m, k) ); } if ( genD ) { + int tempDMm = M == D->mt-1 ? D->m-M*D->mb : D->mb; + INSERT_TASK_zlacpy( &options, - ChamLower, tempMm, tempkmin, A->nb, + ChamLower, tempDMm, tempkmin, A->nb, A(M, k), ldaM, D(M, k), lddM ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempMm, tempkmin, + ChamUpper, tempDMm, tempkmin, 0., 1., D(M, k), lddM ); #endif @@ -353,15 +357,17 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans RUNTIME_data_flush( sequence, T(n, k) ); } if ( genD ) { + int tempDMm = M == D->mt-1 ? D->m-M*D->mb : D->mb; + INSERT_TASK_zlacpy( &options, - ChamLower, tempMm, tempkmin, A->nb, + ChamLower, tempDMm, tempkmin, A->nb, A(M, k), ldaM, D(M, k), lddM ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempMm, tempkmin, + ChamUpper, tempDMm, tempkmin, 0., 1., D(M, k), lddM ); #endif @@ -401,15 +407,17 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans ldaM = BLKLDD(A, M); lddM = BLKLDD(D, M); if ( genD ) { + int tempDMm = M == D->mt-1 ? D->m-M*D->mb : D->mb; + INSERT_TASK_zlacpy( &options, - ChamLower, tempMm, tempkmin, A->nb, + ChamLower, tempDMm, tempkmin, A->nb, A(M, k), ldaM, D(M, k), lddM ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempMm, tempkmin, + ChamUpper, tempDMm, tempkmin, 0., 1., D(M, k), lddM ); #endif diff --git a/compute/zgels.c b/compute/zgels.c index 2aed98e7d..b132610a8 100644 --- a/compute/zgels.c +++ b/compute/zgels.c @@ -12,8 +12,6 @@ * @brief Chameleon zgels wrappers * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Jakub Kurzak * @author Mathieu Faverge * @author Emmanuel Agullo @@ -26,22 +24,24 @@ #include <stdlib.h> /** + ******************************************************************************* * * @ingroup CHAMELEON_Complex64_t * - * CHAMELEON_zgels - solves overdetermined or underdetermined linear systems involving an M-by-N - * matrix A using the QR or the LQ factorization of A. It is assumed that A has full rank. - * The following options are provided: + * CHAMELEON_zgels - solves overdetermined or underdetermined linear systems + * involving an M-by-N matrix A using the QR or the LQ factorization of A. It + * is assumed that A has full rank. The following options are provided: * - * # trans = ChamNoTrans and M >= N: find the least squares solution of an overdetermined - * system, i.e., solve the least squares problem: minimize || B - A*X ||. + * # trans = ChamNoTrans and M >= N: find the least squares solution of an + * overdetermined system, i.e., solve the least squares problem: minimize + * || B - A*X ||. * - * # trans = ChamNoTrans and M < N: find the minimum norm solution of an underdetermined - * system A * X = B. + * # trans = ChamNoTrans and M < N: find the minimum norm solution of an + * underdetermined system A * X = B. * - * Several right hand side vectors B and solution vectors X can be handled in a single call; - * they are stored as the columns of the M-by-NRHS right hand side matrix B and the N-by-NRHS - * solution matrix X. + * Several right hand side vectors B and solution vectors X can be handled in a + * single call; they are stored as the columns of the M-by-NRHS right hand side + * matrix B and the N-by-NRHS solution matrix X. * ******************************************************************************* * @@ -49,7 +49,6 @@ * Intended usage: * = ChamNoTrans: the linear system involves A; * = ChamConjTrans: the linear system involves A^H. - * Currently only ChamNoTrans is supported. * * @param[in] M * The number of rows of the matrix A. M >= 0. @@ -58,16 +57,15 @@ * The number of columns of the matrix A. N >= 0. * * @param[in] NRHS - * The number of right hand sides, i.e., the number of columns of the matrices B and X. - * NRHS >= 0. + * The number of right hand sides, i.e., the number of columns of the + * matrices B and X. NRHS >= 0. * * @param[in,out] A * On entry, the M-by-N matrix A. - * On exit, - * if M >= N, A is overwritten by details of its QR factorization as returned by - * CHAMELEON_zgeqrf; - * if M < N, A is overwritten by details of its LQ factorization as returned by - * CHAMELEON_zgelqf. + * On exit, if M >= N, A is overwritten by details of its QR + * factorization as returned by CHAMELEON_zgeqrf; if M < N, A is + * overwritten by details of its LQ factorization as returned by + * CHAMELEON_zgelqf. * * @param[in] LDA * The leading dimension of the array A. LDA >= max(1,M). @@ -76,13 +74,14 @@ * On exit, auxiliary factorization data. * * @param[in,out] B - * On entry, the M-by-NRHS matrix B of right hand side vectors, stored columnwise; - * On exit, if return value = 0, B is overwritten by the solution vectors, stored - * columnwise: - * if M >= N, rows 1 to N of B contain the least squares solution vectors; the residual - * sum of squares for the solution in each column is given by the sum of squares of the - * modulus of elements N+1 to M in that column; - * if M < N, rows 1 to N of B contain the minimum norm solution vectors; + * On entry, the M-by-NRHS matrix B of right hand side vectors, stored + * columnwise; + * On exit, if return value = 0, B is overwritten by the solution + * vectors, stored columnwise: if M >= N, rows 1 to N of B contain the + * least squares solution vectors; the residual sum of squares for the + * solution in each column is given by the sum of squares of the + * modulus of elements N+1 to M in that column; if M < N, rows 1 to N + * of B contain the minimum norm solution vectors; * * @param[in] LDB * The leading dimension of the array B. LDB >= MAX(1,M,N). @@ -102,9 +101,9 @@ * */ int CHAMELEON_zgels( cham_trans_t trans, int M, int N, int NRHS, - CHAMELEON_Complex64_t *A, int LDA, - CHAM_desc_t *descT, - CHAMELEON_Complex64_t *B, int LDB ) + CHAMELEON_Complex64_t *A, int LDA, + CHAM_desc_t *descT, + CHAMELEON_Complex64_t *B, int LDB ) { int i, j; int NB; @@ -122,8 +121,8 @@ int CHAMELEON_zgels( cham_trans_t trans, int M, int N, int NRHS, } /* Check input arguments */ - if (trans != ChamNoTrans) { - chameleon_error("CHAMELEON_zgels", "only ChamNoTrans supported"); + if ( (trans != ChamNoTrans) && (trans != ChamConjTrans) ) { + chameleon_error("CHAMELEON_zgels", "illegal value of trans"); return CHAMELEON_ERR_NOT_SUPPORTED; } if (M < 0) { @@ -169,15 +168,14 @@ int CHAMELEON_zgels( cham_trans_t trans, int M, int N, int NRHS, /* Submit the matrix conversion */ if ( M >= N ) { chameleon_zlap2tile( chamctxt, &descAl, &descAt, ChamDescInout, ChamUpperLower, - A, NB, NB, LDA, N, M, N, sequence, &request ); + A, NB, NB, LDA, N, M, N, sequence, &request ); chameleon_zlap2tile( chamctxt, &descBl, &descBt, ChamDescInout, ChamUpperLower, - B, NB, NB, LDB, NRHS, M, NRHS, sequence, &request ); + B, NB, NB, LDB, NRHS, M, NRHS, sequence, &request ); } else { - /* Submit the matrix conversion */ chameleon_zlap2tile( chamctxt, &descAl, &descAt, ChamDescInout, ChamUpperLower, - A, NB, NB, LDA, N, M, N, sequence, &request ); + A, NB, NB, LDA, N, M, N, sequence, &request ); chameleon_zlap2tile( chamctxt, &descBl, &descBt, ChamDescInout, ChamUpperLower, - B, NB, NB, LDB, NRHS, N, NRHS, sequence, &request ); + B, NB, NB, LDB, NRHS, N, NRHS, sequence, &request ); } /* Call the tile interface */ @@ -185,9 +183,9 @@ int CHAMELEON_zgels( cham_trans_t trans, int M, int N, int NRHS, /* Submit the matrix conversion back */ chameleon_ztile2lap( chamctxt, &descAl, &descAt, - ChamDescInout, ChamUpperLower, sequence, &request ); + ChamDescInout, ChamUpperLower, sequence, &request ); chameleon_ztile2lap( chamctxt, &descBl, &descBt, - ChamDescInout, ChamUpperLower, sequence, &request ); + ChamDescInout, ChamUpperLower, sequence, &request ); CHAMELEON_Desc_Flush( descT, sequence ); chameleon_sequence_wait( chamctxt, sequence ); @@ -206,8 +204,9 @@ int CHAMELEON_zgels( cham_trans_t trans, int M, int N, int NRHS, * * @ingroup CHAMELEON_Complex64_t_Tile * - * CHAMELEON_zgels_Tile - Solves overdetermined or underdetermined linear system of equations - * using the tile QR or the tile LQ factorization. + * CHAMELEON_zgels_Tile - solves overdetermined or underdetermined linear + * systems involving an M-by-N matrix A using the QR or the LQ factorization of + * A. * Tile equivalent of CHAMELEON_zgels(). * Operates on matrices stored by tiles. * All matrices are passed through descriptors. @@ -219,27 +218,26 @@ int CHAMELEON_zgels( cham_trans_t trans, int M, int N, int NRHS, * Intended usage: * = ChamNoTrans: the linear system involves A; * = ChamConjTrans: the linear system involves A^H. - * Currently only ChamNoTrans is supported. * * @param[in,out] A * On entry, the M-by-N matrix A. - * On exit, - * if M >= N, A is overwritten by details of its QR factorization as returned by - * CHAMELEON_zgeqrf; - * if M < N, A is overwritten by details of its LQ factorization as returned by - * CHAMELEON_zgelqf. + * On exit, if M >= N, A is overwritten by details of its QR + * factorization as returned by CHAMELEON_zgeqrf; if M < N, A is + * overwritten by details of its LQ factorization as returned by + * CHAMELEON_zgelqf. * * @param[out] T * On exit, auxiliary factorization data. * * @param[in,out] B - * On entry, the M-by-NRHS matrix B of right hand side vectors, stored columnwise; - * On exit, if return value = 0, B is overwritten by the solution vectors, stored - * columnwise: - * if M >= N, rows 1 to N of B contain the least squares solution vectors; the residual - * sum of squares for the solution in each column is given by the sum of squares of the - * modulus of elements N+1 to M in that column; - * if M < N, rows 1 to N of B contain the minimum norm solution vectors; + * On entry, the M-by-NRHS matrix B of right hand side vectors, stored + * columnwise; + * On exit, if return value = 0, B is overwritten by the solution + * vectors, stored columnwise: if M >= N, rows 1 to N of B contain the + * least squares solution vectors; the residual sum of squares for the + * solution in each column is given by the sum of squares of the + * modulus of elements N+1 to M in that column; if M < N, rows 1 to N + * of B contain the minimum norm solution vectors; * ******************************************************************************* * @@ -255,7 +253,7 @@ int CHAMELEON_zgels( cham_trans_t trans, int M, int N, int NRHS, * */ int CHAMELEON_zgels_Tile( cham_trans_t trans, CHAM_desc_t *A, - CHAM_desc_t *T, CHAM_desc_t *B ) + CHAM_desc_t *T, CHAM_desc_t *B ) { CHAM_context_t *chamctxt; RUNTIME_sequence_t *sequence = NULL; @@ -311,8 +309,8 @@ int CHAMELEON_zgels_Tile( cham_trans_t trans, CHAM_desc_t *A, * */ int CHAMELEON_zgels_Tile_Async( cham_trans_t trans, CHAM_desc_t *A, - CHAM_desc_t *T, CHAM_desc_t *B, - RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) + CHAM_desc_t *T, CHAM_desc_t *B, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_desc_t *subA; CHAM_desc_t *subB; @@ -341,35 +339,28 @@ int CHAMELEON_zgels_Tile_Async( cham_trans_t trans, CHAM_desc_t *A, } /* Check descriptors for correctness */ + if ( (trans != ChamNoTrans) && (trans != ChamConjTrans) ) { + chameleon_error("CHAMELEON_zgels_Tile_Async", "illegal value of trans"); + return CHAMELEON_ERR_NOT_SUPPORTED; + } if (chameleon_desc_check(A) != CHAMELEON_SUCCESS) { - chameleon_error("CHAMELEON_zgels_Tile", "invalid first descriptor"); + chameleon_error("CHAMELEON_zgels_Tile_Async", "invalid first descriptor"); return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); } if (chameleon_desc_check(T) != CHAMELEON_SUCCESS) { - chameleon_error("CHAMELEON_zgels_Tile", "invalid second descriptor"); + chameleon_error("CHAMELEON_zgels_Tile_Async", "invalid second descriptor"); return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); } if (chameleon_desc_check(B) != CHAMELEON_SUCCESS) { - chameleon_error("CHAMELEON_zgels_Tile", "invalid third descriptor"); + chameleon_error("CHAMELEON_zgels_Tile_Async", "invalid third descriptor"); return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); } /* Check input arguments */ if (A->nb != A->mb || B->nb != B->mb) { - chameleon_error("CHAMELEON_zgels_Tile", "only square tiles supported"); + chameleon_error("CHAMELEON_zgels_Tile_Async", "only square tiles supported"); return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); } - if (trans != ChamNoTrans) { - chameleon_error("CHAMELEON_zgels_Tile", "only ChamNoTrans supported"); - return chameleon_request_fail(sequence, request, CHAMELEON_ERR_NOT_SUPPORTED); - } - /* Quick return - currently NOT equivalent to LAPACK's: - if (chameleon_min(M, chameleon_min(N, NRHS)) == 0) { - for (i = 0; i < chameleon_max(M, N); i++) - for (j = 0; j < NRHS; j++) - B[j*LDB+i] = 0.0; - return CHAMELEON_SUCCESS; - } - */ + if (A->m >= A->n) { #if defined(CHAMELEON_COPY_DIAG) { @@ -378,24 +369,67 @@ int CHAMELEON_zgels_Tile_Async( cham_trans_t trans, CHAM_desc_t *A, Dptr = &D; } #endif + /* + * compute QR factorization of A + */ if (chamctxt->householder == ChamFlatHouseholder) { chameleon_pzgeqrf( 1, A, T, Dptr, sequence, request ); - - chameleon_pzunmqr( 0, ChamLeft, ChamConjTrans, A, B, T, Dptr, sequence, request ); } else { chameleon_pzgeqrfrh( 1, CHAMELEON_RHBLK, A, T, Dptr, sequence, request ); + } + + /* Perform the solve part */ + if ( trans == ChamNoTrans ) { + /* + * Least-Squares Problem min || A * X - B || + * + * B(1:M,1:NRHS) := Q**H * B(1:M,1:NRHS) + */ + if (chamctxt->householder == ChamFlatHouseholder) { + chameleon_pzunmqr( 0, ChamLeft, ChamConjTrans, A, B, T, Dptr, sequence, request ); + } + else { + chameleon_pzunmqrrh( 0, CHAMELEON_RHBLK, ChamLeft, ChamConjTrans, A, B, T, Dptr, sequence, request ); + } - chameleon_pzunmqrrh( 0, CHAMELEON_RHBLK, ChamLeft, ChamConjTrans, A, B, T, Dptr, sequence, request ); + /* + * B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) + */ + subB = chameleon_desc_submatrix(B, 0, 0, A->n, B->n); + subA = chameleon_desc_submatrix(A, 0, 0, A->n, A->n); + chameleon_pztrsm( ChamLeft, ChamUpper, ChamNoTrans, ChamNonUnit, 1.0, subA, subB, sequence, request ); + } + else { + /* + * Underdetermined system of equations A**H * X = B + * + * B(1:N,1:NRHS) := inv(R**H) * B(1:N,1:NRHS) + */ + subB = chameleon_desc_submatrix(B, 0, 0, A->n, B->n); + subA = chameleon_desc_submatrix(A, 0, 0, A->n, A->n); + chameleon_pztrsm( ChamLeft, ChamUpper, ChamConjTrans, ChamNonUnit, 1.0, subA, subB, sequence, request ); + + /* + * B(M+1:N,1:NRHS) = 0 + */ + /* TODO: enable subdescriptor with non aligned starting point in laset */ + /* free(subB); */ + /* subB = chameleon_desc_submatrix(B, A->n, 0, A->m-A->n, B->n); */ + /* chameleon_pzlaset( ChamUpperLower, 0., 0., subB, sequence, request ); */ + + /* + * B(1:N,1:NRHS) := Q(1:N,:)**H * B(1:M,1:NRHS) + */ + if (chamctxt->householder == ChamFlatHouseholder) { + chameleon_pzunmqr( 0, ChamLeft, ChamNoTrans, A, B, T, Dptr, sequence, request ); + } + else { + chameleon_pzunmqrrh( 0, CHAMELEON_RHBLK, ChamLeft, ChamNoTrans, A, B, T, Dptr, sequence, request ); + } } - subB = chameleon_desc_submatrix(B, 0, 0, A->n, B->n); - subA = chameleon_desc_submatrix(A, 0, 0, A->n, A->n); - chameleon_pztrsm( ChamLeft, ChamUpper, ChamNoTrans, ChamNonUnit, 1.0, subA, subB, sequence, request ); } else { - /* subB = chameleon_desc_submatrix(B, A->m, 0, A->n-A->m, B->n); - chameleon_pzlaset( ChamUpperLower, 0., 0., subB, sequence, request ); - free(subB); */ #if defined(CHAMELEON_COPY_DIAG) { int m = chameleon_min(A->m, A->n); @@ -403,21 +437,64 @@ int CHAMELEON_zgels_Tile_Async( cham_trans_t trans, CHAM_desc_t *A, Dptr = &D; } #endif + /* + * Compute LQ factorization of A + */ if (chamctxt->householder == ChamFlatHouseholder) { chameleon_pzgelqf( 1, A, T, Dptr, sequence, request ); } else { chameleon_pzgelqfrh( 1, CHAMELEON_RHBLK, A, T, Dptr, sequence, request ); } - subB = chameleon_desc_submatrix(B, 0, 0, A->m, B->n); - subA = chameleon_desc_submatrix(A, 0, 0, A->m, A->m); - chameleon_pztrsm( ChamLeft, ChamLower, ChamNoTrans, ChamNonUnit, 1.0, subA, subB, sequence, request ); - if (chamctxt->householder == ChamFlatHouseholder) { - chameleon_pzunmlq( 0, ChamLeft, ChamConjTrans, A, B, T, Dptr, sequence, request ); + /* Perform the solve part */ + if ( trans == ChamNoTrans ) { + /* + * Underdetermined system of equations A * X = B + * + * B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) + */ + subB = chameleon_desc_submatrix(B, 0, 0, A->m, B->n); + subA = chameleon_desc_submatrix(A, 0, 0, A->m, A->m); + chameleon_pztrsm( ChamLeft, ChamLower, ChamNoTrans, ChamNonUnit, 1.0, subA, subB, sequence, request ); + + /* + * B(M+1:N,1:NRHS) = 0 + */ + /* TODO: enable subdescriptor with non aligned starting point in laset */ + /* free(subB); */ + /* subB = chameleon_desc_submatrix(B, A->m, 0, A->n-A->m, B->n); */ + /* chameleon_pzlaset( ChamUpperLower, 0., 0., subB, sequence, request ); */ + + /* + * B(1:N,1:NRHS) := Q(1:N,:)**H * B(1:M,1:NRHS) + */ + if (chamctxt->householder == ChamFlatHouseholder) { + chameleon_pzunmlq( 0, ChamLeft, ChamConjTrans, A, B, T, Dptr, sequence, request ); + } + else { + chameleon_pzunmlqrh( 0, CHAMELEON_RHBLK, ChamLeft, ChamConjTrans, A, B, T, Dptr, sequence, request ); + } } else { - chameleon_pzunmlqrh( 0, CHAMELEON_RHBLK, ChamLeft, ChamConjTrans, A, B, T, Dptr, sequence, request ); + /* + * Overdetermined system min || A**H * X - B || + * + * B(1:N,1:NRHS) := Q * B(1:N,1:NRHS) + */ + if (chamctxt->householder == ChamFlatHouseholder) { + chameleon_pzunmlq( 0, ChamLeft, ChamNoTrans, A, B, T, Dptr, sequence, request ); + } + else { + chameleon_pzunmlqrh( 0, CHAMELEON_RHBLK, ChamLeft, ChamNoTrans, A, B, T, Dptr, sequence, request ); + } + + /* + * B(1:M,1:NRHS) := inv(L**H) * B(1:M,1:NRHS) + */ + subB = chameleon_desc_submatrix(B, 0, 0, A->m, B->n); + subA = chameleon_desc_submatrix(A, 0, 0, A->m, A->m); + chameleon_pztrsm( ChamLeft, ChamLower, ChamConjTrans, ChamNonUnit, 1.0, subA, subB, sequence, request ); } } diff --git a/compute/zgels_param.c b/compute/zgels_param.c index 697e1b4ff..01999735f 100644 --- a/compute/zgels_param.c +++ b/compute/zgels_param.c @@ -26,19 +26,20 @@ * * @ingroup CHAMELEON_Complex64_t * - * CHAMELEON_zgels_param - solves overdetermined or underdetermined linear systems involving an M-by-N - * matrix A using the QR or the LQ factorization of A. It is assumed that A has full rank. - * The following options are provided: + * CHAMELEON_zgels_param - solves overdetermined or underdetermined linear + * systems involving an M-by-N matrix A using the QR or the LQ factorization of + * A. It is assumed that A has full rank. The following options are provided: * - * # trans = ChamNoTrans and M >= N: find the least squares solution of an overdetermined - * system, i.e., solve the least squares problem: minimize || B - A*X ||. + * # trans = ChamNoTrans and M >= N: find the least squares solution of an + * overdetermined system, i.e., solve the least squares problem: minimize + * || B - A*X ||. * - * # trans = ChamNoTrans and M < N: find the minimum norm solution of an underdetermined - * system A * X = B. + * # trans = ChamNoTrans and M < N: find the minimum norm solution of an + * underdetermined system A * X = B. * - * Several right hand side vectors B and solution vectors X can be handled in a single call; - * they are stored as the columns of the M-by-NRHS right hand side matrix B and the N-by-NRHS - * solution matrix X. + * Several right hand side vectors B and solution vectors X can be handled in a + * single call; they are stored as the columns of the M-by-NRHS right hand side + * matrix B and the N-by-NRHS solution matrix X. * ******************************************************************************* * @@ -49,7 +50,6 @@ * Intended usage: * = ChamNoTrans: the linear system involves A; * = ChamConjTrans: the linear system involves A^H. - * Currently only ChamNoTrans is supported. * * @param[in] M * The number of rows of the matrix A. M >= 0. @@ -58,16 +58,15 @@ * The number of columns of the matrix A. N >= 0. * * @param[in] NRHS - * The number of right hand sides, i.e., the number of columns of the matrices B and X. - * NRHS >= 0. + * The number of right hand sides, i.e., the number of columns of the + * matrices B and X. NRHS >= 0. * * @param[in,out] A * On entry, the M-by-N matrix A. - * On exit, - * if M >= N, A is overwritten by details of its QR factorization as returned by - * CHAMELEON_zgeqrf; - * if M < N, A is overwritten by details of its LQ factorization as returned by - * CHAMELEON_zgelqf. + * On exit, if M >= N, A is overwritten by details of its QR + * factorization as returned by CHAMELEON_zgeqrf; if M < N, A is + * overwritten by details of its LQ factorization as returned by + * CHAMELEON_zgelqf. * * @param[in] LDA * The leading dimension of the array A. LDA >= max(1,M). @@ -79,13 +78,14 @@ * On exit, auxiliary factorization data. * * @param[in,out] B - * On entry, the M-by-NRHS matrix B of right hand side vectors, stored columnwise; - * On exit, if return value = 0, B is overwritten by the solution vectors, stored - * columnwise: - * if M >= N, rows 1 to N of B contain the least squares solution vectors; the residual - * sum of squares for the solution in each column is given by the sum of squares of the - * modulus of elements N+1 to M in that column; - * if M < N, rows 1 to N of B contain the minimum norm solution vectors; + * On entry, the M-by-NRHS matrix B of right hand side vectors, stored + * columnwise; + * On exit, if return value = 0, B is overwritten by the solution + * vectors, stored columnwise: if M >= N, rows 1 to N of B contain the + * least squares solution vectors; the residual sum of squares for the + * solution in each column is given by the sum of squares of the + * modulus of elements N+1 to M in that column; if M < N, rows 1 to N + * of B contain the minimum norm solution vectors; * * @param[in] LDB * The leading dimension of the array B. LDB >= MAX(1,M,N). @@ -105,9 +105,9 @@ * */ int CHAMELEON_zgels_param( const libhqr_tree_t *qrtree, cham_trans_t trans, int M, int N, int NRHS, - CHAMELEON_Complex64_t *A, int LDA, - CHAM_desc_t *descTS, CHAM_desc_t *descTT, - CHAMELEON_Complex64_t *B, int LDB ) + CHAMELEON_Complex64_t *A, int LDA, + CHAM_desc_t *descTS, CHAM_desc_t *descTT, + CHAMELEON_Complex64_t *B, int LDB ) { int i, j; int NB; @@ -125,8 +125,8 @@ int CHAMELEON_zgels_param( const libhqr_tree_t *qrtree, cham_trans_t trans, int } /* Check input arguments */ - if (trans != ChamNoTrans) { - chameleon_error("CHAMELEON_zgels_param", "only ChamNoTrans supported"); + if ( (trans != ChamNoTrans) && (trans != ChamConjTrans) ) { + chameleon_error("CHAMELEON_zgels_param", "illegal value of trans"); return CHAMELEON_ERR_NOT_SUPPORTED; } if (M < 0) { @@ -172,14 +172,14 @@ int CHAMELEON_zgels_param( const libhqr_tree_t *qrtree, cham_trans_t trans, int /* Submit the matrix conversion */ if ( M >= N ) { chameleon_zlap2tile( chamctxt, &descAl, &descAt, ChamDescInout, ChamUpperLower, - A, NB, NB, LDA, N, M, N, sequence, &request ); + A, NB, NB, LDA, N, M, N, sequence, &request ); chameleon_zlap2tile( chamctxt, &descBl, &descBt, ChamDescInout, ChamUpperLower, - B, NB, NB, LDB, NRHS, M, NRHS, sequence, &request ); + B, NB, NB, LDB, NRHS, M, NRHS, sequence, &request ); } else { chameleon_zlap2tile( chamctxt, &descAl, &descAt, ChamDescInout, ChamUpperLower, - A, NB, NB, LDA, N, M, N, sequence, &request ); + A, NB, NB, LDA, N, M, N, sequence, &request ); chameleon_zlap2tile( chamctxt, &descBl, &descBt, ChamDescInout, ChamUpperLower, - B, NB, NB, LDB, NRHS, N, NRHS, sequence, &request ); + B, NB, NB, LDB, NRHS, N, NRHS, sequence, &request ); } /* Call the tile interface */ @@ -187,9 +187,9 @@ int CHAMELEON_zgels_param( const libhqr_tree_t *qrtree, cham_trans_t trans, int /* Submit the matrix conversion back */ chameleon_ztile2lap( chamctxt, &descAl, &descAt, - ChamDescInout, ChamUpperLower, sequence, &request ); + ChamDescInout, ChamUpperLower, sequence, &request ); chameleon_ztile2lap( chamctxt, &descBl, &descBt, - ChamDescInout, ChamUpperLower, sequence, &request ); + ChamDescInout, ChamUpperLower, sequence, &request ); CHAMELEON_Desc_Flush( descTS, sequence ); CHAMELEON_Desc_Flush( descTT, sequence ); @@ -205,12 +205,13 @@ int CHAMELEON_zgels_param( const libhqr_tree_t *qrtree, cham_trans_t trans, int } /** - ******************************************************************************* + ******************************************************************************** * * @ingroup CHAMELEON_Complex64_t_Tile * - * CHAMELEON_zgels_param_Tile - Solves overdetermined or underdetermined linear system of equations - * using the tile QR or the tile LQ factorization. + * CHAMELEON_zgels_param_Tile - solves overdetermined or underdetermined linear + * systems involving an M-by-N matrix A using the QR or the LQ factorization of + * A. * Tile equivalent of CHAMELEON_zgels_param(). * Operates on matrices stored by tiles. * All matrices are passed through descriptors. @@ -222,15 +223,13 @@ int CHAMELEON_zgels_param( const libhqr_tree_t *qrtree, cham_trans_t trans, int * Intended usage: * = ChamNoTrans: the linear system involves A; * = ChamConjTrans: the linear system involves A^H. - * Currently only ChamNoTrans is supported. * * @param[in,out] A * On entry, the M-by-N matrix A. - * On exit, - * if M >= N, A is overwritten by details of its QR factorization as returned by - * CHAMELEON_zgeqrf; - * if M < N, A is overwritten by details of its LQ factorization as returned by - * CHAMELEON_zgelqf. + * On exit, if M >= N, A is overwritten by details of its QR + * factorization as returned by CHAMELEON_zgeqrf; if M < N, A is + * overwritten by details of its LQ factorization as returned by + * CHAMELEON_zgelqf. * * @param[out] TS * On exit, auxiliary factorization data. @@ -239,13 +238,14 @@ int CHAMELEON_zgels_param( const libhqr_tree_t *qrtree, cham_trans_t trans, int * On exit, auxiliary factorization data. * * @param[in,out] B - * On entry, the M-by-NRHS matrix B of right hand side vectors, stored columnwise; - * On exit, if return value = 0, B is overwritten by the solution vectors, stored - * columnwise: - * if M >= N, rows 1 to N of B contain the least squares solution vectors; the residual - * sum of squares for the solution in each column is given by the sum of squares of the - * modulus of elements N+1 to M in that column; - * if M < N, rows 1 to N of B contain the minimum norm solution vectors; + * On entry, the M-by-NRHS matrix B of right hand side vectors, stored + * columnwise; + * On exit, if return value = 0, B is overwritten by the solution + * vectors, stored columnwise: if M >= N, rows 1 to N of B contain the + * least squares solution vectors; the residual sum of squares for the + * solution in each column is given by the sum of squares of the + * modulus of elements N+1 to M in that column; if M < N, rows 1 to N + * of B contain the minimum norm solution vectors; * ******************************************************************************* * @@ -261,7 +261,7 @@ int CHAMELEON_zgels_param( const libhqr_tree_t *qrtree, cham_trans_t trans, int * */ int CHAMELEON_zgels_param_Tile( const libhqr_tree_t *qrtree, cham_trans_t trans, CHAM_desc_t *A, - CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *B ) + CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *B ) { CHAM_context_t *chamctxt; RUNTIME_sequence_t *sequence = NULL; @@ -289,7 +289,7 @@ int CHAMELEON_zgels_param_Tile( const libhqr_tree_t *qrtree, cham_trans_t trans, } /** - ******************************************************************************* + ******************************************************************************** * * @ingroup CHAMELEON_Complex64_t_Tile_Async * @@ -318,8 +318,8 @@ int CHAMELEON_zgels_param_Tile( const libhqr_tree_t *qrtree, cham_trans_t trans, * */ int CHAMELEON_zgels_param_Tile_Async( const libhqr_tree_t *qrtree, cham_trans_t trans, CHAM_desc_t *A, - CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *B, - RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) + CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *B, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_desc_t *subA; CHAM_desc_t *subB; @@ -348,6 +348,10 @@ int CHAMELEON_zgels_param_Tile_Async( const libhqr_tree_t *qrtree, cham_trans_t } /* Check descriptors for correctness */ + if ( (trans != ChamNoTrans) && (trans != ChamConjTrans) ) { + chameleon_error("CHAMELEON_zgels_param_Tile_Async", "illegal value of trans"); + return CHAMELEON_ERR_NOT_SUPPORTED; + } if (chameleon_desc_check(A) != CHAMELEON_SUCCESS) { chameleon_error("CHAMELEON_zgels_param_Tile", "invalid first descriptor"); return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); @@ -369,20 +373,8 @@ int CHAMELEON_zgels_param_Tile_Async( const libhqr_tree_t *qrtree, cham_trans_t chameleon_error("CHAMELEON_zgels_param_Tile", "only square tiles supported"); return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); } - if (trans != ChamNoTrans) { - chameleon_error("CHAMELEON_zgels_param_Tile", "only ChamNoTrans supported"); - return chameleon_request_fail(sequence, request, CHAMELEON_ERR_NOT_SUPPORTED); - } - /* Quick return - currently NOT equivalent to LAPACK's: - if (chameleon_min(M, chameleon_min(N, NRHS)) == 0) { - for (i = 0; i < chameleon_max(M, N); i++) - for (j = 0; j < NRHS; j++) - B[j*LDB+i] = 0.0; - return CHAMELEON_SUCCESS; - } - */ - if (A->m >= A->n) { + if (A->m >= A->n) { #if defined(CHAMELEON_COPY_DIAG) { int n = chameleon_min(A->m, A->n); @@ -390,14 +382,51 @@ int CHAMELEON_zgels_param_Tile_Async( const libhqr_tree_t *qrtree, cham_trans_t Dptr = &D; } #endif - - subB = chameleon_desc_submatrix(B, 0, 0, A->n, B->n); - subA = chameleon_desc_submatrix(A, 0, 0, A->n, A->n); - + /* + * compute QR factorization of A + */ chameleon_pzgeqrf_param( 1, A->nt, qrtree, A, TS, TT, Dptr, sequence, request ); - chameleon_pzunmqr_param( 0, qrtree, ChamLeft, ChamConjTrans, A, B, TS, TT, Dptr, sequence, request ); - chameleon_pztrsm( ChamLeft, ChamUpper, ChamNoTrans, ChamNonUnit, 1.0, subA, subB, sequence, request ); + + /* Perform the solve part */ + if ( trans == ChamNoTrans ) { + /* + * Least-Squares Problem min || A * X - B || + * + * B(1:M,1:NRHS) := Q**H * B(1:M,1:NRHS) + */ + chameleon_pzunmqr_param( 0, qrtree, ChamLeft, ChamConjTrans, A, B, TS, TT, Dptr, sequence, request ); + + /* + * B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) + */ + subB = chameleon_desc_submatrix(B, 0, 0, A->n, B->n); + subA = chameleon_desc_submatrix(A, 0, 0, A->n, A->n); + chameleon_pztrsm( ChamLeft, ChamUpper, ChamNoTrans, ChamNonUnit, 1.0, subA, subB, sequence, request ); + } + else { + /* + * Underdetermined system of equations A**H * X = B + * + * B(1:N,1:NRHS) := inv(R**H) * B(1:N,1:NRHS) + */ + subB = chameleon_desc_submatrix(B, 0, 0, A->n, B->n); + subA = chameleon_desc_submatrix(A, 0, 0, A->n, A->n); + chameleon_pztrsm( ChamLeft, ChamUpper, ChamConjTrans, ChamNonUnit, 1.0, subA, subB, sequence, request ); + + /* + * B(M+1:N,1:NRHS) = 0 + */ + /* TODO: enable subdescriptor with non aligned starting point in laset */ + /* free(subB); */ + /* subB = chameleon_desc_submatrix(B, A->n, 0, A->m-A->n, B->n); */ + /* chameleon_pzlaset( ChamUpperLower, 0., 0., subB, sequence, request ); */ + + /* + * B(1:N,1:NRHS) := Q(1:N,:)**H * B(1:M,1:NRHS) + */ + chameleon_pzunmqr_param( 0, qrtree, ChamLeft, ChamNoTrans, A, B, TS, TT, Dptr, sequence, request ); + } } else { #if defined(CHAMELEON_COPY_DIAG) @@ -407,13 +436,50 @@ int CHAMELEON_zgels_param_Tile_Async( const libhqr_tree_t *qrtree, cham_trans_t Dptr = &D; } #endif + /* + * Compute LQ factorization of A + */ + chameleon_pzgelqf_param( 1, qrtree, A, TS, TT, Dptr, sequence, request ); - subB = chameleon_desc_submatrix(B, 0, 0, A->m, B->n); - subA = chameleon_desc_submatrix(A, 0, 0, A->m, A->m); + /* Perform the solve part */ + if ( trans == ChamNoTrans ) { + /* + * Underdetermined system of equations A * X = B + * + * B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) + */ + subB = chameleon_desc_submatrix(B, 0, 0, A->m, B->n); + subA = chameleon_desc_submatrix(A, 0, 0, A->m, A->m); + chameleon_pztrsm( ChamLeft, ChamLower, ChamNoTrans, ChamNonUnit, 1.0, subA, subB, sequence, request ); - chameleon_pzgelqf_param( 1, qrtree, A, TS, TT, Dptr, sequence, request ); - chameleon_pztrsm( ChamLeft, ChamLower, ChamNoTrans, ChamNonUnit, 1.0, subA, subB, sequence, request ); - chameleon_pzunmlq_param( 0, qrtree, ChamLeft, ChamConjTrans, A, B, TS, TT, Dptr, sequence, request ); + /* + * B(M+1:N,1:NRHS) = 0 + */ + /* TODO: enable subdescriptor with non aligned starting point in laset */ + /* free(subB); */ + /* subB = chameleon_desc_submatrix(B, A->m, 0, A->n-A->m, B->n); */ + /* chameleon_pzlaset( ChamUpperLower, 0., 0., subB, sequence, request ); */ + + /* + * B(1:N,1:NRHS) := Q(1:N,:)**H * B(1:M,1:NRHS) + */ + chameleon_pzunmlq_param( 0, qrtree, ChamLeft, ChamConjTrans, A, B, TS, TT, Dptr, sequence, request ); + } + else { + /* + * Overdetermined system min || A**H * X - B || + * + * B(1:N,1:NRHS) := Q * B(1:N,1:NRHS) + */ + chameleon_pzunmlq_param( 0, qrtree, ChamLeft, ChamNoTrans, A, B, TS, TT, Dptr, sequence, request ); + + /* + * B(1:M,1:NRHS) := inv(L**H) * B(1:M,1:NRHS) + */ + subB = chameleon_desc_submatrix(B, 0, 0, A->m, B->n); + subA = chameleon_desc_submatrix(A, 0, 0, A->m, A->m); + chameleon_pztrsm( ChamLeft, ChamLower, ChamConjTrans, ChamNonUnit, 1.0, subA, subB, sequence, request ); + } } free(subA); diff --git a/compute/zunmlq.c b/compute/zunmlq.c index bbc4a6a77..07c635a67 100644 --- a/compute/zunmlq.c +++ b/compute/zunmlq.c @@ -219,7 +219,6 @@ int CHAMELEON_zunmlq( cham_side_t side, cham_trans_t trans, int M, int N, int K, * Intended usage: * = ChamNoTrans: no transpose, apply Q; * = ChamConjTrans: conjugate transpose, apply Q^H. - * Currently only ChamConjTrans is supported. * * @param[in] A * Details of the LQ factorization of the original matrix A as returned by CHAMELEON_zgelqf. diff --git a/compute/zunmqr.c b/compute/zunmqr.c index 4139f579a..09387b79b 100644 --- a/compute/zunmqr.c +++ b/compute/zunmqr.c @@ -220,7 +220,6 @@ int CHAMELEON_zunmqr( cham_side_t side, cham_trans_t trans, int M, int N, int K, * Intended usage: * = ChamNoTrans: no transpose, apply Q; * = ChamConjTrans: conjugate transpose, apply Q^H. - * Currently only ChamConjTrans is supported. * * @param[in] A * Details of the QR factorization of the original matrix A as returned by CHAMELEON_zgeqrf. diff --git a/runtime/starpu/control/runtime_descriptor.c b/runtime/starpu/control/runtime_descriptor.c index df527bbf5..9c837e5b4 100644 --- a/runtime/starpu/control/runtime_descriptor.c +++ b/runtime/starpu/control/runtime_descriptor.c @@ -415,6 +415,7 @@ void RUNTIME_data_migrate( const RUNTIME_sequence_t *sequence, const CHAM_desc_t *A, int Am, int An, int new_rank ) { #if defined(HAVE_STARPU_MPI_DATA_MIGRATE) + int old_rank; starpu_data_handle_t *handle = (starpu_data_handle_t*)(A->schedopt); starpu_data_handle_t lhandle; handle += ((int64_t)(A->lmt) * (int64_t)An + (int64_t)Am); @@ -424,8 +425,11 @@ void RUNTIME_data_migrate( const RUNTIME_sequence_t *sequence, /* Register the data */ lhandle = RUNTIME_data_getaddr( A, Am, An ); } + old_rank = starpu_mpi_data_get_rank( lhandle ); - starpu_mpi_data_migrate( MPI_COMM_WORLD, lhandle, new_rank ); + if ( old_rank != new_rank ) { + starpu_mpi_data_migrate( MPI_COMM_WORLD, lhandle, new_rank ); + } (void)sequence; #else -- GitLab