From 52c188d947e4db2e76502bc2dda4739073dca8e8 Mon Sep 17 00:00:00 2001 From: Mathieu Faverge <mathieu.faverge@inria.fr> Date: Wed, 17 Oct 2018 23:01:05 +0200 Subject: [PATCH] Factorize migrate calls, and fix tpm* kernels --- compute/pzgelqf_param.c | 63 +++++++++++++++----------------- compute/pzgelqfrh.c | 16 ++++---- compute/pzgeqrf_param.c | 53 ++++++++++++++------------- compute/pzgeqrfrh.c | 16 ++++---- compute/pztpgqrt.c | 65 ++------------------------------- compute/pztpqrt.c | 13 +++---- compute/pzunglq_param.c | 23 +++++------- compute/pzunglqrh.c | 16 ++++---- compute/pzungqr_param.c | 49 +++++++++++-------------- compute/pzungqrrh.c | 18 ++++----- compute/pzunmlq_param.c | 60 ++++++++++++++---------------- compute/pzunmlqrh.c | 58 +++++++++++++---------------- compute/pzunmqr_param.c | 62 +++++++++++++++---------------- compute/pzunmqrrh.c | 60 +++++++++++++----------------- compute/zgels_param.c | 3 +- compute/zgeqrf_param.c | 16 ++++++-- compute/ztpgqrt.c | 19 ++++++---- compute/zungqr_param.c | 21 ++++++++--- coreblas/compute/core_ztpmlqt.c | 2 +- coreblas/compute/core_ztpmqrt.c | 2 +- 20 files changed, 277 insertions(+), 358 deletions(-) diff --git a/compute/pzgelqf_param.c b/compute/pzgelqf_param.c index 9f853ef7a..cfc343eb2 100644 --- a/compute/pzgelqf_param.c +++ b/compute/pzgelqf_param.c @@ -26,8 +26,7 @@ #define T(m,n) T, (m), (n) #define D(m,n) D, (m), (n) - -/* +/** * Parallel tile LQ factorization (reduction Householder) - dynamic scheduling */ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t *A, @@ -41,11 +40,10 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t size_t ws_host = 0; int k, m, n, i, p; - int K, L; + int K, L, nbgeqrt; int ldak, ldam, lddk; int tempkmin, tempkm, tempnn, tempmm, temppn; - int ib; - int *tiles; + int ib, node, nbtiles, *tiles; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) @@ -60,37 +58,32 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t } /* - * zgelqt = A->nb * (ib+1) - * zunmlq = A->nb * ib - * ztslqt = A->nb * (ib+1) - * zttlqt = A->nb * (ib+1) - * ztsmlq = A->nb * ib - * zttmlq = A->nb * ib + * zgelqt = A->nb * (ib+1) + * zunmlq = A->nb * ib + * ztplqt = A->nb * (ib+1) + * ztpmlqt = A->nb * ib */ ws_worker = A->nb * (ib+1); /* Allocation of temporary (scratch) working space */ #if defined(CHAMELEON_USE_CUDA) - /* Worker space - * - * zunmlq = A->nb * ib - * ztsmlq = 2 * A->nb * ib + /* + * zunmqr = A->nb * ib + * ztpmqrt = 2 * A->nb * ib */ ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); #endif - /* Initialisation of tiles */ - - tiles = (int*)calloc(qrtree->mt, sizeof(int)); - ws_worker *= sizeof(CHAMELEON_Complex64_t); ws_host *= sizeof(CHAMELEON_Complex64_t); RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); + /* Initialisation of temporary tiles array */ + tiles = (int*)calloc(qrtree->mt, sizeof(int)); + K = chameleon_min(A->mt, A->nt); - /* The number of the factorization */ for (k = 0; k < K; k++) { RUNTIME_iteration_push(chamctxt, k); @@ -98,9 +91,11 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t ldak = BLKLDD(A, k); lddk = BLKLDD(D, k); - T = TS; /* The number of geqrt to apply */ - for (i = 0; i < qrtree->getnbgeqrf(qrtree, k); i++) { + nbgeqrt = qrtree->getnbgeqrf(qrtree, k); + + T = TS; + for (i = 0; i < nbgeqrt; i++) { p = qrtree->getm(qrtree, k, i); temppn = p == A->nt-1 ? A->n-p*A->nb : A->nb; tempkmin = chameleon_min(tempkm, temppn); @@ -110,6 +105,7 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t tempkm, temppn, ib, T->nb, A( k, p), ldak, T(k, p), T->mb); + if ( genD ) { INSERT_TASK_zlacpy( &options, @@ -124,13 +120,14 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t D(k, p), lddk ); #endif } + for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; ldam = BLKLDD(A, m); INSERT_TASK_zunmlq( &options, ChamRight, ChamConjTrans, - tempmm, temppn, tempkmin, ib, T->nb, + tempmm, temppn, tempkmin, ib, T->nb, D(k, p), lddk, T(k, p), T->mb, A(m, p), ldam); @@ -140,15 +137,15 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t } /* Setting the order of the tiles */ - libhqr_walk_stepk( qrtree, k, tiles + (k+1) ); + nbtiles = libhqr_walk_stepk( qrtree, k, tiles ); - for (i = k+1; i < A->nt; i++) { + for (i = 0; i < nbtiles; i++) { n = tiles[i]; p = qrtree->currpiv(qrtree, k, n); tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; - if (qrtree->gettype(qrtree, k, n) == 0) { + if ( qrtree->gettype(qrtree, k, n) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ T = TS; L = 0; @@ -159,10 +156,9 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t L = tempnn; } - RUNTIME_data_migrate( sequence, A(k, p), - A->get_rankof( A, k, n ) ); - RUNTIME_data_migrate( sequence, A(k, n), - A->get_rankof( A, k, n ) ); + node = A->get_rankof( A, k, n ); + RUNTIME_data_migrate( sequence, A(k, p), node ); + RUNTIME_data_migrate( sequence, A(k, n), node ); INSERT_TASK_ztplqt( &options, @@ -175,10 +171,9 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; ldam = BLKLDD(A, m); - RUNTIME_data_migrate( sequence, A(m, p), - A->get_rankof( A, m, n ) ); - RUNTIME_data_migrate( sequence, A(m, n), - A->get_rankof( A, m, n ) ); + node = A->get_rankof( A, m, n ); + RUNTIME_data_migrate( sequence, A(m, p), node ); + RUNTIME_data_migrate( sequence, A(m, n), node ); INSERT_TASK_ztpmlqt( &options, diff --git a/compute/pzgelqfrh.c b/compute/pzgelqfrh.c index 6dd19a90e..55c3121b2 100644 --- a/compute/pzgelqfrh.c +++ b/compute/pzgelqfrh.c @@ -46,7 +46,7 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM int K, N, RD; int ldak, ldam, lddk; int tempkmin, tempkm, tempNn, tempnn, tempmm, tempNRDn; - int ib; + int ib, node; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) @@ -167,10 +167,9 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM for (N = k; N+RD < A->nt; N += 2*RD) { tempNRDn = N+RD == A->nt-1 ? A->n-(N+RD)*A->nb : A->nb; - RUNTIME_data_migrate( sequence, A(k, N), - A->get_rankof( A, k, N+RD ) ); - RUNTIME_data_migrate( sequence, A(k, N+RD), - A->get_rankof( A, k, N+RD ) ); + node = A->get_rankof( A, k, N+RD ); + RUNTIME_data_migrate( sequence, A(k, N), node ); + RUNTIME_data_migrate( sequence, A(k, N+RD), node ); /* TT kernel */ INSERT_TASK_ztplqt( @@ -184,10 +183,9 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; ldam = BLKLDD(A, m ); - RUNTIME_data_migrate( sequence, A(m, N), - A->get_rankof( A, m, N+RD ) ); - RUNTIME_data_migrate( sequence, A(m, N+RD), - A->get_rankof( A, m, N+RD ) ); + node = A->get_rankof( A, m, N+RD ); + RUNTIME_data_migrate( sequence, A(m, N), node ); + RUNTIME_data_migrate( sequence, A(m, N+RD), node ); INSERT_TASK_ztpmlqt( &options, diff --git a/compute/pzgeqrf_param.c b/compute/pzgeqrf_param.c index 01c0a816f..cc63e3a18 100644 --- a/compute/pzgeqrf_param.c +++ b/compute/pzgeqrf_param.c @@ -22,16 +22,21 @@ #include <stdlib.h> #include "libhqr.h" -#define A(m,n) A, (m), (n) -#define T(m,n) T, (m), (n) -#define D(m,n) D, (m), (n) +#define A(m,n) A, (m), (n) +#define T(m,n) T, (m), (n) +#define D(m,n) D, (m), (n) /** * Parallel tile QR factorization (reduction Householder) - dynamic scheduling + * + * @param[in] genD + * Indicate if the copies of the geqrt tiles must be done to speedup + * computations in updates. */ -void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t *A, +void chameleon_pzgeqrf_param( int genD, int K, + const libhqr_tree_t *qrtree, CHAM_desc_t *A, CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D, - RUNTIME_sequence_t *sequence, RUNTIME_request_t *request) + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_context_t *chamctxt; RUNTIME_option_t options; @@ -40,11 +45,10 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t size_t ws_host = 0; int k, m, n, i, p; - int K, L, nbgeqrt; + int L, nbgeqrt; int ldap, ldam, lddm; int tempkmin, tempkn, tempnn, tempmm; - int ib; - int *tiles; + int ib, node, nbtiles, *tiles; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) @@ -75,23 +79,22 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); #endif - /* Initialisation of temporary tiles array */ - tiles = (int*)calloc(qrtree->mt, sizeof(int)); - ws_worker *= sizeof(CHAMELEON_Complex64_t); ws_host *= sizeof(CHAMELEON_Complex64_t); RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); - K = chameleon_min(A->mt, A->nt); + /* Initialisation of temporary tiles array */ + tiles = (int*)calloc(qrtree->mt, sizeof(int)); - /* The number of the factorization */ for (k = 0; k < K; k++) { RUNTIME_iteration_push(chamctxt, k); tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; /* The number of geqrt to apply */ nbgeqrt = qrtree->getnbgeqrf(qrtree, k); + + T = TS; for (i = 0; i < nbgeqrt; i++) { m = qrtree->getm(qrtree, k, i); tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; @@ -99,13 +102,12 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t ldam = BLKLDD(A, m); lddm = BLKLDD(D, m); - T = TS; - INSERT_TASK_zgeqrt( &options, tempmm, tempkn, ib, T->nb, A(m, k), ldam, T(m, k), T->mb); + if ( genD ) { INSERT_TASK_zlacpy( &options, @@ -120,6 +122,7 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t D(m, k), lddm ); #endif } + for (n = k+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; INSERT_TASK_zunmqr( @@ -135,9 +138,9 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t } /* Setting the order of the tiles */ - libhqr_walk_stepk( qrtree, k, tiles + (k+1) ); + nbtiles = libhqr_walk_stepk( qrtree, k, tiles ); - for (i = k+1; i < A->mt; i++) { + for (i = 0; i < nbtiles; i++) { m = tiles[i]; p = qrtree->currpiv(qrtree, k, m); @@ -145,7 +148,7 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t ldap = BLKLDD(A, p); ldam = BLKLDD(A, m); - if (qrtree->gettype(qrtree, k, m) == 0) { + if ( qrtree->gettype(qrtree, k, m) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ T = TS; L = 0; @@ -156,10 +159,9 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t L = tempmm; } - RUNTIME_data_migrate( sequence, A(p, k), - A->get_rankof( A, m, k ) ); - RUNTIME_data_migrate( sequence, A(m, k), - A->get_rankof( A, m, k ) ); + node = A->get_rankof( A, m, k ); + RUNTIME_data_migrate( sequence, A(p, k), node ); + RUNTIME_data_migrate( sequence, A(m, k), node ); INSERT_TASK_ztpqrt( &options, @@ -171,10 +173,9 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t for (n = k+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; - RUNTIME_data_migrate( sequence, A(p, n), - A->get_rankof( A, m, n ) ); - RUNTIME_data_migrate( sequence, A(m, n), - A->get_rankof( A, m, n ) ); + node = A->get_rankof( A, m, n ); + RUNTIME_data_migrate( sequence, A(p, n), node ); + RUNTIME_data_migrate( sequence, A(m, n), node ); INSERT_TASK_ztpmqrt( &options, diff --git a/compute/pzgeqrfrh.c b/compute/pzgeqrfrh.c index cab40dd42..1eeb7cdbc 100644 --- a/compute/pzgeqrfrh.c +++ b/compute/pzgeqrfrh.c @@ -46,7 +46,7 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM int K, M, RD; int ldaM, ldam, ldaMRD, lddM; int tempkmin, tempkn, tempMm, tempnn, tempmm, tempMRDm; - int ib; + int ib, node; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) @@ -166,10 +166,9 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM ldaM = BLKLDD(A, M ); ldaMRD = BLKLDD(A, M+RD); - RUNTIME_data_migrate( sequence, A(M, k), - A->get_rankof( A, M+RD, k ) ); - RUNTIME_data_migrate( sequence, A(M+RD, k), - A->get_rankof( A, M+RD, k ) ); + node = A->get_rankof( A, M+RD, k ); + RUNTIME_data_migrate( sequence, A(M, k), node ); + RUNTIME_data_migrate( sequence, A(M+RD, k), node ); /* TT kernel */ INSERT_TASK_ztpqrt( @@ -182,10 +181,9 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM for (n = k+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; - RUNTIME_data_migrate( sequence, A(M, n), - A->get_rankof( A, M+RD, n ) ); - RUNTIME_data_migrate( sequence, A(M+RD, n), - A->get_rankof( A, M+RD, n ) ); + node = A->get_rankof( A, M+RD, n ); + RUNTIME_data_migrate( sequence, A(M, n), node ); + RUNTIME_data_migrate( sequence, A(M+RD, n), node ); INSERT_TASK_ztpmqrt( &options, diff --git a/compute/pztpgqrt.c b/compute/pztpgqrt.c index f2a68870a..f6062aa3d 100644 --- a/compute/pztpgqrt.c +++ b/compute/pztpgqrt.c @@ -20,8 +20,6 @@ */ #include "control/common.h" -#define V1(m,n) V1, m, n -#define T1(m,n) T1, m, n #define V2(m,n) V2, m, n #define T2(m,n) T2, m, n #define Q1(m,n) Q1, m, n @@ -31,11 +29,9 @@ /** * Parallel tile QR factorization - dynamic scheduling */ -void chameleon_pztpgqrt( int genD, int L, - CHAM_desc_t *V1, CHAM_desc_t *T1, +void chameleon_pztpgqrt( int KT, int L, CHAM_desc_t *V2, CHAM_desc_t *T2, CHAM_desc_t *Q1, CHAM_desc_t *Q2, - CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_context_t *chamctxt; @@ -46,7 +42,7 @@ void chameleon_pztpgqrt( int genD, int L, int k, m, n; int ldvk, ldvm, lddk; int ldqk, ldqm; - int tempkm, tempkn, tempkk, tempnn, tempmm, templm; + int tempkn, tempnn, tempmm, templm; int ib; /* Dimension of the first column */ @@ -61,11 +57,6 @@ void chameleon_pztpgqrt( int genD, int L, ib = CHAMELEON_IB; - if ( D == NULL ) { - D = V1; - genD = 0; - } - /* * ztpmqrt = Q1->nb * ib */ @@ -85,21 +76,17 @@ void chameleon_pztpgqrt( int genD, int L, RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); - for (k = V1->nt-1; k >= 0; k--) { + for (k = KT-1; k >= 0; k--) { RUNTIME_iteration_push(chamctxt, k); - tempkm = k == V1->mt-1 ? V1->m-k*V1->mb : V1->mb; - tempkk = k == V1->nt-1 ? V1->n-k*V1->nb : V1->nb; tempkn = k == Q1->nt-1 ? Q1->n-k*Q1->nb : Q1->nb; - ldvk = BLKLDD(V1, k); - lddk = BLKLDD(D, k); ldqk = BLKLDD(Q1, k); /* Equivalent to the tsmqr step on Q1,Q2 */ maxmtk = chameleon_min( Q2->mt, maxmt+k ) - 1; for (m = maxmtk; m > -1; m--) { tempmm = m == Q2->mt-1 ? Q2->m-m*Q2->mb : Q2->mb; - templm = m == maxmtk ? tempmm : 0; + templm = ((L > 0) && (m == maxmtk)) ? tempmm : 0; ldvm = BLKLDD(V2, m); ldqm = BLKLDD(Q2, m); @@ -117,53 +104,9 @@ void chameleon_pztpgqrt( int genD, int L, } } - for (m = Q1->mt - 1; m > k; m--) { - tempmm = m == Q1->mt-1 ? Q1->m-m*Q1->mb : Q1->mb; - ldvm = BLKLDD(V1, m); - ldqm = BLKLDD(Q1, m); - for (n = k; n < Q1->nt; n++) { - tempnn = n == Q1->nt-1 ? Q1->n-n*Q1->nb : Q1->nb; - /* TS kernel */ - INSERT_TASK_ztpmqrt( - &options, - ChamLeft, ChamNoTrans, - tempmm, tempnn, tempkn, 0, ib, T1->nb, - V1(m, k), ldvm, - T1(m, k), T1->mb, - Q1(k, n), ldqk, - Q1(m, n), ldqm ); - } - } - - if ( genD ) { - INSERT_TASK_zlacpy( - &options, - ChamLower, tempkm, tempkk, V1->nb, - V1(k, k), ldvk, - D(k), lddk ); -#if defined(CHAMELEON_USE_CUDA) - INSERT_TASK_zlaset( - &options, - ChamUpper, tempkm, tempkk, - 0., 1., - D(k), lddk ); -#endif - } - for (n = k; n < Q1->nt; n++) { - tempnn = n == Q1->nt-1 ? Q1->n-n*Q1->nb : Q1->nb; - INSERT_TASK_zunmqr( - &options, - ChamLeft, ChamNoTrans, - tempkm, tempnn, tempkk, ib, T1->nb, - D(k), lddk, - T1(k, k), T1->mb, - Q1(k, n), ldqk); - } - RUNTIME_iteration_pop(chamctxt); } RUNTIME_options_ws_free(&options); RUNTIME_options_finalize(&options, chamctxt); - (void)D; } diff --git a/compute/pztpqrt.c b/compute/pztpqrt.c index fb2af03ec..37de659fe 100644 --- a/compute/pztpqrt.c +++ b/compute/pztpqrt.c @@ -20,9 +20,9 @@ */ #include "control/common.h" -#define A(m,n) A, m, n -#define B(m,n) B, m, n -#define T(m,n) T, m, n +#define A(m,n) A, (m), (n) +#define B(m,n) B, (m), (n) +#define T(m,n) T, (m), (n) /** * Parallel tile QR factorization - dynamic scheduling @@ -53,15 +53,14 @@ void chameleon_pztpqrt( int L, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *T, ib = CHAMELEON_IB; /* - * ztsqrt = A->nb * (ib+1) + * ztpqrt = A->nb * (ib+1) * ztpmqrt = A->nb * ib */ ws_worker = A->nb * (ib+1); /* Allocation of temporary (scratch) working space */ #if defined(CHAMELEON_USE_CUDA) - /* Worker space - * + /* * ztpmqrt = 2 * A->nb * ib */ ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); @@ -81,7 +80,7 @@ void chameleon_pztpqrt( int L, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *T, for (m = 0; m < maxmt; m++) { tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - templm = m == maxmt-1 ? tempmm : 0; + templm = ((L > 0) && (m == maxmt-1)) ? tempmm : 0; ldbm = BLKLDD(B, m); /* TT kernel */ INSERT_TASK_ztpqrt( diff --git a/compute/pzunglq_param.c b/compute/pzunglq_param.c index 4a82007a5..a6cc32e6f 100644 --- a/compute/pzunglq_param.c +++ b/compute/pzunglq_param.c @@ -43,8 +43,7 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t int K, L; int ldak, ldqm, lddk; int tempkm, tempkmin, temppn, tempnn, tempmm; - int ib; - int *tiles; + int ib, node, nbtiles, *tiles; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) @@ -77,15 +76,14 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); #endif - /* Initialisation of tiles */ - - tiles = (int*)calloc( qrtree->mt, sizeof(int)); - ws_worker *= sizeof(CHAMELEON_Complex64_t); ws_host *= sizeof(CHAMELEON_Complex64_t); RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); + /* Initialisation of tiles */ + tiles = (int*)calloc( qrtree->mt, sizeof(int)); + K = chameleon_min(A->mt, A->nt); for (k = K-1; k >= 0; k--) { @@ -96,15 +94,15 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t lddk = BLKLDD(D, k); /* Setting the order of the tiles*/ - libhqr_walk_stepk(qrtree, k, tiles + (k+1)); + nbtiles = libhqr_walk_stepk( qrtree, k, tiles ); - for (i = A->nt-1; i > k; i--) { + for (i = nbtiles-1; i >= 0; i--) { n = tiles[i]; p = qrtree->currpiv(qrtree, k, n); tempnn = n == Q->nt-1 ? Q->n-n*Q->nb : Q->nb; - if(qrtree->gettype(qrtree, k, n) == 0){ + if( qrtree->gettype(qrtree, k, n) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ L = 0; T = TS; @@ -118,10 +116,9 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb; ldqm = BLKLDD(Q, m); - RUNTIME_data_migrate( sequence, Q(m, p), - Q->get_rankof( Q, m, n ) ); - RUNTIME_data_migrate( sequence, Q(m, n), - Q->get_rankof( Q, m, n ) ); + node = Q->get_rankof( Q, m, n ); + RUNTIME_data_migrate( sequence, Q(m, p), node ); + RUNTIME_data_migrate( sequence, Q(m, n), node ); INSERT_TASK_ztpmlqt( &options, diff --git a/compute/pzunglqrh.c b/compute/pzunglqrh.c index 4cde3dc14..e8a6bdac9 100644 --- a/compute/pzunglqrh.c +++ b/compute/pzunglqrh.c @@ -48,7 +48,7 @@ void chameleon_pzunglqrh( int genD, int BS, int K, N, RD, lastRD; int ldak, lddk, ldqm; int tempkm, tempkmin, tempNn, tempnn, tempmm, tempNRDn; - int ib; + int ib, node; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) @@ -99,10 +99,9 @@ void chameleon_pzunglqrh( int genD, int BS, tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb; ldqm = BLKLDD(Q, m ); - RUNTIME_data_migrate( sequence, Q(m, N), - Q->get_rankof( Q, m, N+RD ) ); - RUNTIME_data_migrate( sequence, Q(m, N+RD), - Q->get_rankof( Q, m, N+RD ) ); + node = Q->get_rankof( Q, m, N+RD ); + RUNTIME_data_migrate( sequence, Q(m, N), node ); + RUNTIME_data_migrate( sequence, Q(m, N+RD), node ); /* TT kernel */ INSERT_TASK_ztpmlqt( @@ -129,10 +128,9 @@ void chameleon_pzunglqrh( int genD, int BS, tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb; ldqm = BLKLDD(Q, m); - RUNTIME_data_migrate( sequence, Q(m, N), - Q->get_rankof( Q, m, n ) ); - RUNTIME_data_migrate( sequence, Q(m, n), - Q->get_rankof( Q, m, n ) ); + node = Q->get_rankof( Q, m, n ); + RUNTIME_data_migrate( sequence, Q(m, N), node ); + RUNTIME_data_migrate( sequence, Q(m, n), node ); /* TS kernel */ INSERT_TASK_ztpmlqt( diff --git a/compute/pzungqr_param.c b/compute/pzungqr_param.c index 0bf5b2826..d46024821 100644 --- a/compute/pzungqr_param.c +++ b/compute/pzungqr_param.c @@ -29,7 +29,8 @@ /** * Parallel construction of Q using tile V (application to identity) - dynamic scheduling */ -void chameleon_pzungqr_param( int genD, const libhqr_tree_t *qrtree, +void chameleon_pzungqr_param( int genD, int K, + const libhqr_tree_t *qrtree, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) @@ -43,8 +44,7 @@ void chameleon_pzungqr_param( int genD, const libhqr_tree_t *qrtree, int k, m, n, i, p, L; int ldam, ldqm, ldqp, lddm; int tempmm, tempnn, tempkmin, tempkn; - int ib, minMT; - int *tiles; + int ib, nbgeqrt, node, nbtiles, *tiles; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) @@ -53,12 +53,6 @@ void chameleon_pzungqr_param( int genD, const libhqr_tree_t *qrtree, ib = CHAMELEON_IB; - if (A->m > A->n) { - minMT = A->nt; - } else { - minMT = A->mt; - } - if (D == NULL) { D = A; genD = 0; @@ -66,47 +60,44 @@ void chameleon_pzungqr_param( int genD, const libhqr_tree_t *qrtree, /* * zunmqr = A->nb * ib - * ztsmqr = A->nb * ib + * ztpmqr = A->nb * ib */ ws_worker = A->nb * ib; /* Allocation of temporary (scratch) working space */ #if defined(CHAMELEON_USE_CUDA) - /* Worker space - * - * zunmqr = A->nb * ib - * ztsmqr = 2 * A->nb * ib + /* + * ztpmqrt = 2 * A->nb * ib */ ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); #endif - /* Initialisation of tiles */ - - tiles = (int*)calloc(qrtree->mt, sizeof(int)); - ws_worker *= sizeof(CHAMELEON_Complex64_t); ws_host *= sizeof(CHAMELEON_Complex64_t); RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); - for (k = minMT-1; k >= 0; k--) { + /* Initialisation of temporary tiles array */ + tiles = (int*)calloc(qrtree->mt, sizeof(int)); + + for (k = K-1; k >=0; k--) { RUNTIME_iteration_push(chamctxt, k); tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; /* Setting the order of tiles */ - libhqr_walk_stepk(qrtree, k, tiles + (k+1)); + nbtiles = libhqr_walk_stepk( qrtree, k, tiles ); - for (i = Q->mt-1; i > k; i--) { + for (i = nbtiles-1; i >= 0; i--) { m = tiles[i]; p = qrtree->currpiv(qrtree, k, m); tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb; + ldqp = BLKLDD(Q, p); ldam = BLKLDD(A, m); ldqm = BLKLDD(Q, m); - ldqp = BLKLDD(Q, p); - if(qrtree->gettype(qrtree, k , m) == 0) { + if( qrtree->gettype(qrtree, k, m) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ T = TS; L = 0; @@ -120,10 +111,9 @@ void chameleon_pzungqr_param( int genD, const libhqr_tree_t *qrtree, for (n = k; n < Q->nt; n++) { tempnn = n == Q->nt-1 ? Q->n-n*Q->nb : Q->nb; - RUNTIME_data_migrate( sequence, Q(p, n), - Q->get_rankof( Q, m, n ) ); - RUNTIME_data_migrate( sequence, Q(m, n), - Q->get_rankof( Q, m, n ) ); + node = Q->get_rankof( Q, m, n ); + RUNTIME_data_migrate( sequence, Q(p, n), node ); + RUNTIME_data_migrate( sequence, Q(m, n), node ); INSERT_TASK_ztpmqrt( &options, @@ -139,7 +129,10 @@ void chameleon_pzungqr_param( int genD, const libhqr_tree_t *qrtree, } T = TS; - for (i = 0; i < qrtree->getnbgeqrf(qrtree, k); i++) { + + /* The number of geqrt to apply */ + nbgeqrt = qrtree->getnbgeqrf(qrtree, k); + for (i = 0; i < nbgeqrt; i++) { m = qrtree->getm(qrtree, k, i); tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; diff --git a/compute/pzungqrrh.c b/compute/pzungqrrh.c index 78f218525..1b9036e6c 100644 --- a/compute/pzungqrrh.c +++ b/compute/pzungqrrh.c @@ -51,7 +51,7 @@ void chameleon_pzungqrrh( int genD, int BS, int ldaM, ldam, ldaMRD, lddM; int ldqM, ldqm, ldqMRD; int tempkn, tempMm, tempnn, tempmm, tempMRDm, tempkmin; - int ib; + int ib, node; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) @@ -68,7 +68,7 @@ void chameleon_pzungqrrh( int genD, int BS, /* * zunmqr = A->nb * ib * ztsmqr = A->nb * ib - * zttmqr = A->nb * ib + * ztpmqrt = A->nb * ib */ ws_worker = A->nb * ib; @@ -103,10 +103,9 @@ void chameleon_pzungqrrh( int genD, int BS, for (n = k; n < Q->nt; n++) { tempnn = n == Q->nt-1 ? Q->n-n*Q->nb : Q->nb; - RUNTIME_data_migrate( sequence, Q(M, n), - Q->get_rankof( Q, M+RD, n ) ); - RUNTIME_data_migrate( sequence, Q(M+RD, n), - Q->get_rankof( Q, M+RD, n ) ); + node = Q->get_rankof( Q, M+RD, n ); + RUNTIME_data_migrate( sequence, Q(M, n), node ); + RUNTIME_data_migrate( sequence, Q(M+RD, n), node ); /* TT kernel */ INSERT_TASK_ztpmqrt( @@ -137,10 +136,9 @@ void chameleon_pzungqrrh( int genD, int BS, for (n = k; n < Q->nt; n++) { tempnn = n == Q->nt-1 ? Q->n-n*Q->nb : Q->nb; - RUNTIME_data_migrate( sequence, Q(M, n), - Q->get_rankof( Q, m, n ) ); - RUNTIME_data_migrate( sequence, Q(m, n), - Q->get_rankof( Q, m, n ) ); + node = Q->get_rankof( Q, m, n ); + RUNTIME_data_migrate( sequence, Q(M, n), node ); + RUNTIME_data_migrate( sequence, Q(m, n), node ); /* TS kernel */ INSERT_TASK_ztpmqrt( diff --git a/compute/pzunmlq_param.c b/compute/pzunmlq_param.c index 82cbe0c04..545431e82 100644 --- a/compute/pzunmlq_param.c +++ b/compute/pzunmlq_param.c @@ -45,7 +45,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, int ldbm, ldak, ldbp, lddk; int tempnn, temppn, tempkmin, tempmm, tempkm; int ib, K, L; - int *tiles; + int node, nbtiles, *tiles; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) @@ -77,14 +77,14 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); #endif - /* Initialisation of tiles */ - tiles = (int*)calloc( qrtree->mt, sizeof(int) ); - ws_worker *= sizeof(CHAMELEON_Complex64_t); ws_host *= sizeof(CHAMELEON_Complex64_t); RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); + /* Initialisation of tiles */ + tiles = (int*)calloc( qrtree->mt, sizeof(int) ); + if (side == ChamLeft ) { if (trans == ChamNoTrans) { /* @@ -135,9 +135,9 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, } /* Setting the order of the tiles*/ - libhqr_walk_stepk(qrtree, k, tiles + (k+1)); + nbtiles = libhqr_walk_stepk( qrtree, k, tiles ); - for (i = k+1; i < A->nt; i++) { + for (i = 0; i < nbtiles; i++) { m = tiles[i]; p = qrtree->currpiv(qrtree, k, m); @@ -145,7 +145,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, ldbp = BLKLDD(B, p); ldbm = BLKLDD(B, m); - if(qrtree->gettype(qrtree, k, m) == 0){ + if( qrtree->gettype(qrtree, k, m) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ L = 0; T = TS; @@ -158,10 +158,9 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - RUNTIME_data_migrate( sequence, B(p, n), - B->get_rankof( B, m, n ) ); - RUNTIME_data_migrate( sequence, B(m, n), - B->get_rankof( B, m, n ) ); + node = B->get_rankof( B, m, n ); + RUNTIME_data_migrate( sequence, B(p, n), node ); + RUNTIME_data_migrate( sequence, B(m, n), node ); INSERT_TASK_ztpmlqt( &options, @@ -197,9 +196,9 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, lddk = BLKLDD(D, k); /* Setting the order of the tiles*/ - libhqr_walk_stepk(qrtree, k, tiles + (k+1)); + nbtiles = libhqr_walk_stepk( qrtree, k, tiles ); - for (i = A->nt-1; i > k; i--) { + for (i = nbtiles-1; i >= 0; i--) { m = tiles[i]; p = qrtree->currpiv(qrtree, k, m); @@ -207,7 +206,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, ldbp = BLKLDD(B, p); ldbm = BLKLDD(B, m); - if(qrtree->gettype(qrtree, k, m) == 0){ + if( qrtree->gettype(qrtree, k, m) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ L = 0; T = TS; @@ -220,10 +219,9 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - RUNTIME_data_migrate( sequence, B(p, n), - B->get_rankof( B, m, n ) ); - RUNTIME_data_migrate( sequence, B(m, n), - B->get_rankof( B, m, n ) ); + node = B->get_rankof( B, m, n ); + RUNTIME_data_migrate( sequence, B(p, n), node ); + RUNTIME_data_migrate( sequence, B(m, n), node ); INSERT_TASK_ztpmlqt( &options, @@ -296,15 +294,15 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, lddk = BLKLDD(D, k); /* Setting the order of the tiles*/ - libhqr_walk_stepk(qrtree, k, tiles + (k+1)); + nbtiles = libhqr_walk_stepk( qrtree, k, tiles ); - for (i = A->nt-1; i > k; i--) { + for (i = nbtiles-1; i >= 0; i--) { n = tiles[i]; p = qrtree->currpiv(qrtree, k, n); tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - if(qrtree->gettype(qrtree, k, n) == 0){ + if( qrtree->gettype(qrtree, k, n) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ L = 0; T = TS; @@ -318,10 +316,9 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; ldbm = BLKLDD(B, m); - RUNTIME_data_migrate( sequence, B(m, p), - B->get_rankof( B, m, n ) ); - RUNTIME_data_migrate( sequence, B(m, n), - B->get_rankof( B, m, n ) ); + node = B->get_rankof( B, m, n ); + RUNTIME_data_migrate( sequence, B(m, p), node ); + RUNTIME_data_migrate( sequence, B(m, n), node ); INSERT_TASK_ztpmlqt( &options, @@ -429,15 +426,15 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, } /* Setting the order of tiles */ - libhqr_walk_stepk(qrtree, k, tiles + (k+1)); + nbtiles = libhqr_walk_stepk( qrtree, k, tiles ); - for (i = k+1; i < A->nt; i++) { + for (i = 0; i < nbtiles; i++) { n = tiles[i]; p = qrtree->currpiv(qrtree, k, n); tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - if(qrtree->gettype(qrtree, k, n) == 0){ + if( qrtree->gettype(qrtree, k, n) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ L = 0; T = TS; @@ -452,10 +449,9 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; ldbm = BLKLDD(B, m); - RUNTIME_data_migrate( sequence, B(m, p), - B->get_rankof( B, m, n ) ); - RUNTIME_data_migrate( sequence, B(m, n), - B->get_rankof( B, m, n ) ); + node = B->get_rankof( B, m, n ); + RUNTIME_data_migrate( sequence, B(m, p), node ); + RUNTIME_data_migrate( sequence, B(m, n), node ); INSERT_TASK_ztpmlqt( &options, diff --git a/compute/pzunmlqrh.c b/compute/pzunmlqrh.c index 19f411ac4..3ad2c4eae 100644 --- a/compute/pzunmlqrh.c +++ b/compute/pzunmlqrh.c @@ -49,7 +49,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans int K, N, RD, lastRD; int ldak, lddk, ldbN, ldbm, ldbNRD; int tempNn, tempkm, tempnn, tempmm, tempNRDn, tempkmin; - int ib; + int ib, node; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) @@ -135,10 +135,9 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - RUNTIME_data_migrate( sequence, B(N, n), - B->get_rankof( B, m, n ) ); - RUNTIME_data_migrate( sequence, B(m, n), - B->get_rankof( B, m, n ) ); + node = B->get_rankof( B, m, n ); + RUNTIME_data_migrate( sequence, B(N, n), node ); + RUNTIME_data_migrate( sequence, B(m, n), node ); /* TS kernel */ INSERT_TASK_ztpmlqt( @@ -161,10 +160,9 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - RUNTIME_data_migrate( sequence, B(N, n), - B->get_rankof( B, N+RD, n ) ); - RUNTIME_data_migrate( sequence, B(N+RD, n), - B->get_rankof( B, N+RD, n ) ); + node = B->get_rankof( B, N+RD, n ); + RUNTIME_data_migrate( sequence, B(N, n), node ); + RUNTIME_data_migrate( sequence, B(N+RD, n), node ); /* TT kernel */ INSERT_TASK_ztpmlqt( @@ -210,10 +208,9 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - RUNTIME_data_migrate( sequence, B(N, n), - B->get_rankof( B, N+RD, n ) ); - RUNTIME_data_migrate( sequence, B(N+RD, n), - B->get_rankof( B, N+RD, n ) ); + node = B->get_rankof( B, N+RD, n ); + RUNTIME_data_migrate( sequence, B(N, n), node ); + RUNTIME_data_migrate( sequence, B(N+RD, n), node ); /* TT kernel */ INSERT_TASK_ztpmlqt( @@ -239,10 +236,9 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - RUNTIME_data_migrate( sequence, B(N, n), - B->get_rankof( B, m, n ) ); - RUNTIME_data_migrate( sequence, B(m, n), - B->get_rankof( B, m, n ) ); + node = B->get_rankof( B, m, n ); + RUNTIME_data_migrate( sequence, B(N, n), node ); + RUNTIME_data_migrate( sequence, B(m, n), node ); /* TS kernel */ INSERT_TASK_ztpmlqt( @@ -314,10 +310,9 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans ldbm = BLKLDD(B, m); tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - RUNTIME_data_migrate( sequence, B(m, N), - B->get_rankof( B, m, N+RD ) ); - RUNTIME_data_migrate( sequence, B(m, N+RD), - B->get_rankof( B, m, N+RD ) ); + node = B->get_rankof( B, m, N+RD ); + RUNTIME_data_migrate( sequence, B(m, N), node ); + RUNTIME_data_migrate( sequence, B(m, N+RD), node ); /* TT kernel */ INSERT_TASK_ztpmlqt( @@ -342,10 +337,9 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; ldbm = BLKLDD(B, m); - RUNTIME_data_migrate( sequence, B(m, N), - B->get_rankof( B, m, n ) ); - RUNTIME_data_migrate( sequence, B(m, m), - B->get_rankof( B, m, n ) ); + node = B->get_rankof( B, m, n ); + RUNTIME_data_migrate( sequence, B(m, N), node ); + RUNTIME_data_migrate( sequence, B(m, m), node ); /* TS kernel */ INSERT_TASK_ztpmlqt( @@ -444,10 +438,9 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; ldbm = BLKLDD(B, m); - RUNTIME_data_migrate( sequence, B(m, N), - B->get_rankof( B, m, n ) ); - RUNTIME_data_migrate( sequence, B(m, n), - B->get_rankof( B, m, n ) ); + node = B->get_rankof( B, m, n ); + RUNTIME_data_migrate( sequence, B(m, N), node ); + RUNTIME_data_migrate( sequence, B(m, n), node ); /* TS kernel */ INSERT_TASK_ztpmlqt( @@ -470,10 +463,9 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; ldbm = BLKLDD(B, m); - RUNTIME_data_migrate( sequence, B(m, N), - B->get_rankof( B, m, N+RD ) ); - RUNTIME_data_migrate( sequence, B(m, N+RD), - B->get_rankof( B, m, N+RD ) ); + node = B->get_rankof( B, m, N+RD ); + RUNTIME_data_migrate( sequence, B(m, N), node ); + RUNTIME_data_migrate( sequence, B(m, N+RD), node ); /* TT kernel */ INSERT_TASK_ztpmlqt( diff --git a/compute/pzunmqr_param.c b/compute/pzunmqr_param.c index 27e8cb192..cd23e8114 100644 --- a/compute/pzunmqr_param.c +++ b/compute/pzunmqr_param.c @@ -45,7 +45,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, int ldam, ldan, ldbm, ldbp, lddn, lddm; int tempnn, tempkmin, tempmm, tempkn; int ib, K, L; - int *tiles; + int node, nbtiles, *tiles; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) @@ -64,7 +64,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, /* * zunmqr = A->nb * ib * ztsmqr = A->nb * ib - * zttmqr = A->nb * ib + * ztpmqrt = A->nb * ib */ ws_worker = A->nb * ib; @@ -77,14 +77,14 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); #endif - /* Initialisation of tiles */ - tiles = (int*)calloc( qrtree->mt, sizeof(int) ); - ws_worker *= sizeof(CHAMELEON_Complex64_t); ws_host *= sizeof(CHAMELEON_Complex64_t); RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); + /* Initialisation of tiles */ + tiles = (int*)calloc( qrtree->mt, sizeof(int) ); + if (side == ChamLeft ) { if (trans == ChamConjTrans) { /* @@ -134,9 +134,9 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, } /* Setting the order of the tiles*/ - libhqr_walk_stepk(qrtree, k, tiles + (k+1)); + nbtiles = libhqr_walk_stepk( qrtree, k, tiles ); - for (i = k+1; i < A->mt; i++) { + for (i = 0; i < nbtiles; i++) { m = tiles[i]; p = qrtree->currpiv(qrtree, k, m); @@ -145,7 +145,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, ldbm = BLKLDD(B, m); ldbp = BLKLDD(B, p); - if(qrtree->gettype(qrtree, k, m) == 0){ + if( qrtree->gettype(qrtree, k, m) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ L = 0; T = TS; @@ -158,10 +158,9 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - RUNTIME_data_migrate( sequence, B(p, n), - B->get_rankof( B, m, n ) ); - RUNTIME_data_migrate( sequence, B(m, n), - B->get_rankof( B, m, n ) ); + node = B->get_rankof( B, m, n ); + RUNTIME_data_migrate( sequence, B(p, n), node ); + RUNTIME_data_migrate( sequence, B(m, n), node ); INSERT_TASK_ztpmqrt( &options, @@ -195,9 +194,9 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; /* Setting the order of the tiles*/ - libhqr_walk_stepk(qrtree, k, tiles + (k+1)); + nbtiles = libhqr_walk_stepk( qrtree, k, tiles ); - for (i = A->mt-1; i > k; i--) { + for (i = nbtiles-1; i >=0; i--) { m = tiles[i]; p = qrtree->currpiv(qrtree, k, m); @@ -206,7 +205,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, ldbm = BLKLDD(B, m); ldbp = BLKLDD(B, p); - if(qrtree->gettype(qrtree, k, m) == 0){ + if( qrtree->gettype(qrtree, k, m) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ L = 0; T = TS; @@ -219,10 +218,9 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, for (n = k; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - RUNTIME_data_migrate( sequence, B(p, n), - B->get_rankof( B, m, n ) ); - RUNTIME_data_migrate( sequence, B(m, n), - B->get_rankof( B, m, n ) ); + node = B->get_rankof( B, m, n ); + RUNTIME_data_migrate( sequence, B(p, n), node ); + RUNTIME_data_migrate( sequence, B(m, n), node ); INSERT_TASK_ztpmqrt( &options, @@ -295,16 +293,16 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, tempkn = k == A->nt-1 ? A->n - k*A->nb : A->nb; /* Setting the order of the tiles*/ - libhqr_walk_stepk(qrtree, k, tiles + (k+1)); + nbtiles = libhqr_walk_stepk( qrtree, k, tiles ); - for (i = A->nt-1; i > k; i--) { + for (i = nbtiles-1; i >= 0; i--) { n = tiles[i]; p = qrtree->currpiv(qrtree, k, n); tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; ldan = BLKLDD(A, n); - if( qrtree->gettype(qrtree, k, n) == 0 ) { + if( qrtree->gettype(qrtree, k, n) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ L = 0; T = TS; @@ -319,10 +317,9 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; ldbm = BLKLDD(B, m); - RUNTIME_data_migrate( sequence, B(m, p), - B->get_rankof( B, m, n ) ); - RUNTIME_data_migrate( sequence, B(m, n), - B->get_rankof( B, m, n ) ); + node = B->get_rankof( B, m, n ); + RUNTIME_data_migrate( sequence, B(m, p), node ); + RUNTIME_data_migrate( sequence, B(m, n), node ); INSERT_TASK_ztpmqrt( &options, @@ -429,16 +426,16 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, } /* Setting the order of tiles */ - libhqr_walk_stepk(qrtree, k, tiles + (k+1)); + nbtiles = libhqr_walk_stepk( qrtree, k, tiles ); - for (i = k+1; i < A->nt; i++) { + for (i = 0; i < nbtiles; i++) { n = tiles[i]; p = qrtree->currpiv(qrtree, k, n); tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; ldan = BLKLDD(A, n); - if( qrtree->gettype(qrtree, k, n) == 0 ) { + if( qrtree->gettype(qrtree, k, n) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ L = 0; T = TS; @@ -453,10 +450,9 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; ldbm = BLKLDD(B, m); - RUNTIME_data_migrate( sequence, B(m, p), - B->get_rankof( B, m, n ) ); - RUNTIME_data_migrate( sequence, B(m, n), - B->get_rankof( B, m, n ) ); + node = B->get_rankof( B, m, n ); + RUNTIME_data_migrate( sequence, B(m, p), node ); + RUNTIME_data_migrate( sequence, B(m, n), node ); INSERT_TASK_ztpmqrt( &options, diff --git a/compute/pzunmqrrh.c b/compute/pzunmqrrh.c index ab51157ba..efb0f13d5 100644 --- a/compute/pzunmqrrh.c +++ b/compute/pzunmqrrh.c @@ -50,7 +50,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans int ldaM, ldam, ldan, ldaMRD, lddM; int ldbM, ldbm, ldbMRD; int tempMm, tempkn, tempnn, tempmm, tempMRDm, tempkmin; - int ib; + int ib, node; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) @@ -67,7 +67,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans /* * zunmqr = A->nb * ib * ztsmqr = A->nb * ib - * zttmqr = A->nb * ib + * ztpmqrt = A->nb * ib */ ws_worker = A->nb * ib; @@ -135,10 +135,9 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - RUNTIME_data_migrate( sequence, B(M, n), - B->get_rankof( B, m, n ) ); - RUNTIME_data_migrate( sequence, B(m, n), - B->get_rankof( B, m, n ) ); + node = B->get_rankof( B, m, n ); + RUNTIME_data_migrate( sequence, B(M, n), node ); + RUNTIME_data_migrate( sequence, B(m, n), node ); /* TS kernel */ INSERT_TASK_ztpmqrt( @@ -162,10 +161,9 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - RUNTIME_data_migrate( sequence, B(M, n), - B->get_rankof( B, M+RD, n ) ); - RUNTIME_data_migrate( sequence, B(M+RD, n), - B->get_rankof( B, M+RD, n ) ); + node = B->get_rankof( B, M+RD, n ); + RUNTIME_data_migrate( sequence, B(M, n), node ); + RUNTIME_data_migrate( sequence, B(M+RD, n), node ); /* TT kernel */ INSERT_TASK_ztpmqrt( @@ -209,10 +207,9 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - RUNTIME_data_migrate( sequence, B(M, n), - B->get_rankof( B, M+RD, n ) ); - RUNTIME_data_migrate( sequence, B(M+RD, n), - B->get_rankof( B, M+RD, n ) ); + node = B->get_rankof( B, M+RD, n ); + RUNTIME_data_migrate( sequence, B(M, n), node ); + RUNTIME_data_migrate( sequence, B(M+RD, n), node ); /* TT kernel */ INSERT_TASK_ztpmqrt( @@ -240,10 +237,9 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - RUNTIME_data_migrate( sequence, B(M, n), - B->get_rankof( B, m, n ) ); - RUNTIME_data_migrate( sequence, B(m, n), - B->get_rankof( B, m, n ) ); + node = B->get_rankof( B, m, n ); + RUNTIME_data_migrate( sequence, B(M, n), node ); + RUNTIME_data_migrate( sequence, B(m, n), node ); /* TS kernel */ INSERT_TASK_ztpmqrt( @@ -311,10 +307,9 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans ldbm = BLKLDD(B, m); tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - RUNTIME_data_migrate( sequence, B(m, M), - B->get_rankof( B, m, M+RD ) ); - RUNTIME_data_migrate( sequence, B(m, M+RD), - B->get_rankof( B, m, M+RD ) ); + node = B->get_rankof( B, m, M+RD ); + RUNTIME_data_migrate( sequence, B(m, M), node ); + RUNTIME_data_migrate( sequence, B(m, M+RD), node ); /* TT kernel */ INSERT_TASK_ztpmqrt( @@ -341,10 +336,9 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans ldbm = BLKLDD(B, m); tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - RUNTIME_data_migrate( sequence, B(m, M), - B->get_rankof( B, m, n ) ); - RUNTIME_data_migrate( sequence, B(m, m), - B->get_rankof( B, m, n ) ); + node = B->get_rankof( B, m, n ); + RUNTIME_data_migrate( sequence, B(m, M), node ); + RUNTIME_data_migrate( sequence, B(m, m), node ); /* TS kernel */ INSERT_TASK_ztpmqrt( @@ -441,10 +435,9 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; ldbm = BLKLDD(B, m); - RUNTIME_data_migrate( sequence, B(m, M), - B->get_rankof( B, m, n ) ); - RUNTIME_data_migrate( sequence, B(m, n), - B->get_rankof( B, m, n ) ); + node = B->get_rankof( B, m, n ); + RUNTIME_data_migrate( sequence, B(m, M), node ); + RUNTIME_data_migrate( sequence, B(m, n), node ); /* TS kernel */ INSERT_TASK_ztpmqrt( @@ -467,10 +460,9 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; ldbm = BLKLDD(B, m); - RUNTIME_data_migrate( sequence, B(m, M), - B->get_rankof( B, m, M+RD ) ); - RUNTIME_data_migrate( sequence, B(m, M+RD), - B->get_rankof( B, m, M+RD ) ); + node = B->get_rankof( B, m, M+RD ); + RUNTIME_data_migrate( sequence, B(m, M), node ); + RUNTIME_data_migrate( sequence, B(m, M+RD), node ); /* TT kernel */ INSERT_TASK_ztpmqrt( diff --git a/compute/zgels_param.c b/compute/zgels_param.c index b6beda808..7b41e4cdf 100644 --- a/compute/zgels_param.c +++ b/compute/zgels_param.c @@ -396,7 +396,8 @@ int CHAMELEON_zgels_param_Tile_Async( const libhqr_tree_t *qrtree, cham_trans_t subB = chameleon_desc_submatrix(B, 0, 0, A->n, B->n); subA = chameleon_desc_submatrix(A, 0, 0, A->n, A->n); - chameleon_pzgeqrf_param( 1, qrtree, A, TS, TT, Dptr, sequence, request ); + chameleon_pzgeqrf_param( 1, A->nt, qrtree, A, + TS, TT, Dptr, sequence, request ); chameleon_pzunmqr_param( 0, qrtree, ChamLeft, ChamConjTrans, A, B, TS, TT, Dptr, sequence, request ); chameleon_pztrsm( ChamLeft, ChamUpper, ChamNoTrans, ChamNonUnit, 1.0, subA, subB, sequence, request ); } diff --git a/compute/zgeqrf_param.c b/compute/zgeqrf_param.c index 20a24cfe9..34171ce22 100644 --- a/compute/zgeqrf_param.c +++ b/compute/zgeqrf_param.c @@ -122,14 +122,14 @@ int CHAMELEON_zgeqrf_param( const libhqr_tree_t *qrtree, int M, int N, /* Submit the matrix conversion */ chameleon_zlap2tile( chamctxt, &descAl, &descAt, ChamDescInout, ChamUpperLower, - A, NB, NB, LDA, N, M, N, sequence, &request ); + A, NB, NB, LDA, N, M, N, sequence, &request ); /* Call the tile interface */ CHAMELEON_zgeqrf_param_Tile_Async( qrtree, &descAt, descTS, descTT, sequence, &request ); /* Submit the matrix conversion back */ chameleon_ztile2lap( chamctxt, &descAl, &descAt, - ChamDescInout, ChamUpperLower, sequence, &request ); + ChamDescInout, ChamUpperLower, sequence, &request ); CHAMELEON_Desc_Flush( descTS, sequence ); CHAMELEON_Desc_Flush( descTT, sequence ); @@ -248,6 +248,7 @@ int CHAMELEON_zgeqrf_param_Tile_Async( const libhqr_tree_t *qrtree, CHAM_desc_t { CHAM_context_t *chamctxt; CHAM_desc_t D, *Dptr = NULL; + int KT; chamctxt = chameleon_context_self(); if (chamctxt == NULL) { @@ -293,6 +294,14 @@ int CHAMELEON_zgeqrf_param_Tile_Async( const libhqr_tree_t *qrtree, CHAM_desc_t if (chameleon_min(M, N) == 0) return CHAMELEON_SUCCESS; */ + + if ( A->m < A->n ) { + KT = A->mt; + } + else { + KT = A->nt; + } + #if defined(CHAMELEON_COPY_DIAG) { int n = chameleon_min(A->m, A->n); @@ -301,7 +310,8 @@ int CHAMELEON_zgeqrf_param_Tile_Async( const libhqr_tree_t *qrtree, CHAM_desc_t } #endif - chameleon_pzgeqrf_param( 1, qrtree, A, TS, TT, Dptr, sequence, request ); + chameleon_pzgeqrf_param( 1, KT, qrtree, A, + TS, TT, Dptr, sequence, request ); if (Dptr != NULL) { CHAMELEON_Desc_Flush( A, sequence ); diff --git a/compute/ztpgqrt.c b/compute/ztpgqrt.c index f28a20191..0512caeda 100644 --- a/compute/ztpgqrt.c +++ b/compute/ztpgqrt.c @@ -340,6 +340,7 @@ int CHAMELEON_ztpgqrt_Tile_Async( int L, { CHAM_context_t *chamctxt; CHAM_desc_t D, *Dptr = NULL; + int KT; chamctxt = chameleon_context_self(); if (chamctxt == NULL) { @@ -396,15 +397,16 @@ int CHAMELEON_ztpgqrt_Tile_Async( int L, chameleon_error("CHAMELEON_ztpgqrt_Tile", "Triangular part must be aligned with tiles"); return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); } + + if (V1->m > V1->n) { + KT = V1->nt; + } else { + KT = V1->mt; + } + #if defined(CHAMELEON_COPY_DIAG) { - int minMT; - if (V1->m > V1->n) { - minMT = V1->nt; - } else { - minMT = V1->mt; - } - chameleon_zdesc_alloc_diag(D, V1->mb, V1->nb, minMT*V1->mb, V1->nb, 0, 0, minMT*V1->mb, V1->nb, V1->p, V1->q); + chameleon_zdesc_alloc(D, V1->mb, V1->nb, V1->m, KT*V1->nb, 0, 0, V1->m, KT*V1->nb, ); Dptr = &D; } #endif @@ -412,7 +414,8 @@ int CHAMELEON_ztpgqrt_Tile_Async( int L, /* if (chamctxt->householder == ChamFlatHouseholder) { */ chameleon_pzlaset( ChamUpperLower, 0., 1., Q1, sequence, request ); chameleon_pzlaset( ChamUpperLower, 0., 0., Q2, sequence, request ); - chameleon_pztpgqrt( 1, L, V1, T1, V2, T2, Q1, Q2, Dptr, sequence, request ); + chameleon_pztpgqrt( KT, L, V2, T2, Q1, Q2, sequence, request ); + chameleon_pzungqr( 1, V1, Q1, T1, Dptr, sequence, request ); if (Dptr != NULL) { CHAMELEON_Desc_Flush( V1, sequence ); diff --git a/compute/zungqr_param.c b/compute/zungqr_param.c index faf5f6db9..0f17560c3 100644 --- a/compute/zungqr_param.c +++ b/compute/zungqr_param.c @@ -75,11 +75,11 @@ * */ int CHAMELEON_zungqr_param( const libhqr_tree_t *qrtree, - int M, int N, int K, - CHAMELEON_Complex64_t *A, int LDA, - CHAM_desc_t *descTS, - CHAM_desc_t *descTT, - CHAMELEON_Complex64_t *Q, int LDQ ) + int M, int N, int K, + CHAMELEON_Complex64_t *A, int LDA, + CHAM_desc_t *descTS, + CHAM_desc_t *descTT, + CHAMELEON_Complex64_t *Q, int LDQ ) { int NB; int status; @@ -253,6 +253,8 @@ int CHAMELEON_zungqr_param_Tile_Async( const libhqr_tree_t *qrtree, CHAM_desc_t { CHAM_context_t *chamctxt; CHAM_desc_t D, *Dptr = NULL; + int KT; + chamctxt = chameleon_context_self(); if (chamctxt == NULL) { chameleon_fatal_error("CHAMELEON_zungqr_param_Tile", "CHAMELEON not initialized"); @@ -301,6 +303,13 @@ int CHAMELEON_zungqr_param_Tile_Async( const libhqr_tree_t *qrtree, CHAM_desc_t if (N <= 0) return CHAMELEON_SUCCESS; */ + if ( A->m < A->n ) { + KT = A->mt; + } + else { + KT = A->nt; + } + #if defined(CHAMELEON_COPY_DIAG) { int n = chameleon_min(A->m, A->n); @@ -310,7 +319,7 @@ int CHAMELEON_zungqr_param_Tile_Async( const libhqr_tree_t *qrtree, CHAM_desc_t #endif chameleon_pzlaset( ChamUpperLower, 0., 1., Q, sequence, request ); - chameleon_pzungqr_param( 1, qrtree, A, Q, TS, TT, Dptr, sequence, request ); + chameleon_pzungqr_param( 1, KT, qrtree, A, Q, TS, TT, Dptr, sequence, request ); if (Dptr != NULL) { CHAMELEON_Desc_Flush( A, sequence ); diff --git a/coreblas/compute/core_ztpmlqt.c b/coreblas/compute/core_ztpmlqt.c index 9eca91bd3..72e54b1ad 100644 --- a/coreblas/compute/core_ztpmlqt.c +++ b/coreblas/compute/core_ztpmlqt.c @@ -158,7 +158,7 @@ int CORE_ztpmlqt( cham_side_t side, cham_trans_t trans, else { m1 = M; n1 = K; - ldwork = chameleon_max( n1, N ); + ldwork = chameleon_max( K, chameleon_max( M, N ) ); } /* TS case */ diff --git a/coreblas/compute/core_ztpmqrt.c b/coreblas/compute/core_ztpmqrt.c index 3b5721fc7..5909f19ee 100644 --- a/coreblas/compute/core_ztpmqrt.c +++ b/coreblas/compute/core_ztpmqrt.c @@ -159,7 +159,7 @@ int CORE_ztpmqrt( cham_side_t side, cham_trans_t trans, else { m1 = M; n1 = K; - ldwork = m1; + ldwork = chameleon_max( K, chameleon_max( M, N ) ); } /* TS case */ -- GitLab