diff --git a/compute/pzunmlq.c b/compute/pzunmlq.c index 0aa1054dce5da9ffd09a83d7a36ecb0f25aea1bf..ac2cb0aa76116c1cf70f6cd3efaa0f9621ff5b9b 100644 --- a/compute/pzunmlq.c +++ b/compute/pzunmlq.c @@ -27,7 +27,7 @@ #include "control/common.h" #define A(m,n) A, m, n -#define B(m,n) B, m, n +#define C(m,n) C, m, n #define T(m,n) T, m, n #define D(k) D, k, k @@ -35,7 +35,7 @@ * Parallel application of Q using tile V - LQ factorization - dynamic scheduling */ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, - CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *T, CHAM_desc_t *D, + CHAM_desc_t *A, CHAM_desc_t *C, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_context_t *chamctxt; @@ -44,9 +44,9 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, size_t ws_host = 0; int k, m, n; - int ldak, ldbk, ldbm, lddk; - int tempmm, tempnn, tempkn, tempkm, tempkmin; - int ib, minMT, minM; + int ldak, ldck, ldcm, lddk; + int tempkm, tempkn, tempkmin, tempmm, tempnn; + int ib, KT, K; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) { @@ -57,11 +57,11 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, ib = CHAMELEON_IB; if (A->m > A->n) { - minM = A->n; - minMT = A->nt; + KT = A->nt; + K = A->n; } else { - minM = A->m; - minMT = A->mt; + KT = A->mt; + K = A->m; } if ( D == NULL ) { @@ -94,13 +94,14 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, /* * ChamLeft / ChamNoTrans */ - for (k = 0; k < minMT; k++) { + for (k = 0; k < KT; k++) { RUNTIME_iteration_push(chamctxt, k); - tempkm = k == B->mt-1 ? B->m-k*B->mb : B->mb; - tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb; + tempkm = k == C->mt - 1 ? C->m - k * C->mb : C->mb; + tempkmin = k == KT - 1 ? 
K - k * A->nb : A->nb; + ldak = BLKLDD(A, k); - ldbk = BLKLDD(B, k); + ldck = BLKLDD(C, k); lddk = BLKLDD(D, k); if ( genD ) { @@ -118,28 +119,28 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, D(k), lddk ); #endif } - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; INSERT_TASK_zunmlq( &options, side, trans, tempkm, tempnn, tempkmin, ib, T->nb, D(k), lddk, T(k, k), T->mb, - B(k, n), ldbk); + C(k, n), ldck); } RUNTIME_data_flush( sequence, D(k) ); RUNTIME_data_flush( sequence, T(k, k) ); - for (m = k+1; m < B->mt; m++) { - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + for (m = k+1; m < C->mt; m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - RUNTIME_data_migrate( sequence, B(k, n), - B->get_rankof( B, m, n ) ); + RUNTIME_data_migrate( sequence, C(k, n), + C->get_rankof( C, m, n ) ); /* TS kernel */ INSERT_TASK_ztpmlqt( @@ -148,8 +149,8 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, tempmm, tempnn, tempkmin, 0, ib, T->nb, A(k, m), ldak, T(k, m), T->mb, - B(k, n), ldbk, - B(m, n), ldbm); + C(k, n), ldck, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(k, m) ); @@ -157,9 +158,9 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, } /* Restore the original location of the tiles */ - for (n = 0; n < B->nt; n++) { - RUNTIME_data_migrate( sequence, B(k, n), - B->get_rankof( B, k, n ) ); + for (n = 0; n < C->nt; n++) { + RUNTIME_data_migrate( sequence, C(k, n), + C->get_rankof( C, k, n ) ); } RUNTIME_iteration_pop(chamctxt); @@ -169,23 +170,24 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, * ChamLeft / ChamConjTrans */ else { - for (k = minMT-1; k 
>= 0; k--) { + for (k = KT-1; k >= 0; k--) { RUNTIME_iteration_push(chamctxt, k); - tempkm = k == B->mt-1 ? B->m-k*B->mb : B->mb; - tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb; + tempkm = k == C->mt - 1 ? C->m - k * C->mb : C->mb; + tempkmin = k == KT - 1 ? K - k * A->nb : A->nb; + ldak = BLKLDD(A, k); - ldbk = BLKLDD(B, k); + ldck = BLKLDD(C, k); lddk = BLKLDD(D, k); - for (m = B->mt-1; m > k; m--) { - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + for (m = C->mt-1; m > k; m--) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - RUNTIME_data_migrate( sequence, B(k, n), - B->get_rankof( B, m, n ) ); + RUNTIME_data_migrate( sequence, C(k, n), + C->get_rankof( C, m, n ) ); /* TS kernel */ INSERT_TASK_ztpmlqt( @@ -194,13 +196,14 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, tempmm, tempnn, tempkmin, 0, ib, T->nb, A(k, m), ldak, T(k, m), T->mb, - B(k, n), ldbk, - B(m, n), ldbm); + C(k, n), ldck, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(k, m) ); RUNTIME_data_flush( sequence, T(k, m) ); } + if ( genD ) { int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( @@ -216,11 +219,11 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, D(k), lddk ); #endif } - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; - RUNTIME_data_migrate( sequence, B(k, n), - B->get_rankof( B, k, n ) ); + RUNTIME_data_migrate( sequence, C(k, n), + C->get_rankof( C, k, n ) ); INSERT_TASK_zunmlq( &options, @@ -228,7 +231,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, tempkm, tempnn, tempkmin, ib, T->nb, D(k), lddk, T(k, k), T->mb, - B(k, n), ldbk); + C(k, n), ldck); } RUNTIME_data_flush( sequence, D(k) ); RUNTIME_data_flush( sequence, T(k, k) ); @@ -241,22 +244,22 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, */ else { if (trans == ChamNoTrans) { - for (k = minMT-1; k >= 0; k--) { + for (k = KT-1; k >= 0; k--) { RUNTIME_iteration_push(chamctxt, k); - tempkn = k == B->nt - 1 ? B->n - k * B->nb : B->nb; - tempkmin = k == minMT - 1 ? minM - k * A->nb : A->nb; + tempkn = k == C->nt - 1 ? C->n - k * C->nb : C->nb; + tempkmin = k == KT - 1 ? K - k * A->nb : A->nb; ldak = BLKLDD(A, k); lddk = BLKLDD(D, k); - for (n = B->nt-1; n > k; n--) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - for (m = 0; m < B->mt; m++) { - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); + for (n = C->nt-1; n > k; n--) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); - RUNTIME_data_migrate( sequence, B(m, k), - B->get_rankof( B, m, n ) ); + RUNTIME_data_migrate( sequence, C(m, k), + C->get_rankof( C, m, n ) ); /* TS kernel */ INSERT_TASK_ztpmlqt( @@ -265,13 +268,14 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, tempmm, tempnn, tempkmin, 0, ib, T->nb, A(k, n), ldak, T(k, n), T->mb, - B(m, k), ldbm, - B(m, n), ldbm); + C(m, k), ldcm, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(k, n) ); RUNTIME_data_flush( sequence, T(k, n) ); } + if ( genD ) { int tempDkn = k == D->nt-1 ? 
D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( @@ -287,12 +291,12 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, D(k), lddk ); #endif } - for (m = 0; m < B->mt; m++) { - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); - RUNTIME_data_migrate( sequence, B(m, k), - B->get_rankof( B, m, k ) ); + RUNTIME_data_migrate( sequence, C(m, k), + C->get_rankof( C, m, k ) ); INSERT_TASK_zunmlq( &options, @@ -300,7 +304,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, tempmm, tempkn, tempkmin, ib, T->nb, D(k), lddk, T(k, k), T->mb, - B(m, k), ldbm); + C(m, k), ldcm); } RUNTIME_data_flush( sequence, D(k) ); @@ -313,16 +317,17 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, * ChamRight / ChamConjTrans */ else { - for (k = 0; k < minMT; k++) { + for (k = 0; k < KT; k++) { RUNTIME_iteration_push(chamctxt, k); - tempkn = k == B->nt-1 ? B->n-k*B->nb : B->nb; - tempkmin = k == minMT-1 ? minM-k*A->mb : A->mb; + tempkn = k == C->nt - 1 ? C->n - k * C->nb : C->nb; + tempkmin = k == KT - 1 ? K - k * A->nb : A->nb; ldak = BLKLDD(A, k); lddk = BLKLDD(D, k); if ( genD ) { int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; + INSERT_TASK_zlacpy( &options, ChamUpper, tempkmin, tempDkn, A->nb, @@ -336,29 +341,29 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, D(k), lddk ); #endif } - for (m = 0; m < B->mt; m++) { - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); INSERT_TASK_zunmlq( &options, side, trans, tempmm, tempkn, tempkmin, ib, T->nb, D(k), lddk, T(k, k), T->mb, - B(m, k), ldbm); + C(m, k), ldcm); } RUNTIME_data_flush( sequence, D(k) ); RUNTIME_data_flush( sequence, T(k, k) ); - for (n = k+1; n < B->nt; n++) { - tempnn = n == B->nt-1 ? 
B->n-n*B->nb : B->nb; - for (m = 0; m < B->mt; m++) { - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); + for (n = k+1; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); - RUNTIME_data_migrate( sequence, B(m, k), - B->get_rankof( B, m, n ) ); + RUNTIME_data_migrate( sequence, C(m, k), + C->get_rankof( C, m, n ) ); /* TS kernel */ INSERT_TASK_ztpmlqt( @@ -367,8 +372,8 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, tempmm, tempnn, tempkmin, 0, ib, T->nb, A(k, n), ldak, T(k, n), T->mb, - B(m, k), ldbm, - B(m, n), ldbm); + C(m, k), ldcm, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(k, n) ); @@ -376,9 +381,9 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, } /* Restore the original location of the tiles */ - for (m = 0; m < B->mt; m++) { - RUNTIME_data_migrate( sequence, B(m, k), - B->get_rankof( B, m, k ) ); + for (m = 0; m < C->mt; m++) { + RUNTIME_data_migrate( sequence, C(m, k), + C->get_rankof( C, m, k ) ); } RUNTIME_iteration_pop(chamctxt); diff --git a/compute/pzunmlq_param.c b/compute/pzunmlq_param.c index fe122e0b4798aa795b881a94173c82df250d9336..6e27ca3f610cbb139e0f37edc127e69097fcedab 100644 --- a/compute/pzunmlq_param.c +++ b/compute/pzunmlq_param.c @@ -22,7 +22,7 @@ #include <stdlib.h> #define A(m,n) A, m, n -#define B(m,n) B, m, n +#define C(m,n) C, m, n #define T(m,n) T, m, n #define D(m,n) D, m, n @@ -31,7 +31,7 @@ */ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, cham_side_t side, cham_trans_t trans, - CHAM_desc_t *A, CHAM_desc_t *B, + CHAM_desc_t *A, CHAM_desc_t *C, CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { @@ -42,9 +42,9 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, size_t ws_host = 0; int k, m, n, i, p; - int ldbm, ldak, ldbp, lddk; - int 
tempnn, temppn, tempkmin, tempmm, tempkm; - int ib, K, L; + int ldak, lddk, ldcp, ldcm; + int temppm, temppn, tempmm, tempnn, tempkm,tempkmin; + int ib, KT, L; int node, nbtiles, *tiles; chamctxt = chameleon_context_self(); @@ -55,8 +55,6 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, ib = CHAMELEON_IB; - K = chameleon_min(A->mt, A->nt); - if ( D == NULL ) { D = A; genD = 0; @@ -85,15 +83,16 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, /* Initialisation of tiles */ tiles = (int*)calloc( qrtree->mt, sizeof(int) ); + KT = chameleon_min( A->mt, A->nt ); if (side == ChamLeft ) { if (trans == ChamNoTrans) { /* * ChamLeft / ChamNoTrans */ - for (k = 0; k < K; k++) { + for (k = 0; k < KT; k++) { RUNTIME_iteration_push(chamctxt, k); - tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; + tempkm = k == A->mt - 1 ? A->m - k * A->mb : A->mb; ldak = BLKLDD(A, k); lddk = BLKLDD(D, k); @@ -101,12 +100,14 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, for (i = 0; i < qrtree->getnbgeqrf(qrtree, k); i++) { p = qrtree->getm(qrtree, k, i); - temppn = p == A->nt-1 ? A->n-p*A->nb : A->nb; - tempkmin = chameleon_min(tempkm, temppn); - ldbp = BLKLDD(B, p); + temppm = p == C->mt-1 ? C->m - p * C->mb : C->mb; + tempkmin = chameleon_min( temppm, tempkm ); + + ldcp = BLKLDD(C, p); if ( genD ) { int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb; + INSERT_TASK_zlacpy( &options, ChamUpper, tempkmin, tempDpn, A->nb, @@ -120,17 +121,15 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, D(k, p), lddk ); #endif } - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; INSERT_TASK_zunmlq( - &options, - side, trans, - temppn, tempnn, tempkmin, ib, T->nb, + &options, side, trans, + temppm, tempnn, tempkmin, ib, T->nb, D(k, p), lddk, T(k, p), T->mb, - B(p, n), ldbp); + C(p, n), ldcp); } - RUNTIME_data_flush( sequence, D(k, p) ); RUNTIME_data_flush( sequence, T(k, p) ); } @@ -142,9 +141,9 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, m = tiles[i]; p = qrtree->currpiv(qrtree, k, m); - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbp = BLKLDD(B, p); - ldbm = BLKLDD(B, m); + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); + ldcp = BLKLDD(C, p); if( qrtree->gettype(qrtree, k, m) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ @@ -156,30 +155,29 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, L = A->nb; T = TT; } - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - node = B->get_rankof( B, m, n ); - RUNTIME_data_migrate( sequence, B(p, n), node ); - RUNTIME_data_migrate( sequence, B(m, n), node ); + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(p, n), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); INSERT_TASK_ztpmlqt( - &options, - side, trans, + &options, side, trans, tempmm, tempnn, tempkm, chameleon_min( L, tempnn ), ib, T->nb, A(k, m), ldak, T(k, m), T->mb, - B(p, n), ldbp, - B(m, n), ldbm); + C(p, n), ldcp, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(k, m) ); RUNTIME_data_flush( sequence, T(k, m) ); } /* Restore the original location of the tiles */ - for (n = 0; n < B->nt; n++) { - RUNTIME_data_migrate( sequence, B(k, n), - B->get_rankof( B, k, n ) ); + for (n = 0; n < C->nt; n++) { + RUNTIME_data_migrate( sequence, C(k, n), + C->get_rankof( C, k, n ) ); } RUNTIME_iteration_pop(chamctxt); @@ -189,7 +187,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, * ChamLeft / 
ChamConjTrans */ else { - for (k = K-1; k >= 0; k--) { + for (k = KT-1; k >= 0; k--) { RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; @@ -203,9 +201,9 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, m = tiles[i]; p = qrtree->currpiv(qrtree, k, m); - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbp = BLKLDD(B, p); - ldbm = BLKLDD(B, m); + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcp = BLKLDD(C, p); + ldcm = BLKLDD(C, m); if( qrtree->gettype(qrtree, k, m) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ @@ -217,21 +215,20 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, L = A->nb; T = TT; } - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - node = B->get_rankof( B, m, n ); - RUNTIME_data_migrate( sequence, B(p, n), node ); - RUNTIME_data_migrate( sequence, B(m, n), node ); + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(p, n), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); INSERT_TASK_ztpmlqt( - &options, - side, trans, + &options, side, trans, tempmm, tempnn, tempkm, chameleon_min(L, tempnn), ib, T->nb, A(k, m), ldak, T(k, m), T->mb, - B(p, n), ldbp, - B(m, n), ldbm); + C(p, n), ldcp, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(k, m) ); RUNTIME_data_flush( sequence, T(k, m) ); @@ -241,12 +238,14 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, for (i = 0; i < qrtree->getnbgeqrf(qrtree, k); i++) { p = qrtree->getm(qrtree, k, i); - temppn = p == A->nt-1 ? A->n-p*A->nb : A->nb; - tempkmin = chameleon_min(tempkm, temppn); - ldbp = BLKLDD(B, p); + temppm = p == C->mt-1 ? C->m-p*C->mb : C->mb; + tempkmin = chameleon_min( temppm, tempkm ); + + ldcp = BLKLDD(C, p); if ( genD ) { int tempDpn = p == D->nt-1 ? 
D->n-p*D->nb : D->nb; + INSERT_TASK_zlacpy( &options, ChamUpper, tempkmin, tempDpn, A->nb, @@ -260,25 +259,24 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, D(k, p), lddk ); #endif } - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - RUNTIME_data_migrate( sequence, B(p, n), - B->get_rankof( B, p, n ) ); + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + + RUNTIME_data_migrate( sequence, C(p, n), + C->get_rankof( C, p, n ) ); INSERT_TASK_zunmlq( - &options, - side, trans, - temppn, tempnn, tempkmin, ib, T->nb, + &options, side, trans, + temppm, tempnn, tempkmin, ib, T->nb, D(k, p), lddk, T(k, p), T->mb, - B(p, n), ldbp); + C(p, n), ldcp); } RUNTIME_data_flush( sequence, D(k, p) ); RUNTIME_data_flush( sequence, T(k, p) ); } - RUNTIME_iteration_pop(chamctxt); } } @@ -288,7 +286,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, */ else { if (trans == ChamNoTrans) { - for (k = K-1; k >= 0; k--) { + for (k = KT-1; k >= 0; k--) { RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; @@ -302,7 +300,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, n = tiles[i]; p = qrtree->currpiv(qrtree, k, n); - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; if( qrtree->gettype(qrtree, k, n) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ @@ -314,22 +312,22 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, L = tempnn; T = TT; } - for (m = 0; m < B->mt; m++) { - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); - node = B->get_rankof( B, m, n ); - RUNTIME_data_migrate( sequence, B(m, p), node ); - RUNTIME_data_migrate( sequence, B(m, n), node ); + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); + + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(m, p), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); INSERT_TASK_ztpmlqt( - &options, - side, trans, + &options, side, trans, tempmm, tempnn, tempkm, L, ib, T->nb, A(k, n), ldak, T(k, n), T->mb, - B(m, p), ldbm, - B(m, n), ldbm); + C(m, p), ldcm, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(k, n) ); RUNTIME_data_flush( sequence, T(k, n) ); @@ -339,11 +337,12 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, for (i = 0; i < qrtree->getnbgeqrf(qrtree, k); i++) { p = qrtree->getm(qrtree, k, i); - temppn = p == A->nt-1 ? A->n-p*A->nb : A->nb; - tempkmin = chameleon_min(tempkm, temppn); + temppn = p == C->nt-1 ? C->n - p * C->nb : C->nb; + tempkmin = chameleon_min( temppn, tempkm ); if ( genD ) { int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb; + INSERT_TASK_zlacpy( &options, ChamUpper, tempkmin, tempDpn, A->nb, @@ -357,26 +356,24 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, D(k, p), lddk ); #endif } - for (m = 0; m < B->mt; m++) { - ldbm = BLKLDD(B, m); - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - RUNTIME_data_migrate( sequence, B(m, p), - B->get_rankof( B, m, p ) ); + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); + + RUNTIME_data_migrate( sequence, C(m, p), + C->get_rankof( C, m, p ) ); INSERT_TASK_zunmlq( - &options, - side, trans, + &options, side, trans, tempmm, temppn, tempkmin, ib, T->nb, D(k, p), lddk, T(k, p), T->mb, - B(m, p), ldbm); + C(m, p), ldcm); } - RUNTIME_data_flush( sequence, D(k, p) ); RUNTIME_data_flush( sequence, T(k, p) ); } - RUNTIME_iteration_pop(chamctxt); } } @@ -384,7 +381,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, * ChamRight / ChamConjTrans */ else { - for (k = 0; k < K; k++) { + for (k = 0; k < KT; k++) { RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->mt-1 ? 
A->m-k*A->mb : A->mb; @@ -395,11 +392,12 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, for (i = 0; i < qrtree->getnbgeqrf(qrtree, k); i++) { p = qrtree->getm(qrtree, k, i); - temppn = p == A->nt-1 ? A->n-p*A->nb : A->nb; - tempkmin = chameleon_min(tempkm, temppn); + temppn = p == C->nt - 1 ? C->n - p * C->nb : C->nb; + tempkmin = chameleon_min( temppn, tempkm ); if ( genD ) { int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb; + INSERT_TASK_zlacpy( &options, ChamUpper, tempkmin, tempDpn, A->nb, @@ -413,18 +411,17 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, D(k, p), lddk ); #endif } - for (m = 0; m < B->mt; m++) { - ldbm = BLKLDD(B, m); - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; + + for (m = 0; m < C->mt; m++) { + ldcm = BLKLDD(C, m); + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; INSERT_TASK_zunmlq( - &options, - side, trans, + &options, side, trans, tempmm, temppn, tempkmin, ib, T->nb, D(k, p), lddk, T(k, p), TS->mb, - B(m, p), ldbm); + C(m, p), ldcm); } - RUNTIME_data_flush( sequence, D(k, p) ); RUNTIME_data_flush( sequence, T(k, p) ); } @@ -436,7 +433,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, n = tiles[i]; p = qrtree->currpiv(qrtree, k, n); - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; if( qrtree->gettype(qrtree, k, n) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ @@ -449,31 +446,30 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, T = TT; } - for (m = 0; m < B->mt; m++) { - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); - node = B->get_rankof( B, m, n ); - RUNTIME_data_migrate( sequence, B(m, p), node ); - RUNTIME_data_migrate( sequence, B(m, n), node ); + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(m, p), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); INSERT_TASK_ztpmlqt( - &options, - side, trans, + &options, side, trans, tempmm, tempnn, tempkm, L, ib, T->nb, A(k, n), ldak, T(k, n), T->mb, - B(m, p), ldbm, - B(m, n), ldbm); + C(m, p), ldcm, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(k, n) ); RUNTIME_data_flush( sequence, T(k, n) ); } /* Restore the original location of the tiles */ - for (m = 0; m < B->mt; m++) { - RUNTIME_data_migrate( sequence, B( m, k ), - B->get_rankof( B, m, k ) ); + for (m = 0; m < C->mt; m++) { + RUNTIME_data_migrate( sequence, C( m, k ), + C->get_rankof( C, m, k ) ); } RUNTIME_iteration_pop(chamctxt); diff --git a/compute/pzunmlqrh.c b/compute/pzunmlqrh.c index c3b7d0c1d0f36636d42c1d2fa67296c283df3181..8f2931ccdba43f7f7d89f8f5f1b962a28567c4bf 100644 --- a/compute/pzunmlqrh.c +++ b/compute/pzunmlqrh.c @@ -27,7 +27,7 @@ #include "control/common.h" #define A(m,n) A, (m), (n) -#define B(m,n) B, (m), (n) +#define C(m,n) C, (m), (n) #define T(m,n) T, (m), (n) #define T2(m,n) T, (m), ((n)+A->nt) #define D(m,n) D, (m), (n) @@ -37,7 +37,7 @@ * Householder) - dynamic scheduling */ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans, - CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *T, CHAM_desc_t *D, + CHAM_desc_t *A, CHAM_desc_t *C, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_context_t *chamctxt; @@ -45,10 +45,11 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans size_t ws_worker = 0; size_t ws_host = 0; - int k, m, n; - int K, N, RD, lastRD; - int ldak, lddk, ldbN, ldbm, ldbNRD; - int tempNn, tempkm, tempnn, tempmm, tempNRDn, tempkmin; + int k, m, n, p; + int 
KT, RD, lastRD; + int ldak, lddk; + int ldcp, ldcm; + int temppm, temppn, tempkm, tempnn, tempmm, tempkmin; int ib, node; chamctxt = chameleon_context_self(); @@ -84,62 +85,66 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); - K = chameleon_min(A->mt, A->nt); + KT = chameleon_min( A->mt, A->nt ); if (side == ChamLeft ) { if (trans == ChamNoTrans) { /* * ChamLeft / ChamNoTrans */ - for (k = 0; k < K; k++) { + for (k = 0; k < KT; k++) { RUNTIME_iteration_push(chamctxt, k); - tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; + tempkm = k == A->mt - 1 ? A->m - k * A->mb : A->mb; + ldak = BLKLDD(A, k); lddk = BLKLDD(D, k); - for (N = k; N < A->nt; N += BS) { - tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb; - tempkmin = chameleon_min(tempkm,tempNn); - ldbN = BLKLDD(B, N); + for (p = k; p < C->mt; p += BS) { + + temppm = p == C->mt-1 ? C->m - p * C->mb : C->mb; + tempkmin = chameleon_min( temppm, tempkm ); + + ldcp = BLKLDD(C, p); + if ( genD ) { - int tempDNn = N == D->nt-1 ? D->n-N*D->nb : D->nb; + int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDNn, A->nb, - A(k, N), ldak, - D(k, N), lddk ); + ChamUpper, tempkmin, tempDpn, A->nb, + A(k, p), ldak, + D(k, p), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkmin, tempDNn, + ChamLower, tempkmin, tempDpn, 0., 1., - D(k, N), lddk ); + D(k, p), lddk ); #endif } - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; INSERT_TASK_zunmlq( &options, side, trans, - tempNn, tempnn, - tempkmin, ib, T->nb, - D(k, N), lddk, - T(k, N), T->mb, - B(N, n), ldbN); + temppm, tempnn, tempkmin, ib, T->nb, + D(k, p), lddk, + T(k, p), T->mb, + C(p, n), ldcp); } - RUNTIME_data_flush( sequence, D(k, N) ); - RUNTIME_data_flush( sequence, T(k, N) ); + RUNTIME_data_flush( sequence, D(k, p) ); + RUNTIME_data_flush( sequence, T(k, p) ); - for (m = N+1; m < chameleon_min(N+BS, A->nt); m++) { - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + for (m = p+1; m < chameleon_min(p+BS, C->mt); m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); - node = B->get_rankof( B, m, n ); - RUNTIME_data_migrate( sequence, B(N, n), node ); - RUNTIME_data_migrate( sequence, B(m, n), node ); + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(p, n), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); /* TS kernel */ INSERT_TASK_ztpmlqt( @@ -147,350 +152,369 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans tempmm, tempnn, tempkm, 0, ib, T->nb, A(k, m), ldak, T(k, m), T->mb, - B(N, n), ldbN, - B(m, n), ldbm); + C(p, n), ldcp, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(k, m) ); RUNTIME_data_flush( sequence, T(k, m) ); } } - for (RD = BS; RD < A->nt-k; RD *= 2) { - for (N = k; N+RD < A->nt; N += 2*RD) { - tempNRDn = N+RD == A->nt-1 ? A->n-(N+RD)*A->nb : A->nb; - ldbN = BLKLDD(B, N ); - ldbNRD = BLKLDD(B, N+RD); - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? 
B->n-n*B->nb : B->nb; - - node = B->get_rankof( B, N+RD, n ); - RUNTIME_data_migrate( sequence, B(N, n), node ); - RUNTIME_data_migrate( sequence, B(N+RD, n), node ); + for (RD = BS; RD < C->mt-k; RD *= 2) { + for (p = k; p+RD < C->mt; p += 2*RD) { + m = p+RD; + + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); + ldcp = BLKLDD(C, p); + + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(p, n), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); /* TT kernel */ INSERT_TASK_ztpmlqt( &options, side, trans, - tempNRDn, tempnn, tempkm, tempnn, ib, T->nb, - A (k, N+RD), ldak, - T2(k, N+RD), T->mb, - B (N, n), ldbN, - B (N+RD, n), ldbNRD); + tempmm, tempnn, tempkm, tempnn, ib, T->nb, + A (k, m), ldak, + T2(k, m), T->mb, + C (p, n), ldcp, + C (m, n), ldcm); } - RUNTIME_data_flush( sequence, A (k, N+RD) ); - RUNTIME_data_flush( sequence, T2(k, N+RD) ); + RUNTIME_data_flush( sequence, A (k, m) ); + RUNTIME_data_flush( sequence, T2(k, m) ); } } /* Restore the original location of the tiles */ - for (n = 0; n < B->nt; n++) { - RUNTIME_data_migrate( sequence, B(k, n), - B->get_rankof( B, k, n ) ); + for (n = 0; n < C->nt; n++) { + RUNTIME_data_migrate( sequence, C(k, n), + C->get_rankof( C, k, n ) ); } RUNTIME_iteration_pop(chamctxt); } - } else { - /* - * ChamLeft / ChamConjTrans - */ - for (k = K-1; k >= 0; k--) { + } + /* + * ChamLeft / ChamConjTrans + */ + else { + for (k = KT-1; k >= 0; k--) { RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; ldak = BLKLDD(A, k); lddk = BLKLDD(D, k); + lastRD = 0; - for (RD = BS; RD < A->nt-k; RD *= 2) + for (RD = BS; RD < C->mt-k; RD *= 2) lastRD = RD; for (RD = lastRD; RD >= BS; RD /= 2) { - for (N = k; N+RD < A->nt; N += 2*RD) { - tempNRDn = N+RD == A->nt-1 ? 
A->n-(N+RD)*A->nb : A->nb; - ldbN = BLKLDD(B, N ); - ldbNRD = BLKLDD(B, N+RD); - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + for (p = k; p+RD < C->mt; p += 2*RD) { + m = p+RD; - node = B->get_rankof( B, N+RD, n ); - RUNTIME_data_migrate( sequence, B(N, n), node ); - RUNTIME_data_migrate( sequence, B(N+RD, n), node ); + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); + ldcp = BLKLDD(C, p); + + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(p, n), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); /* TT kernel */ INSERT_TASK_ztpmlqt( - &options, - side, trans, - tempNRDn, tempnn, tempkm, tempnn, ib, T->nb, - A (k, N+RD), ldak, - T2(k, N+RD), T->mb, - B (N, n), ldbN, - B (N+RD, n), ldbNRD); + &options, side, trans, + tempmm, tempnn, tempkm, tempnn, ib, T->nb, + A (k, m), ldak, + T2(k, m), T->mb, + C (p, n), ldcp, + C (m, n), ldcm); } - RUNTIME_data_flush( sequence, A (k, N+RD) ); - RUNTIME_data_flush( sequence, T2(k, N+RD) ); + RUNTIME_data_flush( sequence, A (k, m) ); + RUNTIME_data_flush( sequence, T2(k, m) ); } } - for (N = k; N < A->nt; N += BS) { - tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb; - tempkmin = chameleon_min(tempkm,tempNn); - ldbN = BLKLDD(B, N); - for (m = chameleon_min(N+BS, A->nt)-1; m > N; m--) { - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - - node = B->get_rankof( B, m, n ); - RUNTIME_data_migrate( sequence, B(N, n), node ); - RUNTIME_data_migrate( sequence, B(m, n), node ); + for (p = k; p < C->mt; p += BS) { + ldcp = BLKLDD(C, p); + + for (m = chameleon_min(p+BS, C->mt)-1; m > p; m--) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); + + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; + + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(p, n), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); /* TS kernel */ INSERT_TASK_ztpmlqt( - &options, - side, trans, + &options, side, trans, tempmm, tempnn, tempkm, 0, ib, T->nb, A(k, m), ldak, T(k, m), T->mb, - B(N, n), ldbN, - B(m, n), ldbm); + C(p, n), ldcp, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(k, m) ); RUNTIME_data_flush( sequence, T(k, m) ); } + + temppm = p == C->mt-1 ? C->m-p*C->mb : C->mb; + tempkmin = chameleon_min( temppm, tempkm ); + if ( genD ) { - int tempDNn = N == D->nt-1 ? D->n-N*D->nb : D->nb; + int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb; + INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDNn, A->nb, - A(k, N), ldak, - D(k, N), lddk ); + ChamUpper, tempkmin, tempDpn, A->nb, + A(k, p), ldak, + D(k, p), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkmin, tempDNn, + ChamLower, tempkmin, tempDpn, 0., 1., - D(k, N), lddk ); + D(k, p), lddk ); #endif } - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - RUNTIME_data_migrate( sequence, B(N, n), - B->get_rankof( B, N, n ) ); + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; + + RUNTIME_data_migrate( sequence, C(p, n), + C->get_rankof( C, p, n ) ); INSERT_TASK_zunmlq( - &options, - side, trans, - tempNn, tempnn, - tempkmin, ib, T->nb, - D(k, N), lddk, - T(k, N), T->mb, - B(N, n), ldbN); + &options, side, trans, + temppm, tempnn, tempkmin, ib, T->nb, + D(k, p), lddk, + T(k, p), T->mb, + C(p, n), ldcp); } - RUNTIME_data_flush( sequence, D(k, N) ); - RUNTIME_data_flush( sequence, T(k, N) ); + RUNTIME_data_flush( sequence, D(k, p) ); + RUNTIME_data_flush( sequence, T(k, p) ); } RUNTIME_iteration_pop(chamctxt); } } } + /* + * ChamRight / ChamNoTrans + */ else { if (trans == ChamNoTrans) { - /* - * ChamRight / ChamNoTrans - */ - for (k = K-1; k >= 0; k--) { + for (k = KT-1; k >= 0; k--) { RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; ldak = BLKLDD(A, k); lddk = BLKLDD(D, k); lastRD = 0; - for (RD = BS; RD < A->nt-k; RD *= 2) + for (RD = BS; RD < C->nt-k; RD *= 2) lastRD = RD; for (RD = lastRD; RD >= BS; RD /= 2) { - for (N = k; N+RD < A->nt; N += 2*RD) { - tempNRDn = N+RD == A->nt-1 ? A->n-(N+RD)*A->nb : A->nb; - for (m = 0; m < B->mt; m++) { - ldbm = BLKLDD(B, m); - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; + for (p = k; p+RD < C->nt; p += 2*RD) { + n = p+RD; + + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); - node = B->get_rankof( B, m, N+RD ); - RUNTIME_data_migrate( sequence, B(m, N), node ); - RUNTIME_data_migrate( sequence, B(m, N+RD), node ); + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(m, p), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); /* TT kernel */ INSERT_TASK_ztpmlqt( - &options, - side, trans, - tempmm, tempNRDn, tempkm, tempNRDn, ib, T->nb, - A (k, N+RD), ldak, - T2(k, N+RD), T->mb, - B (m, N ), ldbm, - B (m, N+RD), ldbm); + &options, side, trans, + tempmm, tempnn, tempkm, tempnn, ib, T->nb, + A (k, n), ldak, + T2(k, n), T->mb, + C (m, p), ldcm, + C (m, n), ldcm); } - RUNTIME_data_flush( sequence, A (k, N+RD) ); - RUNTIME_data_flush( sequence, T2(k, N+RD) ); + RUNTIME_data_flush( sequence, A (k, n) ); + RUNTIME_data_flush( sequence, T2(k, n) ); } } - for (N = k; N < A->nt; N += BS) { - tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb; - tempkmin = chameleon_min(tempkm,tempNn); - for (n = chameleon_min(N+BS, A->nt)-1; n > N; n--) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - for (m = 0; m < B->mt; m++) { - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); - - node = B->get_rankof( B, m, n ); - RUNTIME_data_migrate( sequence, B(m, N), node ); - RUNTIME_data_migrate( sequence, B(m, n), node ); + for (p = k; p < C->nt; p += BS) { + + for (n = chameleon_min(p+BS, C->nt)-1; n > p; n--) { + + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); + + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(m, p), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); /* TS kernel */ INSERT_TASK_ztpmlqt( - &options, - side, trans, + &options, side, trans, tempmm, tempnn, tempkm, 0, ib, T->nb, A(k, n), ldak, T(k, n), T->mb, - B(m, N), ldbm, - B(m, n), ldbm); + C(m, p), ldcm, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(k, n) ); RUNTIME_data_flush( sequence, T(k, n) ); } + + temppn = p == C->nt-1 ? C->n - p * C->nb : C->nb; + tempkmin = chameleon_min( temppn, tempkm ); + if ( genD ) { - int tempDNn = N == D->nt-1 ? D->n-N*D->nb : D->nb; + int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb; + INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDNn, A->nb, - A(k, N), ldak, - D(k, N), lddk ); + ChamUpper, tempkmin, tempDpn, A->nb, + A(k, p), ldak, + D(k, p), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkmin, tempDNn, + ChamLower, tempkmin, tempDpn, 0., 1., - D(k, N), lddk ); + D(k, p), lddk ); #endif } - for (m = 0; m < B->mt; m++) { - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); - RUNTIME_data_migrate( sequence, B(m, N), - B->get_rankof( B, m, N ) ); + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); + + RUNTIME_data_migrate( sequence, C(m, p), + C->get_rankof( C, m, p ) ); INSERT_TASK_zunmlq( - &options, - side, trans, - tempmm, tempNn, - tempkmin, ib, T->nb, - D(k, N), lddk, - T(k, N), T->mb, - B(m, N), ldbm); + &options, side, trans, + tempmm, temppn, tempkmin, ib, T->nb, + D(k, p), lddk, + T(k, p), T->mb, + C(m, p), ldcm); } - RUNTIME_data_flush( sequence, D(k, N) ); - RUNTIME_data_flush( sequence, T(k, N) ); + RUNTIME_data_flush( sequence, D(k, p) ); + RUNTIME_data_flush( sequence, T(k, p) ); } - RUNTIME_iteration_pop(chamctxt); } - } else { - /* - * ChamRight / ChamConjTrans - */ - for (k = 0; k < K; k++) { + } + /* + * ChamRight / ChamConjTrans + */ + else { + for (k = 0; k < KT; k++) { RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; ldak = BLKLDD(A, k); lddk = BLKLDD(D, k); - for (N = k; N < A->nt; N += BS) { - tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb; - tempkmin = chameleon_min(tempkm,tempNn); + + for (p = k; p < C->nt; p += BS) { + temppn = p == C->nt - 1 ? C->n - p * C->nb : C->nb; + tempkmin = chameleon_min( temppn, tempkm ); + if ( genD ) { - int tempDNn = N == D->nt-1 ? D->n-N*D->nb : D->nb; + int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb; + INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDNn, A->nb, - A(k, N), ldak, - D(k, N), lddk ); + ChamUpper, tempkmin, tempDpn, A->nb, + A(k, p), ldak, + D(k, p), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkmin, tempDNn, + ChamLower, tempkmin, tempDpn, 0., 1., - D(k, N), lddk ); + D(k, p), lddk ); #endif } - for (m = 0; m < B->mt; m++) { - ldbm = BLKLDD(B, m); - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; + + for (m = 0; m < C->mt; m++) { + ldcm = BLKLDD(C, m); + tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; INSERT_TASK_zunmlq( - &options, - side, trans, - tempmm, tempNn, - tempkmin, ib, T->nb, - D(k, N), lddk, - T(k, N), T->mb, - B(m, N), ldbm); + &options, side, trans, + tempmm, temppn, tempkmin, ib, T->nb, + D(k, p), lddk, + T(k, p), T->mb, + C(m, p), ldcm); } - RUNTIME_data_flush( sequence, D(k, N) ); - RUNTIME_data_flush( sequence, T(k, N) ); + RUNTIME_data_flush( sequence, D(k, p) ); + RUNTIME_data_flush( sequence, T(k, p) ); - for (n = N+1; n < chameleon_min(N+BS, A->nt); n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - for (m = 0; m < B->mt; m++) { - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); + for (n = p+1; n < chameleon_min(p+BS, C->nt); n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); - node = B->get_rankof( B, m, n ); - RUNTIME_data_migrate( sequence, B(m, N), node ); - RUNTIME_data_migrate( sequence, B(m, n), node ); + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(m, p), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); /* TS kernel */ INSERT_TASK_ztpmlqt( - &options, - side, trans, + &options, side, trans, tempmm, tempnn, tempkm, 0, ib, T->nb, A(k, n), ldak, T(k, n), T->mb, - B(m, N), ldbm, - B(m, n), ldbm); + C(m, p), ldcm, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(k, n) ); RUNTIME_data_flush( sequence, T(k, n) ); } } - for (RD = BS; RD < A->nt-k; RD *= 2) { - for (N = k; N+RD < A->nt; N += 2*RD) { - tempNRDn = N+RD == A->nt-1 ? A->n-(N+RD)*A->nb : A->nb; - for (m = 0; m < B->mt; m++) { - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); + for (RD = BS; RD < C->nt-k; RD *= 2) { + for (p = k; p+RD < C->nt; p += 2*RD) { + n = p + RD; + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; /* n is a column-tile index of C: size it with nt/n/nb */ + + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); - node = B->get_rankof( B, m, N+RD ); - RUNTIME_data_migrate( sequence, B(m, N), node ); - RUNTIME_data_migrate( sequence, B(m, N+RD), node ); + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(m, p), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); /* TT kernel */ INSERT_TASK_ztpmlqt( &options, side, trans, - tempmm, tempNRDn, tempkm, tempNRDn, ib, T->nb, - A (k, N+RD), ldak, - T2(k, N+RD), T->mb, - B (m, N ), ldbm, - B (m, N+RD), ldbm); + tempmm, tempnn, tempkm, tempnn, ib, T->nb, + A (k, n), ldak, + T2(k, n), T->mb, + C (m, p), ldcm, + C (m, n), ldcm); } - RUNTIME_data_flush( sequence, A (k, N+RD) ); - RUNTIME_data_flush( sequence, T2(k, N+RD) ); + RUNTIME_data_flush( sequence, A (k, n) ); + RUNTIME_data_flush( sequence, T2(k, n) ); } } /* Restore the original location of the tiles */ - for (m = 0; m < B->mt; m++) { - RUNTIME_data_migrate( sequence, B(m, k), - B->get_rankof( B, m, k ) ); + for (m = 0; m < C->mt; m++) { + RUNTIME_data_migrate( sequence, C(m, k), + C->get_rankof( C, m, k ) ); } RUNTIME_iteration_pop(chamctxt); diff --git a/compute/pzunmqr.c b/compute/pzunmqr.c index c872ea0de59ea6122a7cd358cbd210d147c5da90..3a9e93bbee000aab330e562ee82bb6befd44d5db 100644 --- a/compute/pzunmqr.c +++ b/compute/pzunmqr.c @@ -27,7 +27,7 @@ #include "control/common.h" #define A(m,n) A, m, n -#define B(m,n) B, m, n +#define C(m,n) C, m, n #define T(m,n) T, m, n #define D(k) D, k, k @@ -35,7 +35,7 @@ * Parallel application of Q using tile V - QR factorization - dynamic scheduling */ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, - CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *T, CHAM_desc_t *D, + CHAM_desc_t *A, CHAM_desc_t *C, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_context_t *chamctxt; @@ -44,9 +44,9 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, size_t ws_host = 0; int k, m, n; - int 
ldak, ldbk, ldam, ldan, ldbm, lddk; - int tempkm, tempnn, tempkmin, tempmm, tempkn; - int ib, minMT, minM; + int ldak, ldck, ldam, ldan, ldcm, lddk; + int tempkm, tempkn, tempkmin, tempmm, tempnn; + int ib, KT, K; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) { @@ -57,11 +57,11 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, ib = CHAMELEON_IB; if (A->m > A->n) { - minM = A->n; - minMT = A->nt; + KT = A->nt; + K = A->n; } else { - minM = A->m; - minMT = A->mt; + KT = A->mt; + K = A->m; } if ( D == NULL ) { @@ -94,14 +94,16 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, /* * ChamLeft / ChamConjTrans */ - for (k = 0; k < minMT; k++) { + for (k = 0; k < KT; k++) { RUNTIME_iteration_push(chamctxt, k); - tempkm = k == B->mt-1 ? B->m-k*B->mb : B->mb; - tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb; + tempkm = k == C->mt - 1 ? C->m - k * C->mb : C->mb; + tempkmin = k == KT - 1 ? K - k * A->nb : A->nb; + ldak = BLKLDD(A, k); + ldck = BLKLDD(C, k); lddk = BLKLDD(D, k); - ldbk = BLKLDD(B, k); + if ( genD ) { int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb; @@ -118,29 +120,29 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, D(k), lddk ); #endif } - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; INSERT_TASK_zunmqr( &options, side, trans, tempkm, tempnn, tempkmin, ib, T->nb, D(k), lddk, T(k, k), T->mb, - B(k, n), ldbk); + C(k, n), ldck); } RUNTIME_data_flush( sequence, D(k) ); RUNTIME_data_flush( sequence, T(k, k) ); - for (m = k+1; m < B->mt; m++) { - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; + for (m = k+1; m < C->mt; m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; ldam = BLKLDD(A, m); - ldbm = BLKLDD(B, m); - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? 
B->n-n*B->nb : B->nb; + ldcm = BLKLDD(C, m); + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - RUNTIME_data_migrate( sequence, B(k, n), - B->get_rankof( B, m, n ) ); + RUNTIME_data_migrate( sequence, C(k, n), + C->get_rankof( C, m, n ) ); /* TS kernel */ INSERT_TASK_ztpmqrt( @@ -149,8 +151,8 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, tempmm, tempnn, tempkmin, 0, ib, T->nb, A(m, k), ldam, T(m, k), T->mb, - B(k, n), ldbk, - B(m, n), ldbm); + C(k, n), ldck, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(m, k) ); @@ -158,9 +160,9 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, } /* Restore the original location of the tiles */ - for (n = 0; n < B->nt; n++) { - RUNTIME_data_migrate( sequence, B(k, n), - B->get_rankof( B, k, n ) ); + for (n = 0; n < C->nt; n++) { + RUNTIME_data_migrate( sequence, C(k, n), + C->get_rankof( C, k, n ) ); } RUNTIME_iteration_pop(chamctxt); @@ -170,23 +172,25 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, * ChamLeft / ChamNoTrans */ else { - for (k = minMT-1; k >= 0; k--) { + for (k = KT-1; k >= 0; k--) { RUNTIME_iteration_push(chamctxt, k); - tempkm = k == B->mt-1 ? B->m-k*B->mb : B->mb; - tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb; + tempkm = k == C->mt - 1 ? C->m - k * C->mb : C->mb; + tempkmin = k == KT - 1 ? K - k * A->nb : A->nb; + ldak = BLKLDD(A, k); - ldbk = BLKLDD(B, k); + ldck = BLKLDD(C, k); lddk = BLKLDD(D, k); - for (m = B->mt-1; m > k; m--) { - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; + + for (m = C->mt-1; m > k; m--) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; ldam = BLKLDD(A, m); - ldbm = BLKLDD(B, m); - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + ldcm = BLKLDD(C, m); + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; - RUNTIME_data_migrate( sequence, B(k, n), - B->get_rankof( B, m, n ) ); + RUNTIME_data_migrate( sequence, C(k, n), + C->get_rankof( C, m, n ) ); /* TS kernel */ INSERT_TASK_ztpmqrt( @@ -195,32 +199,34 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, tempmm, tempnn, tempkmin, 0, ib, T->nb, A(m, k), ldam, T(m, k), T->mb, - B(k, n), ldbk, - B(m, n), ldbm); + C(k, n), ldck, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(m, k) ); RUNTIME_data_flush( sequence, T(m, k) ); } if ( genD ) { + int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb; + INSERT_TASK_zlacpy( &options, - ChamLower, tempkm, tempkmin, A->nb, + ChamLower, tempDkm, tempkmin, A->nb, A(k, k), ldak, D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempkm, tempkmin, + ChamUpper, tempDkm, tempkmin, 0., 1., D(k), lddk ); #endif } - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - RUNTIME_data_migrate( sequence, B(k, n), - B->get_rankof( B, k, n ) ); + RUNTIME_data_migrate( sequence, C(k, n), + C->get_rankof( C, k, n ) ); INSERT_TASK_zunmqr( &options, @@ -228,7 +234,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, tempkm, tempnn, tempkmin, ib, T->nb, D(k), lddk, T(k, k), T->mb, - B(k, n), ldbk); + C(k, n), ldck); } RUNTIME_data_flush( sequence, D(k) ); RUNTIME_data_flush( sequence, T(k, k) ); @@ -241,22 +247,23 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, */ else { if (trans == ChamConjTrans) { - for (k = minMT-1; k >= 0; k--) { + for (k = KT-1; k >= 0; k--) { RUNTIME_iteration_push(chamctxt, k); - tempkn = k == B->nt - 1 ? B->n - k * B->nb : B->nb; - tempkmin = k == minMT - 1 ? minM - k * A->nb : A->nb; + tempkn = k == C->nt - 1 ? C->n - k * C->nb : C->nb; + tempkmin = k == KT - 1 ? 
K - k * A->nb : A->nb; ldak = BLKLDD(A, k); lddk = BLKLDD(D, k); - for (n = B->nt-1; n > k; n--) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + + for (n = C->nt-1; n > k; n--) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; ldan = BLKLDD(A, n); - for (m = 0; m < B->mt; m++) { - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); - RUNTIME_data_migrate( sequence, B(m, k), - B->get_rankof( B, m, n ) ); + RUNTIME_data_migrate( sequence, C(m, k), + C->get_rankof( C, m, n ) ); /* TS kernel */ INSERT_TASK_ztpmqrt( @@ -265,41 +272,44 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, tempmm, tempnn, tempkmin, 0, ib, T->nb, A(n, k), ldan, T(n, k), T->mb, - B(m, k), ldbm, - B(m, n), ldbm); + C(m, k), ldcm, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(n, k) ); RUNTIME_data_flush( sequence, T(n, k) ); } + if ( genD ) { + int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb; + INSERT_TASK_zlacpy( &options, - ChamLower, tempkn, tempkmin, A->nb, + ChamLower, tempDkm, tempkmin, A->nb, A(k, k), ldak, D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempkn, tempkmin, + ChamUpper, tempDkm, tempkmin, 0., 1., D(k), lddk ); #endif } - for (m = 0; m < B->mt; m++) { - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); - RUNTIME_data_migrate( sequence, B(m, k), - B->get_rankof( B, m, k ) ); + RUNTIME_data_migrate( sequence, C(m, k), + C->get_rankof( C, m, k ) ); INSERT_TASK_zunmqr( &options, side, trans, tempmm, tempkn, tempkmin, ib, T->nb, - D(k), lddk, + D(k), lddk, T(k, k), T->mb, - B(m, k), ldbm); + C(m, k), ldcm); } RUNTIME_data_flush( sequence, D(k) ); @@ -312,51 +322,54 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, * ChamRight / ChamNoTrans */ else { - for (k = 0; k < minMT; k++) { + for (k = 0; k < KT; k++) { RUNTIME_iteration_push(chamctxt, k); - tempkn = k == B->nt-1 ? B->n-k*B->nb : B->nb; - tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb; + tempkn = k == C->nt - 1 ? C->n - k * C->nb : C->nb; + tempkmin = k == KT - 1 ? K - k * A->nb : A->nb; ldak = BLKLDD(A, k); lddk = BLKLDD(D, k); + if ( genD ) { + int tempDkm = k == D->mt - 1 ? D->m - k * D->mb : D->mb; + INSERT_TASK_zlacpy( &options, - ChamLower, tempkn, tempkmin, A->nb, + ChamLower, tempDkm, tempkmin, A->nb, A(k, k), ldak, - D(k), lddk ); + D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempkn, tempkmin, + ChamUpper, tempDkm, tempkmin, 0., 1., D(k), lddk ); #endif } - for (m = 0; m < B->mt; m++) { - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); INSERT_TASK_zunmqr( &options, side, trans, tempmm, tempkn, tempkmin, ib, T->nb, - D(k), lddk, + D(k), lddk, T(k, k), T->mb, - B(m, k), ldbm); + C(m, k), ldcm); } RUNTIME_data_flush( sequence, D(k) ); RUNTIME_data_flush( sequence, T(k, k) ); - for (n = k+1; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + for (n = k+1; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; ldan = BLKLDD(A, n); - for (m = 0; m < B->mt; m++) { - tempmm = m == B->mt-1 ? 
B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); - RUNTIME_data_migrate( sequence, B(m, k), - B->get_rankof( B, m, n ) ); + RUNTIME_data_migrate( sequence, C(m, k), + C->get_rankof( C, m, n ) ); /* TS kernel */ INSERT_TASK_ztpmqrt( @@ -365,8 +378,8 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, tempmm, tempnn, tempkmin, 0, ib, T->nb, A(n, k), ldan, T(n, k), T->mb, - B(m, k), ldbm, - B(m, n), ldbm); + C(m, k), ldcm, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(n, k) ); @@ -374,9 +387,9 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, } /* Restore the original location of the tiles */ - for (m = 0; m < B->mt; m++) { - RUNTIME_data_migrate( sequence, B(m, k), - B->get_rankof( B, m, k ) ); + for (m = 0; m < C->mt; m++) { + RUNTIME_data_migrate( sequence, C(m, k), + C->get_rankof( C, m, k ) ); } RUNTIME_iteration_pop(chamctxt); diff --git a/compute/pzunmqr_param.c b/compute/pzunmqr_param.c index 923a3f8a73a5dc53f07804fe23e357d3ae14c708..a3905647c0767cbaad4f4a8be98e1397b6996924 100644 --- a/compute/pzunmqr_param.c +++ b/compute/pzunmqr_param.c @@ -22,7 +22,7 @@ #include <stdlib.h> #define A(m,n) A, m, n -#define B(m,n) B, m, n +#define C(m,n) C, m, n #define T(m,n) T, m, n #define D(m,n) D, m, n @@ -31,7 +31,7 @@ */ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, cham_side_t side, cham_trans_t trans, - CHAM_desc_t *A, CHAM_desc_t *B, + CHAM_desc_t *A, CHAM_desc_t *C, CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { @@ -42,9 +42,9 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, size_t ws_host = 0; int k, m, n, i, p; - int ldam, ldan, ldbm, ldbp, lddn, lddm; - int tempnn, tempkmin, tempmm, tempkn; - int ib, K, L; + int ldap, ldam, ldan, lddp, ldcp, ldcm; + int temppm, temppn, tempmm, tempnn, tempkn,tempkmin; + 
int ib, KT, L; int node, nbtiles, *tiles; chamctxt = chameleon_context_self(); @@ -55,8 +55,6 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, ib = CHAMELEON_IB; - K = chameleon_min(A->mt, A->nt); - if ( D == NULL ) { D = A; genD = 0; @@ -71,7 +69,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, #if defined(CHAMELEON_USE_CUDA) /* Worker space * - * zunmqr = A->nb * ib + * zunmqr = A->nb * ib * ztpmqrt = 3 * A->nb * ib */ ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 ); @@ -85,54 +83,55 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, /* Initialisation of tiles */ tiles = (int*)calloc( qrtree->mt, sizeof(int) ); + KT = chameleon_min( A->mt, A->nt ); if (side == ChamLeft ) { if (trans == ChamConjTrans) { /* * ChamLeft / ChamConjTrans */ - for (k = 0; k < K; k++) { + for (k = 0; k < KT; k++) { RUNTIME_iteration_push(chamctxt, k); - tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; + tempkn = k == A->nt - 1 ? A->n - k * A->nb : A->nb; T = TS; for (i = 0; i < qrtree->getnbgeqrf(qrtree, k); i++) { - m = qrtree->getm(qrtree, k, i); + p = qrtree->getm(qrtree, k, i); - tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - tempkmin = chameleon_min(tempmm, tempkn); - ldam = BLKLDD(A, m); - lddm = BLKLDD(D, m); - ldbm = BLKLDD(B, m); + temppm = p == C->mt-1 ? C->m - p * C->mb : C->mb; + tempkmin = chameleon_min( temppm, tempkn ); + + ldap = BLKLDD(A, p); + lddp = BLKLDD(D, p); + ldcp = BLKLDD(C, p); if ( genD ) { - int tempDmm = m == D->mt-1 ? D->m-m*D->mb : D->mb; + int tempDpm = p == D->mt-1 ? 
D->m-p*D->mb : D->mb; INSERT_TASK_zlacpy( &options, - ChamLower, tempDmm, tempkmin, A->nb, - A(m, k), ldam, - D(m, k), lddm ); + ChamLower, tempDpm, tempkmin, A->nb, + A(p, k), ldap, + D(p, k), lddp ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempDmm, tempkmin, + ChamUpper, tempDpm, tempkmin, 0., 1., - D(m, k), lddm ); + D(p, k), lddp ); #endif } - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; INSERT_TASK_zunmqr( - &options, - side, trans, - tempmm, tempnn, tempkmin, ib, T->nb, - D(m, k), lddm, - T(m, k), T->mb, - B(m, n), ldbm); + &options, side, trans, + temppm, tempnn, tempkmin, ib, T->nb, + D(p, k), lddp, + T(p, k), T->mb, + C(p, n), ldcp); } - RUNTIME_data_flush( sequence, D(m, k) ); - RUNTIME_data_flush( sequence, T(m, k) ); + RUNTIME_data_flush( sequence, D(p, k) ); + RUNTIME_data_flush( sequence, T(p, k) ); } /* Setting the order of the tiles*/ @@ -142,10 +141,10 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, m = tiles[i]; p = qrtree->currpiv(qrtree, k, m); - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; ldam = BLKLDD(A, m); - ldbm = BLKLDD(B, m); - ldbp = BLKLDD(B, p); + ldcp = BLKLDD(C, p); + ldcm = BLKLDD(C, m); if( qrtree->gettype(qrtree, k, m) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ @@ -157,30 +156,29 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, L = tempmm; T = TT; } - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; - node = B->get_rankof( B, m, n ); - RUNTIME_data_migrate( sequence, B(p, n), node ); - RUNTIME_data_migrate( sequence, B(m, n), node ); + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(p, n), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); INSERT_TASK_ztpmqrt( - &options, - side, trans, + &options, side, trans, tempmm, tempnn, tempkn, L, ib, T->nb, A(m, k), ldam, T(m, k), T->mb, - B(p, n), ldbp, - B(m, n), ldbm); + C(p, n), ldcp, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(m, k) ); RUNTIME_data_flush( sequence, T(m, k) ); } /* Restore the original location of the tiles */ - for (n = 0; n < B->nt; n++) { - RUNTIME_data_migrate( sequence, B(k, n), - B->get_rankof( B, k, n ) ); + for (n = 0; n < C->nt; n++) { + RUNTIME_data_migrate( sequence, C(k, n), + C->get_rankof( C, k, n ) ); } RUNTIME_iteration_pop(chamctxt); @@ -190,7 +188,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, * ChamLeft / ChamNoTrans */ else { - for (k = K-1; k >= 0; k--) { + for (k = KT-1; k >= 0; k--) { RUNTIME_iteration_push(chamctxt, k); tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; @@ -198,14 +196,14 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, /* Setting the order of the tiles*/ nbtiles = libhqr_walk_stepk( qrtree, k, tiles ); - for (i = nbtiles-1; i >=0; i--) { + for (i = nbtiles-1; i >= 0; i--) { m = tiles[i]; p = qrtree->currpiv(qrtree, k, m); - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; ldam = BLKLDD(A, m); - ldbm = BLKLDD(B, m); - ldbp = BLKLDD(B, p); + ldcp = BLKLDD(C, p); + ldcm = BLKLDD(C, m); if( qrtree->gettype(qrtree, k, m) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ @@ -217,21 +215,20 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, L = tempmm; T = TT; } - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; - node = B->get_rankof( B, m, n ); - RUNTIME_data_migrate( sequence, B(p, n), node ); - RUNTIME_data_migrate( sequence, B(m, n), node ); + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(p, n), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); INSERT_TASK_ztpmqrt( - &options, - side, trans, + &options, side, trans, tempmm, tempnn, tempkn, L, ib, T->nb, A(m, k), ldam, T(m, k), T->mb, - B(p, n), ldbp, - B(m, n), ldbm); + C(p, n), ldcp, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(m, k) ); RUNTIME_data_flush( sequence, T(m, k) ); @@ -239,49 +236,48 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, T = TS; for (i = 0; i < qrtree->getnbgeqrf(qrtree, k); i++) { - m = qrtree->getm(qrtree, k, i); + p = qrtree->getm(qrtree, k, i); - tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - tempkmin = chameleon_min(tempmm, tempkn); - ldam = BLKLDD(A, m); - lddm = BLKLDD(D, m); - ldbm = BLKLDD(B, m); + temppm = p == C->mt-1 ? C->m-p*C->mb : C->mb; + tempkmin = chameleon_min( temppm, tempkn ); + + ldap = BLKLDD(A, p); + lddp = BLKLDD(D, p); + ldcp = BLKLDD(C, p); if ( genD ) { - int tempDmm = m == D->mt-1 ? D->m-m*D->mb : D->mb; + int tempDpm = p == D->mt-1 ? D->m-p*D->mb : D->mb; INSERT_TASK_zlacpy( &options, - ChamLower, tempDmm, tempkmin, A->nb, - A(m, k), ldam, - D(m, k), lddm ); + ChamLower, tempDpm, tempkmin, A->nb, + A(p, k), ldap, + D(p, k), lddp ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempDmm, tempkmin, + ChamUpper, tempDpm, tempkmin, 0., 1., - D(m, k), lddm ); + D(p, k), lddp ); #endif } - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - RUNTIME_data_migrate( sequence, B(m, n), - B->get_rankof( B, m, n ) ); + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; + + RUNTIME_data_migrate( sequence, C(p, n), + C->get_rankof( C, p, n ) ); INSERT_TASK_zunmqr( - &options, - side, trans, - tempmm, tempnn, tempkmin, ib, T->nb, - D(m, k), lddm, - T(m, k), T->mb, - B(m, n), ldbm); + &options, side, trans, + temppm, tempnn, tempkmin, ib, T->nb, + D(p, k), lddp, + T(p, k), T->mb, + C(p, n), ldcp); } - - RUNTIME_data_flush( sequence, D(m, k) ); - RUNTIME_data_flush( sequence, T(m, k) ); + RUNTIME_data_flush( sequence, D(p, k) ); + RUNTIME_data_flush( sequence, T(p, k) ); } - RUNTIME_iteration_pop(chamctxt); } } @@ -291,10 +287,10 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, */ else { if (trans == ChamConjTrans) { - for (k = K-1; k >= 0; k--) { + for (k = KT-1; k >= 0; k--) { RUNTIME_iteration_push(chamctxt, k); - tempkn = k == A->nt-1 ? A->n - k*A->nb : A->nb; + tempkn = k == A->nt-1 ? A->n - k * A->nb : A->nb; /* Setting the order of the tiles*/ nbtiles = libhqr_walk_stepk( qrtree, k, tiles ); @@ -303,7 +299,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, n = tiles[i]; p = qrtree->currpiv(qrtree, k, n); - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; ldan = BLKLDD(A, n); if( qrtree->gettype(qrtree, k, n) == LIBHQR_KILLED_BY_TS ) { @@ -317,22 +313,21 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, T = TT; } - for (m = 0; m < B->mt; m++) { - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); - node = B->get_rankof( B, m, n ); - RUNTIME_data_migrate( sequence, B(m, p), node ); - RUNTIME_data_migrate( sequence, B(m, n), node ); + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(m, p), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); INSERT_TASK_ztpmqrt( - &options, - side, trans, + &options, side, trans, tempmm, tempnn, tempkn, chameleon_min( L, tempmm ), ib, T->nb, A(n, k), ldan, T(n, k), T->mb, - B(m, p), ldbm, - B(m, n), ldbm); + C(m, p), ldcm, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(n, k) ); RUNTIME_data_flush( sequence, T(n, k) ); @@ -340,46 +335,46 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, T = TS; for (i = 0; i < qrtree->getnbgeqrf(qrtree, k); i++) { - n = qrtree->getm(qrtree, k, i); + p = qrtree->getm(qrtree, k, i); - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - tempkmin = chameleon_min(tempnn, tempkn); - ldan = BLKLDD(A, n); - lddn = BLKLDD(D, n); + temppn = p == C->nt-1 ? C->n - p * C->nb : C->nb; + tempkmin = chameleon_min(temppn, tempkn); + ldap = BLKLDD(A, p); + lddp = BLKLDD(D, p); if ( genD ) { - int tempDnn = n == D->nt-1 ? D->n-n*D->nb : D->nb; + int tempDpm = p == D->mt-1 ? D->m-p*D->mb : D->mb; INSERT_TASK_zlacpy( &options, - ChamLower, tempDnn, tempkmin, A->nb, - A(n, k), ldan, - D(n, k), lddn ); + ChamLower, tempDpm, tempkmin, A->nb, + A(p, k), ldap, + D(p, k), lddp ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempDnn, tempkmin, + ChamUpper, tempDpm, tempkmin, 0., 1., - D(n, k), lddn ); + D(p, k), lddp ); #endif } - for (m = 0; m < B->mt; m++) { - ldbm = BLKLDD(B, m); - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - RUNTIME_data_migrate( sequence, B(m, n), - B->get_rankof( B, m, n ) ); + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); + + RUNTIME_data_migrate( sequence, C(m, p), + C->get_rankof( C, m, p ) ); INSERT_TASK_zunmqr( - &options, - side, trans, - tempmm, tempnn, tempkmin, ib, T->nb, - D(n, k), lddn, - T(n, k), T->mb, - B(m, n), ldbm); + &options, side, trans, + tempmm, temppn, tempkmin, ib, T->nb, + D(p, k), lddp, + T(p, k), T->mb, + C(m, p), ldcm); } - RUNTIME_data_flush( sequence, D(n, k) ); - RUNTIME_data_flush( sequence, T(n, k) ); + RUNTIME_data_flush( sequence, D(p, k) ); + RUNTIME_data_flush( sequence, T(p, k) ); } RUNTIME_iteration_pop(chamctxt); } @@ -388,49 +383,49 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, * ChamRight / ChamNoTrans */ else { - for (k = 0; k < K; k++) { + for (k = 0; k < KT; k++) { RUNTIME_iteration_push(chamctxt, k); tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; T = TS; for (i = 0; i < qrtree->getnbgeqrf(qrtree, k); i++) { - n = qrtree->getm(qrtree, k, i); + p = qrtree->getm(qrtree, k, i); - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - tempkmin = chameleon_min(tempnn, tempkn); - ldan = BLKLDD(A, n); - lddn = BLKLDD(D, n); + temppn = p == C->nt - 1 ? C->n - p * C->nb : C->nb; + tempkmin = chameleon_min( temppn, tempkn ); + ldap = BLKLDD(A, p); + lddp = BLKLDD(D, p); if ( genD ) { - int tempDnn = n == D->nt-1 ? D->n-n*D->nb : D->nb; + int tempDpm = p == D->mt-1 ? D->m-p*D->mb : D->mb; INSERT_TASK_zlacpy( &options, - ChamLower, tempDnn, tempkmin, A->nb, - A(n, k), ldan, - D(n, k), lddn ); + ChamLower, tempDpm, tempkmin, A->nb, + A(p, k), ldap, + D(p, k), lddp ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempDnn, tempkmin, + ChamUpper, tempDpm, tempkmin, 0., 1., - D(n, k), lddn ); + D(p, k), lddp ); #endif } - for (m = 0; m < B->mt; m++) { - ldbm = BLKLDD(B, m); - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; + + for (m = 0; m < C->mt; m++) { + ldcm = BLKLDD(C, m); + tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; INSERT_TASK_zunmqr( - &options, - side, trans, - tempmm, tempnn, tempkmin, ib, T->nb, - D(n, k), lddn, - T(n, k), T->mb, - B(m, n), ldbm); + &options, side, trans, + tempmm, temppn, tempkmin, ib, T->nb, + D(p, k), lddp, + T(p, k), T->mb, + C(m, p), ldcm); } - RUNTIME_data_flush( sequence, D(n, k) ); - RUNTIME_data_flush( sequence, T(n, k) ); + RUNTIME_data_flush( sequence, D(p, k) ); + RUNTIME_data_flush( sequence, T(p, k) ); } /* Setting the order of tiles */ @@ -440,7 +435,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, n = tiles[i]; p = qrtree->currpiv(qrtree, k, n); - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; ldan = BLKLDD(A, n); if( qrtree->gettype(qrtree, k, n) == LIBHQR_KILLED_BY_TS ) { @@ -454,31 +449,30 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, T = TT; } - for (m = 0; m < B->mt; m++) { - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); - node = B->get_rankof( B, m, n ); - RUNTIME_data_migrate( sequence, B(m, p), node ); - RUNTIME_data_migrate( sequence, B(m, n), node ); + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(m, p), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); INSERT_TASK_ztpmqrt( - &options, - side, trans, + &options, side, trans, tempmm, tempnn, tempkn, chameleon_min( L, tempmm ), ib, T->nb, A(n, k), ldan, T(n, k), T->mb, - B(m, p), ldbm, - B(m, n), ldbm); + C(m, p), ldcm, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(n, k) ); RUNTIME_data_flush( sequence, T(n, k) ); } /* Restore the original location of the tiles */ - for (m = 0; m < B->mt; m++) { - RUNTIME_data_migrate( sequence, B(m, k), - B->get_rankof( B, m, k ) ); + for (m = 0; m < C->mt; m++) { + RUNTIME_data_migrate( sequence, C( m, k ), + C->get_rankof( C, m, k ) ); } RUNTIME_iteration_pop(chamctxt); diff --git a/compute/pzunmqrrh.c b/compute/pzunmqrrh.c index 7ba65c846a48672612bc5554cd6aa8ca7b367b94..1d4500f849c3072d7bbf9e54e0f25c72363a5fbd 100644 --- a/compute/pzunmqrrh.c +++ b/compute/pzunmqrrh.c @@ -27,7 +27,7 @@ #include "control/common.h" #define A(m,n) A, (m), (n) -#define B(m,n) B, (m), (n) +#define C(m,n) C, (m), (n) #define T(m,n) T, (m), (n) #define T2(m,n) T, (m), ((n)+A->nt) #define D(m,n) D, (m), (n) @@ -37,7 +37,7 @@ * Householder) - dynamic scheduling */ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans, - CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *T, CHAM_desc_t *D, + CHAM_desc_t *A, CHAM_desc_t *C, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_context_t *chamctxt; @@ -45,11 +45,12 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans size_t ws_worker = 0; size_t ws_host = 0; - int k, m, n; - int K, M, RD, lastRD; - int ldaM, ldam, ldan, ldaMRD, lddM; - int ldbM, ldbm, ldbMRD; - int tempMm, tempkn, tempnn, 
tempmm, tempMRDm, tempkmin; + int k, m, n, p; + int KT, RD, lastRD; + int ldap, ldam, ldan; + int ldcp, ldcm; + int lddp; + int temppm, temppn, tempkn, tempnn, tempmm, tempkmin; int ib, node; chamctxt = chameleon_context_self(); @@ -85,61 +86,66 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); - K = chameleon_min(A->mt, A->nt); + KT = chameleon_min( A->mt, A->nt ); if (side == ChamLeft ) { if (trans == ChamConjTrans) { /* * ChamLeft / ChamConjTrans */ - for (k = 0; k < K; k++) { + for (k = 0; k < KT; k++) { RUNTIME_iteration_push(chamctxt, k); - tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; - for (M = k; M < A->mt; M += BS) { - tempMm = M == A->mt-1 ? A->m-M*A->mb : A->mb; - tempkmin = chameleon_min(tempMm, tempkn); - ldaM = BLKLDD(A, M); - lddM = BLKLDD(D, M); - ldbM = BLKLDD(B, M); + tempkn = k == A->nt - 1 ? A->n - k * A->nb : A->nb; + + for (p = k; p < C->mt; p += BS) { + + temppm = p == C->mt-1 ? C->m - p * C->mb : C->mb; + tempkmin = chameleon_min( temppm, tempkn ); + + ldap = BLKLDD(A, p); + lddp = BLKLDD(D, p); + ldcp = BLKLDD(C, p); + if ( genD ) { - int tempDMm = M == D->mt-1 ? D->m-M*D->mb : D->mb; + int tempDpm = p == D->mt-1 ? D->m-p*D->mb : D->mb; INSERT_TASK_zlacpy( &options, - ChamLower, tempDMm, tempkmin, A->nb, - A(M, k), ldaM, - D(M, k), lddM ); + ChamLower, tempDpm, tempkmin, A->nb, + A(p, k), ldap, + D(p, k), lddp ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempDMm, tempkmin, + ChamUpper, tempDpm, tempkmin, 0., 1., - D(M, k), lddM ); + D(p, k), lddp ); #endif } - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; INSERT_TASK_zunmqr( &options, side, trans, - tempMm, tempnn, tempkmin, ib, T->nb, - D(M, k), lddM, - T(M, k), T->mb, - B(M, n), ldbM); + temppm, tempnn, tempkmin, ib, T->nb, + D(p, k), lddp, + T(p, k), T->mb, + C(p, n), ldcp); } - RUNTIME_data_flush( sequence, D(M, k) ); - RUNTIME_data_flush( sequence, T(M, k) ); + RUNTIME_data_flush( sequence, D(p, k) ); + RUNTIME_data_flush( sequence, T(p, k) ); - for (m = M+1; m < chameleon_min(M+BS, A->mt); m++) { - tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldbm = BLKLDD(B, m); + for (m = p+1; m < chameleon_min(p+BS, C->mt); m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; ldam = BLKLDD(A, m); - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + ldcm = BLKLDD(C, m); + + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - node = B->get_rankof( B, m, n ); - RUNTIME_data_migrate( sequence, B(M, n), node ); - RUNTIME_data_migrate( sequence, B(m, n), node ); + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(p, n), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); /* TS kernel */ INSERT_TASK_ztpmqrt( @@ -147,101 +153,108 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans tempmm, tempnn, tempkn, 0, ib, T->nb, A(m, k), ldam, T(m, k), T->mb, - B(M, n), ldbM, - B(m, n), ldbm); + C(p, n), ldcp, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(m, k) ); RUNTIME_data_flush( sequence, T(m, k) ); } } - for (RD = BS; RD < A->mt-k; RD *= 2) { - for (M = k; M+RD < A->mt; M += 2*RD) { - tempMRDm = M+RD == A->mt-1 ? A->m-(M+RD)*A->mb : A->mb; - ldbM = BLKLDD(B, M ); - ldbMRD = BLKLDD(B, M+RD); - ldaMRD = BLKLDD(A, M+RD); - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? 
B->n-n*B->nb : B->nb; - - node = B->get_rankof( B, M+RD, n ); - RUNTIME_data_migrate( sequence, B(M, n), node ); - RUNTIME_data_migrate( sequence, B(M+RD, n), node ); + for (RD = BS; RD < C->mt-k; RD *= 2) { + for (p = k; p+RD < C->mt; p += 2*RD) { + m = p+RD; + + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldam = BLKLDD(A, m); + ldcm = BLKLDD(C, m); + ldcp = BLKLDD(C, p); + + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(p, n), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); /* TT kernel */ INSERT_TASK_ztpmqrt( &options, side, trans, - tempMRDm, tempnn, tempkn, tempMRDm, ib, T->nb, - A (M+RD, k), ldaMRD, - T2(M+RD, k), T->mb, - B (M, n), ldbM, - B (M+RD, n), ldbMRD); + tempmm, tempnn, tempkn, tempmm, ib, T->nb, + A (m, k), ldam, + T2(m, k), T->mb, + C (p, n), ldcp, + C (m, n), ldcm); } - RUNTIME_data_flush( sequence, A (M+RD, k) ); - RUNTIME_data_flush( sequence, T2(M+RD, k) ); + RUNTIME_data_flush( sequence, A (m, k) ); + RUNTIME_data_flush( sequence, T2(m, k) ); } } /* Restore the original location of the tiles */ - for (n = 0; n < B->nt; n++) { - RUNTIME_data_migrate( sequence, B(k, n), - B->get_rankof( B, k, n ) ); + for (n = 0; n < C->nt; n++) { + RUNTIME_data_migrate( sequence, C(k, n), + C->get_rankof( C, k, n ) ); } RUNTIME_iteration_pop(chamctxt); } - } else { - /* - * ChamLeft / ChamNoTrans - */ - for (k = K-1; k >= 0; k--) { + } + /* + * ChamLeft / ChamNoTrans + */ + else { + for (k = KT-1; k >= 0; k--) { RUNTIME_iteration_push(chamctxt, k); tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; lastRD = 0; - for (RD = BS; RD < A->mt-k; RD *= 2) + for (RD = BS; RD < C->mt-k; RD *= 2) lastRD = RD; for (RD = lastRD; RD >= BS; RD /= 2) { - for (M = k; M+RD < A->mt; M += 2*RD) { - tempMRDm = M+RD == A->mt-1 ? 
A->m-(M+RD)*A->mb : A->mb; - ldbM = BLKLDD(B, M ); - ldbMRD = BLKLDD(B, M+RD); - ldaMRD = BLKLDD(A, M+RD); - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - - node = B->get_rankof( B, M+RD, n ); - RUNTIME_data_migrate( sequence, B(M, n), node ); - RUNTIME_data_migrate( sequence, B(M+RD, n), node ); + for (p = k; p+RD < C->mt; p += 2*RD) { + m = p+RD; + + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldam = BLKLDD(A, m); + ldcm = BLKLDD(C, m); + ldcp = BLKLDD(C, p); + + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(p, n), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); /* TT kernel */ INSERT_TASK_ztpmqrt( &options, side, trans, - tempMRDm, tempnn, tempkn, tempMRDm, ib, T->nb, - A (M+RD, k), ldaMRD, - T2(M+RD, k), T->mb, - B (M, n), ldbM, - B (M+RD, n), ldbMRD); + tempmm, tempnn, tempkn, tempmm, ib, T->nb, + A (m, k), ldam, + T2(m, k), T->mb, + C (p, n), ldcp, + C (m, n), ldcm); } - RUNTIME_data_flush( sequence, A (M+RD, k) ); - RUNTIME_data_flush( sequence, T2(M+RD, k) ); + RUNTIME_data_flush( sequence, A (m, k) ); + RUNTIME_data_flush( sequence, T2(m, k) ); } } - for (M = k; M < A->mt; M += BS) { - tempMm = M == A->mt-1 ? A->m-M*A->mb : A->mb; - tempkmin = chameleon_min(tempMm, tempkn); - ldaM = BLKLDD(A, M); - lddM = BLKLDD(D, M); - ldbM = BLKLDD(B, M); - for (m = chameleon_min(M+BS, A->mt)-1; m > M; m--) { - tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldbm = BLKLDD(B, m); + for (p = k; p < C->mt; p += BS) { + ldap = BLKLDD(A, p); + lddp = BLKLDD(D, p); + ldcp = BLKLDD(C, p); + + for (m = chameleon_min(p+BS, C->mt)-1; m > p; m--) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; ldam = BLKLDD(A, m); - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? 
B->n-n*B->nb : B->nb; + ldcm = BLKLDD(C, m); - node = B->get_rankof( B, m, n ); - RUNTIME_data_migrate( sequence, B(M, n), node ); - RUNTIME_data_migrate( sequence, B(m, n), node ); + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(p, n), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); /* TS kernel */ INSERT_TASK_ztpmqrt( @@ -249,100 +262,108 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans tempmm, tempnn, tempkn, 0, ib, T->nb, A(m, k), ldam, T(m, k), T->mb, - B(M, n), ldbM, - B(m, n), ldbm); + C(p, n), ldcp, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(m, k) ); RUNTIME_data_flush( sequence, T(m, k) ); } + + temppm = p == C->mt-1 ? C->m-p*C->mb : C->mb; + tempkmin = chameleon_min( temppm, tempkn ); + if ( genD ) { - int tempDMm = M == D->mt-1 ? D->m-M*D->mb : D->mb; + int tempDpm = p == D->mt-1 ? D->m-p*D->mb : D->mb; INSERT_TASK_zlacpy( &options, - ChamLower, tempDMm, tempkmin, A->nb, - A(M, k), ldaM, - D(M, k), lddM ); + ChamLower, tempDpm, tempkmin, A->nb, + A(p, k), ldap, + D(p, k), lddp ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempDMm, tempkmin, + ChamUpper, tempDpm, tempkmin, 0., 1., - D(M, k), lddM ); + D(p, k), lddp ); #endif } - for (n = 0; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - RUNTIME_data_migrate( sequence, B(M, n), - B->get_rankof( B, M, n ) ); + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; + + RUNTIME_data_migrate( sequence, C(p, n), + C->get_rankof( C, p, n ) ); INSERT_TASK_zunmqr( &options, side, trans, - tempMm, tempnn, tempkmin, ib, T->nb, - D(M, k), lddM, - T(M, k), T->mb, - B(M, n), ldbM); + temppm, tempnn, tempkmin, ib, T->nb, + D(p, k), lddp, + T(p, k), T->mb, + C(p, n), ldcp); } - RUNTIME_data_flush( sequence, D(M, k) ); - RUNTIME_data_flush( sequence, T(M, k) ); + RUNTIME_data_flush( sequence, D(p, k) ); + RUNTIME_data_flush( sequence, T(p, k) ); } RUNTIME_iteration_pop(chamctxt); } } } + /* + * ChamRight / ChamConjTrans + */ else { if (trans == ChamConjTrans) { - /* - * ChamRight / ChamConjTrans - */ - for (k = K-1; k >= 0; k--) { + for (k = KT-1; k >= 0; k--) { RUNTIME_iteration_push(chamctxt, k); - tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; + tempkn = k == A->nt-1 ? A->n - k * A->nb : A->nb; + lastRD = 0; - for (RD = BS; RD < A->mt-k; RD *= 2) + for (RD = BS; RD < C->nt-k; RD *= 2) lastRD = RD; for (RD = lastRD; RD >= BS; RD /= 2) { - for (M = k; M+RD < A->mt; M += 2*RD) { - tempMRDm = M+RD == A->mt-1 ? A->m-(M+RD)*A->mb : A->mb; - ldaMRD = BLKLDD(A, M+RD); - for (m = 0; m < B->mt; m++) { - ldbm = BLKLDD(B, m); - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; + for (p = k; p+RD < C->nt; p += 2*RD) { + n = p+RD; + + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + ldan = BLKLDD(A, n); + + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); - node = B->get_rankof( B, m, M+RD ); - RUNTIME_data_migrate( sequence, B(m, M), node ); - RUNTIME_data_migrate( sequence, B(m, M+RD), node ); + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(m, p), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); /* TT kernel */ INSERT_TASK_ztpmqrt( &options, side, trans, - tempmm, tempMRDm, tempkn, tempmm, ib, T->nb, - A (M+RD, k), ldaMRD, - T2(M+RD, k), T->mb, - B (m, M), ldbm, - B (m, M+RD), ldbm); + tempmm, tempnn, tempkn, tempmm, ib, T->nb, + A (n, k), ldan, + T2(n, k), T->mb, + C (m, p), ldcm, + C (m, n), ldcm); } - RUNTIME_data_flush( sequence, A (M+RD, k) ); - RUNTIME_data_flush( sequence, T2(M+RD, k) ); + RUNTIME_data_flush( sequence, A (n, k) ); + RUNTIME_data_flush( sequence, T2(n, k) ); } } - for (M = k; M < A->mt; M += BS) { - tempMm = M == A->mt-1 ? A->m-M*A->mb : A->mb; - tempkmin = chameleon_min(tempMm, tempkn); - ldaM = BLKLDD(A, M); - lddM = BLKLDD(D, M); - for (n = chameleon_min(M+BS, A->mt)-1; n > M; n--) { + for (p = k; p < C->nt; p += BS) { + + for (n = chameleon_min(p+BS, C->nt)-1; n > p; n--) { + + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; ldan = BLKLDD(A, n); - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - for (m = 0; m < B->mt; m++) { - ldbm = BLKLDD(B, m); - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - node = B->get_rankof( B, m, n ); - RUNTIME_data_migrate( sequence, B(m, M), node ); - RUNTIME_data_migrate( sequence, B(m, m), node ); + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); + + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(m, p), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); /* TS kernel */ INSERT_TASK_ztpmqrt( @@ -350,102 +371,111 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans tempmm, tempnn, tempkn, 0, ib, T->nb, A(n, k), ldan, T(n, k), T->mb, - B(m, M), ldbm, - B(m, n), ldbm); + C(m, p), ldcm, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(n, k) ); RUNTIME_data_flush( sequence, T(n, k) ); } + + temppn = p == C->nt-1 ? C->n - p * C->nb : C->nb; + tempkmin = chameleon_min( temppn, tempkn ); + ldap = BLKLDD(A, p); + lddp = BLKLDD(D, p); + if ( genD ) { - int tempDMm = M == D->mt-1 ? D->m-M*D->mb : D->mb; + int tempDpm = p == D->mt-1 ? D->m-p*D->mb : D->mb; INSERT_TASK_zlacpy( &options, - ChamLower, tempDMm, tempkmin, A->nb, - A(M, k), ldaM, - D(M, k), lddM ); + ChamLower, tempDpm, tempkmin, A->nb, + A(p, k), ldap, + D(p, k), lddp ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempDMm, tempkmin, + ChamUpper, tempDpm, tempkmin, 0., 1., - D(M, k), lddM ); + D(p, k), lddp ); #endif } - for (m = 0; m < B->mt; m++) { - ldbm = BLKLDD(B, m); - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - RUNTIME_data_migrate( sequence, B(m, M), - B->get_rankof( B, m, M ) ); + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); + + RUNTIME_data_migrate( sequence, C(m, p), + C->get_rankof( C, m, p ) ); INSERT_TASK_zunmqr( - &options, - side, trans, - tempmm, tempMm, tempkmin, ib, T->nb, - D(M, k), lddM, - T(M, k), T->mb, - B(m, M), ldbm); + &options, side, trans, + tempmm, temppn, tempkmin, ib, T->nb, + D(p, k), lddp, + T(p, k), T->mb, + C(m, p), ldcm); } - RUNTIME_data_flush( sequence, D(M, k) ); - RUNTIME_data_flush( sequence, T(M, k) ); + RUNTIME_data_flush( sequence, D(p, k) ); + RUNTIME_data_flush( sequence, T(p, k) ); } - RUNTIME_iteration_pop(chamctxt); } - } else { - /* - * ChamRight / ChamNoTrans - */ - for (k = 0; k < K; k++) { + } + /* + * ChamRight / ChamNoTrans + */ + else { + for (k = 0; k < KT; k++) { RUNTIME_iteration_push(chamctxt, k); tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; - for (M = k; M < A->mt; M += BS) { - tempMm = M == A->mt-1 ? A->m-M*A->mb : A->mb; - tempkmin = chameleon_min(tempMm, tempkn); - ldaM = BLKLDD(A, M); - lddM = BLKLDD(D, M); + + for (p = k; p < C->nt; p += BS) { + temppn = p == C->nt - 1 ? C->n - p * C->nb : C->nb; + tempkmin = chameleon_min( temppn, tempkn ); + + ldap = BLKLDD(A, p); + lddp = BLKLDD(D, p); + if ( genD ) { - int tempDMm = M == D->mt-1 ? D->m-M*D->mb : D->mb; + int tempDpm = p == D->mt-1 ? D->m-p*D->mb : D->mb; INSERT_TASK_zlacpy( &options, - ChamLower, tempDMm, tempkmin, A->nb, - A(M, k), ldaM, - D(M, k), lddM ); + ChamLower, tempDpm, tempkmin, A->nb, + A(p, k), ldap, + D(p, k), lddp ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempDMm, tempkmin, + ChamUpper, tempDpm, tempkmin, 0., 1., - D(M, k), lddM ); + D(p, k), lddp ); #endif } - for (m = 0; m < B->mt; m++) { - ldbm = BLKLDD(B, m); - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; + + for (m = 0; m < C->mt; m++) { + ldcm = BLKLDD(C, m); + tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; INSERT_TASK_zunmqr( - &options, - side, trans, - tempmm, tempMm, tempkmin, ib, T->nb, - D(M, k), lddM, - T(M, k), T->mb, - B(m, M), ldbm); + &options, side, trans, + tempmm, temppn, tempkmin, ib, T->nb, + D(p, k), lddp, + T(p, k), T->mb, + C(m, p), ldcm); } - RUNTIME_data_flush( sequence, D(M, k) ); - RUNTIME_data_flush( sequence, T(M, k) ); + RUNTIME_data_flush( sequence, D(p, k) ); + RUNTIME_data_flush( sequence, T(p, k) ); - for (n = M+1; n < chameleon_min(M+BS, A->mt); n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + for (n = p+1; n < chameleon_min(p+BS, C->nt); n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; ldan = BLKLDD(A, n); - for (m = 0; m < B->mt; m++) { - tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); - node = B->get_rankof( B, m, n ); - RUNTIME_data_migrate( sequence, B(m, M), node ); - RUNTIME_data_migrate( sequence, B(m, n), node ); + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(m, p), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); /* TS kernel */ INSERT_TASK_ztpmqrt( @@ -453,43 +483,45 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans tempmm, tempnn, tempkn, 0, ib, T->nb, A(n, k), ldan, T(n, k), T->mb, - B(m, M), ldbm, - B(m, n), ldbm); + C(m, p), ldcm, + C(m, n), ldcm); } RUNTIME_data_flush( sequence, A(n, k) ); RUNTIME_data_flush( sequence, T(n, k) ); } } - for (RD = BS; RD < A->mt-k; RD *= 2) { - for (M = k; M+RD < A->mt; M += 2*RD) { - tempMRDm = M+RD == A->mt-1 ? A->m-(M+RD)*A->mb : A->mb; - ldaMRD = BLKLDD(A, M+RD); - for (m = 0; m < B->mt; m++) { - tempmm = m == B->mt-1 ? 
B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); - - node = B->get_rankof( B, m, M+RD ); - RUNTIME_data_migrate( sequence, B(m, M), node ); - RUNTIME_data_migrate( sequence, B(m, M+RD), node ); + for (RD = BS; RD < C->nt-k; RD *= 2) { + for (p = k; p+RD < C->nt; p += 2*RD) { + n = p + RD; + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + ldan = BLKLDD(A, n); + + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); + + node = C->get_rankof( C, m, n ); + RUNTIME_data_migrate( sequence, C(m, p), node ); + RUNTIME_data_migrate( sequence, C(m, n), node ); /* TT kernel */ INSERT_TASK_ztpmqrt( &options, side, trans, - tempmm, tempMRDm, tempkn, tempmm, ib, T->nb, - A (M+RD, k), ldaMRD, - T2(M+RD, k), T->mb, - B (m, M ), ldbm, - B (m, M+RD), ldbm); + tempmm, tempnn, tempkn, tempmm, ib, T->nb, + A (n, k), ldan, + T2(n, k), T->mb, + C (m, p), ldcm, + C (m, n), ldcm); } - RUNTIME_data_flush( sequence, A (M+RD, k) ); - RUNTIME_data_flush( sequence, T2(M+RD, k) ); + RUNTIME_data_flush( sequence, A (n, k) ); + RUNTIME_data_flush( sequence, T2(n, k) ); } } /* Restore the original location of the tiles */ - for (m = 0; m < B->mt; m++) { - RUNTIME_data_migrate( sequence, B(m, k), - B->get_rankof( B, m, k ) ); + for (m = 0; m < C->mt; m++) { + RUNTIME_data_migrate( sequence, C(m, k), + C->get_rankof( C, m, k ) ); } RUNTIME_iteration_pop(chamctxt);