diff --git a/compute/pzgeqrf_param.c b/compute/pzgeqrf_param.c index 0831540e19f302dc8477787b3f48ec9ae798e72e..418d9ee8b797ab416109a57d90cc90a331d2fa72 100644 --- a/compute/pzgeqrf_param.c +++ b/compute/pzgeqrf_param.c @@ -30,8 +30,148 @@ * Parallel tile QR factorization (reduction Householder) - dynamic scheduling * * @param[in] genD - * Indicate if the copies of the geqrt tiles must be done to speedup - * computations in updates. + * Indicate if copies of the geqrt tiles must be done to speedup + * computations in updates. genD is considered only if D is not NULL. + * + * @param[in] uplo + * - ChamLower: Classic QR factorization of the matrix A. + * - ChamUpper: QR factorization of the TTQRT kernel. + * - ChamUpperLower: QR factorization of the TSQRT kernel. + */ +int chameleon_pzgeqrf_param_step( int genD, cham_uplo_t uplo, int k, int ib, + const libhqr_tree_t *qrtree, int *tiles, + CHAM_desc_t *A, CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D, + RUNTIME_option_t *options, RUNTIME_sequence_t *sequence ) +{ + CHAM_desc_t *T; + int m, n, i, p; + int L, nbgeqrt; + int tempkmin, tempkn, tempnn, tempmm; + int node, nbtiles; + + tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; + + /* The number of geqrt to apply */ + nbgeqrt = qrtree->getnbgeqrf( qrtree, k ); + + T = TS; + for (i = 0; i < nbgeqrt; i++) { + m = qrtree->getm( qrtree, k, i ); + + /* We skip the QR factorization if this is the last diagonal tile */ + if ( (uplo == ChamUpper) && (m == k) ) { + continue; + } + + tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + tempkmin = chameleon_min(tempmm, tempkn); + + INSERT_TASK_zgeqrt( + options, + tempmm, tempkn, ib, T->nb, + A(m, k), T(m, k) ); + + if ( genD ) { + int tempDmm = m == D->mt-1 ? D->m-m*D->mb : D->mb; + int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; + + INSERT_TASK_zlacpy( + options, + ChamLower, tempDmm, tempDkn, A->nb, + A(m, k), D(m, k) ); +#if defined(CHAMELEON_USE_CUDA) + INSERT_TASK_zlaset( + options, + ChamUpper, tempDmm, tempDkn, + 0., 1., + D(m, k) ); +#endif + } + + for (n = k+1; n < A->nt; n++) { + tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + INSERT_TASK_zunmqr( + options, + ChamLeft, ChamConjTrans, + tempmm, tempnn, tempkmin, ib, T->nb, + D(m, k), + T(m, k), + A(m, n)); + } + + if ( genD || ((k+1) < A->nt)) { + RUNTIME_data_flush( sequence, D(m, k) ); + } + RUNTIME_data_flush( sequence, T(m, k) ); + } + + /* Setting the order of the tiles */ + nbtiles = libhqr_walk_stepk( qrtree, k, tiles ); + + for (i = 0; i < nbtiles; i++) { + m = tiles[i]; + p = qrtree->currpiv( qrtree, k, m ); + + tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + + if ( qrtree->gettype( qrtree, k, m ) == LIBHQR_KILLED_BY_TS ) { + /* TS kernel */ + T = TS; + L = 0; + + /* Force TT kernel if this is the last diagonal tile */ + if ( (uplo == ChamUpper) && (m == k) ) { + L = tempmm; + } + } + else { + /* TT kernel */ + T = TT; + L = tempmm; + } + + node = A->get_rankof( A, m, k ); + RUNTIME_data_migrate( sequence, A(p, k), node ); + RUNTIME_data_migrate( sequence, A(m, k), node ); + + INSERT_TASK_ztpqrt( + options, + tempmm, tempkn, chameleon_min(L, tempkn), ib, T->nb, + A(p, k), + A(m, k), + T(m, k)); + + for (n = k+1; n < A->nt; n++) { + tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + + node = A->get_rankof( A, m, n ); + RUNTIME_data_migrate( sequence, A(p, n), node ); + RUNTIME_data_migrate( sequence, A(m, n), node ); + + INSERT_TASK_ztpmqrt( + options, + ChamLeft, ChamConjTrans, + tempmm, tempnn, A->nb, L, ib, T->nb, + A(m, k), + T(m, k), + A(p, n), + A(m, n)); + } + RUNTIME_data_flush( sequence, A(m, k) ); + RUNTIME_data_flush( sequence, T(m, k) ); + } + + return tiles[nbtiles]; +} + + +/** + * Parallel tile QR factorization (reduction Householder) - dynamic scheduling + * + * @param[in] genD + * Indicate if copies of the geqrt tiles must be done to speedup + * computations in updates. genD is considered only if D is not NULL. + * */ void chameleon_pzgeqrf_param( int genD, int K, const libhqr_tree_t *qrtree, CHAM_desc_t *A, @@ -40,14 +180,11 @@ void chameleon_pzgeqrf_param( int genD, int K, { CHAM_context_t *chamctxt; RUNTIME_option_t options; - CHAM_desc_t *T; size_t ws_worker = 0; size_t ws_host = 0; - int k, m, n, i, p; - int L, nbgeqrt; - int tempkmin, tempkn, tempnn, tempmm; - int ib, node, nbtiles, *tiles; + int k, n; + int ib, *tiles; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) { @@ -57,7 +194,7 @@ void chameleon_pzgeqrf_param( int genD, int K, ib = CHAMELEON_IB; - if ( D == NULL ) { + if ( (genD == 0) || (D == NULL) ) { D = A; genD = 0; } @@ -85,109 +222,13 @@ void chameleon_pzgeqrf_param( int genD, int K, RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); /* Initialisation of temporary tiles array */ - tiles = (int*)calloc(qrtree->mt, sizeof(int)); + tiles = (int*)calloc( qrtree->mt, sizeof(int) ); for (k = 0; k < K; k++) { - RUNTIME_iteration_push(chamctxt, k); - tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; - - /* The number of geqrt to apply */ - nbgeqrt = qrtree->getnbgeqrf(qrtree, k); - - T = TS; - for (i = 0; i < nbgeqrt; i++) { - m = qrtree->getm(qrtree, k, i); - tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - tempkmin = chameleon_min(tempmm, tempkn); - - INSERT_TASK_zgeqrt( - &options, - tempmm, tempkn, ib, T->nb, - A(m, k), - T(m, k)); - - if ( genD ) { - int tempDmm = m == D->mt-1 ? D->m-m*D->mb : D->mb; - int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; + RUNTIME_iteration_push( chamctxt, k ); - INSERT_TASK_zlacpy( - &options, - ChamLower, tempDmm, tempDkn, A->nb, - A(m, k), - D(m, k) ); -#if defined(CHAMELEON_USE_CUDA) - INSERT_TASK_zlaset( - &options, - ChamUpper, tempDmm, tempDkn, - 0., 1., - D(m, k) ); -#endif - } - - for (n = k+1; n < A->nt; n++) { - tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; - INSERT_TASK_zunmqr( - &options, - ChamLeft, ChamConjTrans, - tempmm, tempnn, tempkmin, ib, T->nb, - D(m, k), - T(m, k), - A(m, n)); - } - RUNTIME_data_flush( sequence, D(m, k) ); - RUNTIME_data_flush( sequence, T(m, k) ); - } - - /* Setting the order of the tiles */ - nbtiles = libhqr_walk_stepk( qrtree, k, tiles ); - - for (i = 0; i < nbtiles; i++) { - m = tiles[i]; - p = qrtree->currpiv(qrtree, k, m); - - tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - - if ( qrtree->gettype(qrtree, k, m) == LIBHQR_KILLED_BY_TS ) { - /* TS kernel */ - T = TS; - L = 0; - } - else { - /* TT kernel */ - T = TT; - L = tempmm; - } - - node = A->get_rankof( A, m, k ); - RUNTIME_data_migrate( sequence, A(p, k), node ); - RUNTIME_data_migrate( sequence, A(m, k), node ); - - INSERT_TASK_ztpqrt( - &options, - tempmm, tempkn, chameleon_min(L, tempkn), ib, T->nb, - A(p, k), - A(m, k), - T(m, k)); - - for (n = k+1; n < A->nt; n++) { - tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; - - node = A->get_rankof( A, m, n ); - RUNTIME_data_migrate( sequence, A(p, n), node ); - RUNTIME_data_migrate( sequence, A(m, n), node ); - - INSERT_TASK_ztpmqrt( - &options, - ChamLeft, ChamConjTrans, - tempmm, tempnn, A->nb, L, ib, T->nb, - A(m, k), - T(m, k), - A(p, n), - A(m, n)); - } - RUNTIME_data_flush( sequence, A(m, k) ); - RUNTIME_data_flush( sequence, T(m, k) ); - } + chameleon_pzgeqrf_param_step( genD, ChamLower, k, ib, qrtree, tiles, + A, TS, TT, D, &options, sequence ); /* Restore the original location of the tiles */ for (n = k; n < A->nt; n++) { @@ -195,10 +236,10 @@ void chameleon_pzgeqrf_param( int genD, int K, A->get_rankof( A, k, n ) ); } - RUNTIME_iteration_pop(chamctxt); + RUNTIME_iteration_pop( chamctxt ); } - free(tiles); - RUNTIME_options_ws_free(&options); - RUNTIME_options_finalize(&options, chamctxt); + free( tiles ); + RUNTIME_options_ws_free( &options ); + RUNTIME_options_finalize( &options, chamctxt ); } diff --git a/compute/pzungqr_param.c b/compute/pzungqr_param.c index e9046ab30581d6f38e35f9514a9070fa36c89802..5ffb77ab7e61ac7dbbfdd80222bf245581f2d744 100644 --- a/compute/pzungqr_param.c +++ b/compute/pzungqr_param.c @@ -28,6 +28,131 @@ /** * Parallel construction of Q using tile V (application to identity) - dynamic scheduling + * + * @param[in] genD + * Indicate if the copies of the A tiles must be done to speedup + * computations in updates. + * + * @param[in] uplo + * Indicate which kind of factorization has been performed on A to apply + * the respective Q generation. + * - ChamLower: Apply Classic QR factorization of the matrix A + * - ChamUpper: Apply the factorization of the upper part from a TT kernel. + * - ChamUpperLower: Apply the factorization of the full tile from a TS kernel. + */ +void chameleon_pzungqr_param_step( int genD, cham_uplo_t uplo, int k, int ib, + const libhqr_tree_t *qrtree, int nbtiles, int *tiles, + CHAM_desc_t *A, CHAM_desc_t *Q, + CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D, + RUNTIME_option_t *options, RUNTIME_sequence_t *sequence ) +{ + CHAM_desc_t *T; + int m, n, i, p, L; + int tempmm, tempnn, tempkmin, tempkn; + int nbgeqrt, node; + + tempkn = k == A->nt-1 ? A->n - k * A->nb : A->nb; + + for (i = nbtiles-1; i >= 0; i--) { + m = tiles[i]; + p = qrtree->currpiv( qrtree, k, m ); + + tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb; + + if( qrtree->gettype( qrtree, k, m ) == LIBHQR_KILLED_BY_TS ) { + /* TS kernel */ + T = TS; + L = 0; + + /* Force TT kernel if this is the last diagonal tile */ + if ( (uplo == ChamUpper) && (m == k) ) { + L = tempmm; + } + } + else { + /* TT kernel */ + T = TT; + L = tempmm; + } + + for (n = k; n < Q->nt; n++) { + tempnn = n == Q->nt-1 ? Q->n-n*Q->nb : Q->nb; + + node = Q->get_rankof( Q, m, n ); + RUNTIME_data_migrate( sequence, Q(p, n), node ); + RUNTIME_data_migrate( sequence, Q(m, n), node ); + + INSERT_TASK_ztpmqrt( + options, + ChamLeft, ChamNoTrans, + tempmm, tempnn, tempkn, L, ib, T->nb, + A(m, k), + T(m, k), + Q(p, n), + Q(m, n)); + } + RUNTIME_data_flush( sequence, A(m, k) ); + RUNTIME_data_flush( sequence, T(m, k) ); + } + + T = TS; + + /* The number of geqrt to apply */ + nbgeqrt = qrtree->getnbgeqrf( qrtree, k ); + for (i = 0; i < nbgeqrt; i++) { + m = qrtree->getm( qrtree, k, i ); + + /* We skip the QR factorization if this is the last diagonal tile */ + if ( (uplo == ChamUpper) && (m == k) ) { + continue; + } + + tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + tempkmin = chameleon_min( tempmm, tempkn ); + + if ( genD ) { + int tempDmm = m == D->mt-1 ? D->m - m * D->mb : D->mb; + INSERT_TASK_zlacpy( + options, + ChamLower, tempDmm, tempkmin, A->nb, + A(m, k), + D(m, k) ); +#if defined(CHAMELEON_USE_CUDA) + INSERT_TASK_zlaset( + options, + ChamUpper, tempDmm, tempkmin, + 0., 1., + D(m, k) ); +#endif + } + + for (n = k; n < Q->nt; n++) { + tempnn = n == Q->nt-1 ? Q->n-n*Q->nb : Q->nb; + + /* Restore the original location of the tiles */ + RUNTIME_data_migrate( sequence, Q(m, n), + Q->get_rankof( Q, m, n ) ); + + INSERT_TASK_zunmqr( + options, + ChamLeft, ChamNoTrans, + tempmm, tempnn, tempkmin, ib, T->nb, + D(m, k), + T(m, k), + Q(m, n)); + } + RUNTIME_data_flush( sequence, D(m, k) ); + RUNTIME_data_flush( sequence, T(m, k) ); + } +} + +/** + * Parallel construction of Q using tile V (application to identity) - dynamic scheduling + * + * @param[in] genD + * Indicate if the copies of the A tiles must be done to speedup + * computations in updates. genD is considered only if D is not NULL. + * */ void chameleon_pzungqr_param( int genD, int K, const libhqr_tree_t *qrtree, @@ -37,13 +162,9 @@ void chameleon_pzungqr_param( int genD, int K, { CHAM_context_t *chamctxt; RUNTIME_option_t options; - CHAM_desc_t *T; size_t ws_worker = 0; size_t ws_host = 0; - - int k, m, n, i, p, L; - int tempmm, tempnn, tempkmin, tempkn; - int ib, nbgeqrt, node, nbtiles, *tiles; + int k, ib, nbtiles, *tiles; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) { @@ -53,7 +174,7 @@ void chameleon_pzungqr_param( int genD, int K, ib = CHAMELEON_IB; - if (D == NULL) { + if ( D == NULL ) { D = A; genD = 0; } @@ -83,92 +204,13 @@ void chameleon_pzungqr_param( int genD, int K, for (k = K-1; k >=0; k--) { RUNTIME_iteration_push(chamctxt, k); - tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; - /* Setting the order of tiles */ nbtiles = libhqr_walk_stepk( qrtree, k, tiles ); - for (i = nbtiles-1; i >= 0; i--) { - m = tiles[i]; - p = qrtree->currpiv(qrtree, k, m); - - tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb; - - if( qrtree->gettype(qrtree, k, m) == LIBHQR_KILLED_BY_TS ) { - /* TS kernel */ - T = TS; - L = 0; - } - else { - /* TT kernel */ - T = TT; - L = tempmm; - } - - for (n = k; n < Q->nt; n++) { - tempnn = n == Q->nt-1 ? Q->n-n*Q->nb : Q->nb; - - node = Q->get_rankof( Q, m, n ); - RUNTIME_data_migrate( sequence, Q(p, n), node ); - RUNTIME_data_migrate( sequence, Q(m, n), node ); - - INSERT_TASK_ztpmqrt( - &options, - ChamLeft, ChamNoTrans, - tempmm, tempnn, tempkn, L, ib, T->nb, - A(m, k), - T(m, k), - Q(p, n), - Q(m, n)); - } - RUNTIME_data_flush( sequence, A(m, k) ); - RUNTIME_data_flush( sequence, T(m, k) ); - } - - T = TS; - - /* The number of geqrt to apply */ - nbgeqrt = qrtree->getnbgeqrf(qrtree, k); - for (i = 0; i < nbgeqrt; i++) { - m = qrtree->getm(qrtree, k, i); - - tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - tempkmin = chameleon_min(tempmm, tempkn); - - if ( genD ) { - int tempDmm = m == D->mt-1 ? D->m-m*D->mb : D->mb; - INSERT_TASK_zlacpy( - &options, - ChamLower, tempDmm, tempkmin, A->nb, - A(m, k), - D(m, k) ); -#if defined(CHAMELEON_USE_CUDA) - INSERT_TASK_zlaset( - &options, - ChamUpper, tempDmm, tempkmin, - 0., 1., - D(m, k) ); -#endif - } - - for (n = k; n < Q->nt; n++) { - tempnn = n == Q->nt-1 ? Q->n-n*Q->nb : Q->nb; - - /* Restore the original location of the tiles */ - RUNTIME_data_migrate( sequence, Q(m, n), - Q->get_rankof( Q, m, n ) ); - - INSERT_TASK_zunmqr( - &options, - ChamLeft, ChamNoTrans, - tempmm, tempnn, tempkmin, ib, T->nb, - D(m, k), - T(m, k), - Q(m, n)); - } - RUNTIME_data_flush( sequence, D(m, k) ); - RUNTIME_data_flush( sequence, T(m, k) ); - } + chameleon_pzungqr_param_step( genD, ChamLower, k, ib, + qrtree, nbtiles, tiles, + A, Q, TS, TT, D, + &options, sequence ); RUNTIME_iteration_pop(chamctxt); } diff --git a/control/compute_z.h b/control/compute_z.h index 82d3e99ca5858f37f507a32e442ac2504b4c3ae5..70e94325e4ad00cc0e7bdb826cd83bce2d458fe9 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -99,6 +99,15 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, CHAM_des void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzbuild( cham_uplo_t uplo, CHAM_desc_t *A, void *user_data, void* user_build_callback, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); +int chameleon_pzgeqrf_param_step( int genD, cham_uplo_t uplo, int k, int ib, + const libhqr_tree_t *qrtree, int *tiles, + CHAM_desc_t *A, CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D, + RUNTIME_option_t *options, RUNTIME_sequence_t *sequence ); +void chameleon_pzungqr_param_step( int genD, cham_uplo_t uplo, int k, int ib, + const libhqr_tree_t *qrtree, int nbtiles, int *tiles, + CHAM_desc_t *A, CHAM_desc_t *Q, + CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D, + RUNTIME_option_t *options, RUNTIME_sequence_t *sequence ); void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t *A, CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzgeqrf_param( int genD, int K, const libhqr_tree_t *qrtree,