diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt index 9f2c8ab6f22fddcca9a9eb7ecba132dc32a23e50..9b0e390599f9bb019f7227652d853e735f1a8e2e 100644 --- a/compute/CMakeLists.txt +++ b/compute/CMakeLists.txt @@ -141,7 +141,9 @@ set(ZSRC pzunmqr_param.c pzunmqrrh.c pztpgqrt.c + pztpgqrt_param.c pztpqrt.c + pztpqrt_param.c ### zgels.c zgels_param.c diff --git a/compute/pztpgqrt_param.c b/compute/pztpgqrt_param.c new file mode 100644 index 0000000000000000000000000000000000000000..31f749823fcdecf33fffc58b500da44a86b1522b --- /dev/null +++ b/compute/pztpgqrt_param.c @@ -0,0 +1,138 @@ +/** + * + * @file pztpgqrt_param.c + * + * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * @copyright 2016-2020 KAUST. All rights reserved. + * + *** + * + * @brief Chameleon computational routines + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2016-12-15 + * @precisions normal z -> s d c + * + */ +#include "control/common.h" +#include <stdlib.h> + +#define QTop(m,n) QTop, m, n + +#define A(m,n) A, m, n +#define Q(m,n) Q, m, n +#define T(m,n) T, m, n +#define D(m,n) D, m, n + +/** + * Parallel construction of Q using tile V (application to identity) - dynamic scheduling + * + * @param[in] genD + * Indicate if the copies of the A tiles must be done to speedup + * computations in updates. genD is considered only if D is not NULL. + * + * @param[in] uplo + * - ChamUpper: This corresponds to the TTQRT factorization kernel. Only the upper + * trapezoidal part of Q is referenced. + * - ChamLower, or ChamUpperLower: This corresponds to the TSQRT factorization + * kernel. The full Q is referenced. + */ +void chameleon_pztpgqrt_param( int genD, cham_uplo_t uplo, int K, + const libhqr_tree_t *qrtree, + CHAM_desc_t *A, CHAM_desc_t *QTop, CHAM_desc_t *Q, + CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + RUNTIME_option_t options; + size_t ws_worker = 0; + size_t ws_host = 0; + + int k, n, p; + int ib, node, nbtiles, *tiles; + + chamctxt = chameleon_context_self(); + if (sequence->status != CHAMELEON_SUCCESS) { + return; + } + RUNTIME_options_init(&options, chamctxt, sequence, request); + + ib = CHAMELEON_IB; + + if ( D == NULL ) { + D = A; + genD = 0; + } + + /* + * zunmqr = A->nb * ib + * ztpmqrt = A->nb * ib + */ + ws_worker = A->nb * ib; + + /* Allocation of temporary (scratch) working space */ +#if defined(CHAMELEON_USE_CUDA) + /* + * ztpmqrt = 3 * A->nb * ib + */ + ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 ); +#endif + + ws_worker *= sizeof(CHAMELEON_Complex64_t); + ws_host *= sizeof(CHAMELEON_Complex64_t); + + RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); + + /* Initialisation of temporary tiles array */ + tiles = (int*)calloc(qrtree->mt, sizeof(int)); + + for (k = K-1; k >=0; k--) { + RUNTIME_iteration_push(chamctxt, k); + + /* Setting the order of tiles */ + nbtiles = libhqr_walk_stepk( qrtree, k, tiles ); + p = tiles[nbtiles]; + + /* Combine Bottom and Top matrices by merging last pivot with ATop(k,*) */ + { + CHAM_desc_t *T = TT; + int temppm = p == Q->mt-1 ? Q->m - p * Q->mb : Q->mb; + int tempkn = k == A->nt-1 ? A->n - k * A->nb : A->nb; + int tempnn; + int L = temppm; + + for (n = k; n < Q->nt; n++) { + tempnn = n == Q->nt-1 ? Q->n-n*Q->nb : Q->nb; + + node = Q->get_rankof( Q, p, n ); + RUNTIME_data_migrate( sequence, QTop(k, n), node ); + RUNTIME_data_migrate( sequence, Q( p, n), node ); + + INSERT_TASK_ztpmqrt( + &options, + ChamLeft, ChamNoTrans, + temppm, tempnn, tempkn, L, ib, T->nb, + A( p, k), + T( p, k), + QTop(k, n), + Q( p, n)); + } + + RUNTIME_data_flush( sequence, A(p, k) ); + RUNTIME_data_flush( sequence, T(p, k) ); + } + + chameleon_pzungqr_param_step( genD, uplo, k, ib, + qrtree, nbtiles, tiles, + A, Q, TS, TT, D, + &options, sequence ); + + RUNTIME_iteration_pop(chamctxt); + } + + free(tiles); + RUNTIME_options_ws_free(&options); + RUNTIME_options_finalize(&options, chamctxt); +} diff --git a/compute/pztpqrt_param.c b/compute/pztpqrt_param.c new file mode 100644 index 0000000000000000000000000000000000000000..e537587416f27753ffda1d0ceac4a75a4f837dcb --- /dev/null +++ b/compute/pztpqrt_param.c @@ -0,0 +1,153 @@ +/** + * + * @file pztpqrt_param.c + * + * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * @copyright 2016-2020 KAUST. All rights reserved. + * + *** + * + * @brief Chameleon computational routines + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2016-12-15 + * @precisions normal z -> s d c + * + */ +#include "control/common.h" +#include <stdlib.h> +#include <stdio.h> +#include "libhqr.h" + +#define ATop(m,n) ATop, (m), (n) +#define A(m,n) A, (m), (n) +#define T(m,n) T, (m), (n) +#define D(m,n) D, (m), (n) + +/** + * Parallel tile QR matrix reduction - Equivalent to tpqrt kernel for matrices. + * + * @param[in] genD + * Indicate if copies of the geqrt tiles must be done to speedup + * computations in updates. genD is considered only if D is not NULL. + * + * @param[in] uplo + * - ChamUpper: This corresponds to the former TTQRT kernel. Only the upper + * trapezoidal part of A is factorized. + * - ChamLower, or ChamUpperLower: This corresponds to the former TSQRT + * kernel. The full A is factorized. + */ +void chameleon_pztpqrt_param( int genD, cham_uplo_t uplo, int K, + const libhqr_tree_t *qrtree, CHAM_desc_t *ATop, CHAM_desc_t *A, + CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + RUNTIME_option_t options; + size_t ws_worker = 0; + size_t ws_host = 0; + + int k, n, p; + int ib, *tiles; + + chamctxt = chameleon_context_self(); + if (sequence->status != CHAMELEON_SUCCESS) { + return; + } + RUNTIME_options_init(&options, chamctxt, sequence, request); + + ib = CHAMELEON_IB; + + if ( (genD == 0) || (D == NULL) ) { + D = A; + genD = 0; + } + + /* + * zgeqrt = A->nb * (ib+1) + * zunmqr = A->nb * ib + * ztpqrt = A->nb * (ib+1) + * ztpmqrt = A->nb * ib + */ + ws_worker = A->nb * (ib+1); + + /* Allocation of temporary (scratch) working space */ +#if defined(CHAMELEON_USE_CUDA) + /* + * zunmqr = A->nb * ib + * ztpmqrt = 3 * A->nb * ib + */ + ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 ); +#endif + + ws_worker *= sizeof(CHAMELEON_Complex64_t); + ws_host *= sizeof(CHAMELEON_Complex64_t); + + RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); + + /* Initialisation of temporary tiles array */ + tiles = (int*)calloc(qrtree->mt, sizeof(int)); + + for (k = 0; k < K; k++) { + RUNTIME_iteration_push(chamctxt, k); + + p = chameleon_pzgeqrf_param_step( genD, uplo, k, ib, qrtree, tiles, + A, TS, TT, D, &options, sequence ); + + /* Combine with ATop and A by merging last pivot with A(k,k) */ + { + CHAM_desc_t *T; + int temppm = p == ATop->mt-1 ? ATop->m - p * ATop->mb : ATop->mb; + int tempkn = k == ATop->nt-1 ? ATop->n - k * ATop->nb : ATop->nb; + int L, node, tempnn; + + T = TT; + L = temppm; + + node = A->get_rankof( A, p, k ); + RUNTIME_data_migrate( sequence, ATop(k, k), node ); + RUNTIME_data_migrate( sequence, A( p, k), node ); + + INSERT_TASK_ztpqrt( + &options, + temppm, tempkn, chameleon_min(L, tempkn), ib, T->nb, + ATop(k, k), + A(p, k), + T(p, k)); + + for (n = k+1; n < A->nt; n++) { + tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + + node = A->get_rankof( A, p, n ); + RUNTIME_data_migrate( sequence, ATop(k, n), node ); + RUNTIME_data_migrate( sequence, A( p, n), node ); + + INSERT_TASK_ztpmqrt( + &options, + ChamLeft, ChamConjTrans, + temppm, tempnn, A->nb, L, ib, T->nb, + A(p, k), + T(p, k), + ATop(k, n), + A(p, n)); + } + + RUNTIME_data_flush( sequence, A(p, k) ); + RUNTIME_data_flush( sequence, T(p, k) ); + } + + /* Restore the original location of the tiles */ + for (n = k; n < ATop->nt; n++) { + RUNTIME_data_migrate( sequence, ATop(k, n), + ATop->get_rankof( ATop, k, n ) ); + } + + RUNTIME_iteration_pop(chamctxt); + } + + free(tiles); + RUNTIME_options_ws_free(&options); + RUNTIME_options_finalize(&options, chamctxt); +} diff --git a/control/compute_z.h b/control/compute_z.h index 70e94325e4ad00cc0e7bdb826cd83bce2d458fe9..cd08503c32d68f78d5e09ad001c84f70772f9630 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -126,6 +126,15 @@ void chameleon_pzungqr_param( int genD, int K, const libhqr_tree_t *qrtree, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +void chameleon_pztpgqrt_param( int genD, cham_uplo_t uplo, int kt, const libhqr_tree_t *qrtree, + CHAM_desc_t *V2, CHAM_desc_t *Q1, CHAM_desc_t *Q2, + CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *DD, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); +void chameleon_pztpqrt_param( int genD, cham_uplo_t uplo, int K, + const libhqr_tree_t *qrtree, + CHAM_desc_t *ATop, CHAM_desc_t *A, + CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); /** * Gram function prototypes