tpqrt/tpgqrt: Add parallel algorithms using HQR reduction trees

e6909434 · Mathieu Faverge · 9805d22b · e6909434 · e6909434 · e6909434
Commit e6909434 authored 4 years ago by Mathieu Faverge
--- a/compute/CMakeLists.txt
+++ b/compute/CMakeLists.txt
@@ -141,7 +141,9 @@ set(ZSRC
    pzunmqr_param.c
    pzunmqrrh.c
    pztpgqrt.c
+    pztpgqrt_param.c
    pztpqrt.c
+    pztpqrt_param.c
    ###
    zgels.c
    zgels_param.c

--- a/compute/pztpgqrt_param.c
+++ b/compute/pztpgqrt_param.c
+/**
+ *
+ * @file pztpgqrt_param.c
+ *
+ * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ * @copyright 2016-2020 KAUST. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon computational routines
+ *
+ * @version 1.0.0
+ * @author Mathieu Faverge
+ * @date 2016-12-15
+ * @precisions normal z -> s d c
+ *
+ */
+#include "control/common.h"
+#include <stdlib.h>
+
+#define QTop(m,n) QTop, m, n
+
+#define A(m,n) A, m, n
+#define Q(m,n) Q, m, n
+#define T(m,n) T, m, n
+#define D(m,n) D, m, n
+
+/**
+ *  Parallel construction of Q using tile V (application to identity) - dynamic scheduling
+ *
+ * @param[in] genD
+ *         Indicate if the copies of the A tiles must be done to speedup
+ *         computations in updates. genD is considered only if D is not NULL.
+ *
+ * @param[in] uplo
+ *         - ChamUpper: This corresponds to the TTQRT factorization kernel. Only the upper
+ *           trapezoidal part of Q is referenced.
+ *         - ChamLower, or ChamUpperLower: This corresponds to the TSQRT factorization
+ *           kernel. The full Q is referenced.
+ */
+void chameleon_pztpgqrt_param( int genD, cham_uplo_t uplo, int K,
+                               const libhqr_tree_t *qrtree,
+                               CHAM_desc_t *A, CHAM_desc_t *QTop, CHAM_desc_t *Q,
+                               CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
+                               RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
+{
+    CHAM_context_t *chamctxt;
+    RUNTIME_option_t options;
+    size_t ws_worker = 0;
+    size_t ws_host = 0;
+
+    int k, n, p;
+    int ib, node, nbtiles, *tiles;
+
+    chamctxt = chameleon_context_self();
+    if (sequence->status != CHAMELEON_SUCCESS) {
+        return;
+    }
+    RUNTIME_options_init(&options, chamctxt, sequence, request);
+
+    ib = CHAMELEON_IB;
+
+    if ( D == NULL ) {
+        D    = A;
+        genD = 0;
+    }
+
+    /*
+     * zunmqr  = A->nb * ib
+     * ztpmqrt = A->nb * ib
+     */
+    ws_worker = A->nb * ib;
+
+    /* Allocation of temporary (scratch) working space */
+#if defined(CHAMELEON_USE_CUDA)
+    /*
+     * ztpmqrt = 3 * A->nb * ib
+     */
+    ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 );
+#endif
+
+    ws_worker *= sizeof(CHAMELEON_Complex64_t);
+    ws_host   *= sizeof(CHAMELEON_Complex64_t);
+
+    RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
+
+    /* Initialisation of temporary tiles array */
+    tiles = (int*)calloc(qrtree->mt, sizeof(int));
+
+    for (k = K-1; k >=0; k--) {
+        RUNTIME_iteration_push(chamctxt, k);
+
+        /* Setting the order of tiles */
+        nbtiles = libhqr_walk_stepk( qrtree, k, tiles );
+        p = tiles[nbtiles];
+
+        /* Combine Bottom and Top matrices by merging last pivot with ATop(k,*) */
+        {
+            CHAM_desc_t *T = TT;
+            int temppm = p == Q->mt-1 ? Q->m - p * Q->mb : Q->mb;
+            int tempkn = k == A->nt-1 ? A->n - k * A->nb : A->nb;
+            int tempnn;
+            int L = temppm;
+
+            for (n = k; n < Q->nt; n++) {
+                tempnn = n == Q->nt-1 ? Q->n-n*Q->nb : Q->nb;
+
+                node = Q->get_rankof( Q, p, n );
+                RUNTIME_data_migrate( sequence, QTop(k, n), node );
+                RUNTIME_data_migrate( sequence, Q(   p, n), node );
+
+                INSERT_TASK_ztpmqrt(
+                    &options,
+                    ChamLeft, ChamNoTrans,
+                    temppm, tempnn, tempkn, L, ib, T->nb,
+                    A(   p, k),
+                    T(   p, k),
+                    QTop(k, n),
+                    Q(   p, n));
+            }
+
+            RUNTIME_data_flush( sequence, A(p, k) );
+            RUNTIME_data_flush( sequence, T(p, k) );
+        }
+
+        chameleon_pzungqr_param_step( genD, uplo, k, ib,
+                                      qrtree, nbtiles, tiles,
+                                      A, Q, TS, TT, D,
+                                      &options, sequence );
+
+        RUNTIME_iteration_pop(chamctxt);
+    }
+
+    free(tiles);
+    RUNTIME_options_ws_free(&options);
+    RUNTIME_options_finalize(&options, chamctxt);
+}
--- a/compute/pztpqrt_param.c
+++ b/compute/pztpqrt_param.c
+/**
+ *
+ * @file pztpqrt_param.c
+ *
+ * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ * @copyright 2016-2020 KAUST. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon computational routines
+ *
+ * @version 1.0.0
+ * @author Mathieu Faverge
+ * @date 2016-12-15
+ * @precisions normal z -> s d c
+ *
+ */
+#include "control/common.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include "libhqr.h"
+
+#define ATop(m,n) ATop, (m), (n)
+#define A(m,n) A, (m), (n)
+#define T(m,n) T, (m), (n)
+#define D(m,n) D, (m), (n)
+
+/**
+ *  Parallel tile QR matrix reduction - Equivalent to tpqrt kernel for matrices.
+ *
+ * @param[in] genD
+ *         Indicate if copies of the geqrt tiles must be done to speedup
+ *         computations in updates. genD is considered only if D is not NULL.
+ *
+ * @param[in] uplo
+ *         - ChamUpper: This corresponds to the former TTQRT kernel. Only the upper
+ *           trapezoidal part of A is factorized.
+ *         - ChamLower, or ChamUpperLower: This corresponds to the former TSQRT
+ *           kernel. The full A is factorized.
+ */
+void chameleon_pztpqrt_param( int genD, cham_uplo_t uplo, int K,
+                              const libhqr_tree_t *qrtree, CHAM_desc_t *ATop, CHAM_desc_t *A,
+                              CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
+                              RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
+{
+    CHAM_context_t *chamctxt;
+    RUNTIME_option_t options;
+    size_t ws_worker = 0;
+    size_t ws_host = 0;
+
+    int k, n, p;
+    int ib, *tiles;
+
+    chamctxt = chameleon_context_self();
+    if (sequence->status != CHAMELEON_SUCCESS) {
+        return;
+    }
+    RUNTIME_options_init(&options, chamctxt, sequence, request);
+
+    ib = CHAMELEON_IB;
+
+    if ( (genD == 0) || (D == NULL) ) {
+        D    = A;
+        genD = 0;
+    }
+
+    /*
+     * zgeqrt  = A->nb * (ib+1)
+     * zunmqr  = A->nb * ib
+     * ztpqrt  = A->nb * (ib+1)
+     * ztpmqrt = A->nb * ib
+     */
+    ws_worker = A->nb * (ib+1);
+
+    /* Allocation of temporary (scratch) working space */
+#if defined(CHAMELEON_USE_CUDA)
+    /*
+     * zunmqr  =     A->nb * ib
+     * ztpmqrt = 3 * A->nb * ib
+     */
+    ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 );
+#endif
+
+    ws_worker *= sizeof(CHAMELEON_Complex64_t);
+    ws_host   *= sizeof(CHAMELEON_Complex64_t);
+
+    RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
+
+    /* Initialisation of temporary tiles array */
+    tiles = (int*)calloc(qrtree->mt, sizeof(int));
+
+    for (k = 0; k < K; k++) {
+        RUNTIME_iteration_push(chamctxt, k);
+
+        p = chameleon_pzgeqrf_param_step( genD, uplo, k, ib, qrtree, tiles,
+                                          A, TS, TT, D, &options, sequence );
+
+        /* Combine with ATop and A by merging last pivot with A(k,k) */
+        {
+            CHAM_desc_t *T;
+            int temppm = p == ATop->mt-1 ? ATop->m - p * ATop->mb : ATop->mb;
+            int tempkn = k == ATop->nt-1 ? ATop->n - k * ATop->nb : ATop->nb;
+            int L, node, tempnn;
+
+            T = TT;
+            L = temppm;
+
+            node = A->get_rankof( A, p, k );
+            RUNTIME_data_migrate( sequence, ATop(k, k), node );
+            RUNTIME_data_migrate( sequence, A(   p, k), node );
+
+            INSERT_TASK_ztpqrt(
+                &options,
+                temppm, tempkn, chameleon_min(L, tempkn), ib, T->nb,
+                ATop(k, k),
+                A(p, k),
+                T(p, k));
+
+            for (n = k+1; n < A->nt; n++) {
+                tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
+
+                node = A->get_rankof( A, p, n );
+                RUNTIME_data_migrate( sequence, ATop(k, n), node );
+                RUNTIME_data_migrate( sequence, A(   p, n), node );
+
+                INSERT_TASK_ztpmqrt(
+                    &options,
+                    ChamLeft, ChamConjTrans,
+                    temppm, tempnn, A->nb, L, ib, T->nb,
+                    A(p, k),
+                    T(p, k),
+                    ATop(k, n),
+                    A(p, n));
+            }
+
+            RUNTIME_data_flush( sequence, A(p, k) );
+            RUNTIME_data_flush( sequence, T(p, k) );
+        }
+
+        /* Restore the original location of the tiles */
+        for (n = k; n < ATop->nt; n++) {
+            RUNTIME_data_migrate( sequence, ATop(k, n),
+                                  ATop->get_rankof( ATop, k, n ) );
+        }
+
+        RUNTIME_iteration_pop(chamctxt);
+    }
+
+    free(tiles);
+    RUNTIME_options_ws_free(&options);
+    RUNTIME_options_finalize(&options, chamctxt);
+}
--- a/control/compute_z.h
+++ b/control/compute_z.h
@@ -126,6 +126,15 @@ void chameleon_pzungqr_param( int genD, int K, const libhqr_tree_t *qrtree,
                              CHAM_desc_t *A, CHAM_desc_t *Q,
                              CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
                              RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
+void chameleon_pztpgqrt_param( int genD, cham_uplo_t uplo, int kt, const libhqr_tree_t *qrtree,
+                               CHAM_desc_t *V2, CHAM_desc_t *Q1, CHAM_desc_t *Q2,
+                               CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *DD,
+                               RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
+void chameleon_pztpqrt_param( int genD, cham_uplo_t uplo, int K,
+                              const libhqr_tree_t *qrtree,
+                              CHAM_desc_t *ATop, CHAM_desc_t *A,
+                              CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
+                              RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );

 /**
 * Gram function prototypes