From 945e4dec3888f7dcaccf6ac992e7df69eab3a572 Mon Sep 17 00:00:00 2001
From: Ana Hourcau <ana.hourcau@inria.fr>
Date: Wed, 15 May 2024 15:51:21 +0200
Subject: [PATCH] compute: Adding hered algorithms for reduction on hermitian
 matrices

---
 compute/CMakeLists.txt          |   5 +-
 compute/pzhered.c               | 288 ++++++++++++++++++++++++++++++++
 compute/zhered.c                | 173 +++++++++++++++++++
 control/compute_z.h             |   5 +-
 include/chameleon/chameleon_z.h |   2 +
 5 files changed, 471 insertions(+), 2 deletions(-)
 create mode 100644 compute/pzhered.c
 create mode 100644 compute/zhered.c

diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt
index 6dfa83891..cc25fb7d2 100644
--- a/compute/CMakeLists.txt
+++ b/compute/CMakeLists.txt
@@ -27,7 +27,8 @@
 #  @author Alycia Lisito
 #  @author Loris Lucido
 #  @author Matthieu Kuhn
-#  @date 2024-04-03
+#  @author Ana Hourcau
+#  @date 2024-07-17
 #
 ###
 
@@ -193,10 +194,12 @@ set(ZSRC
     ##################
     # MIXED PRECISION
     ##################
+    pzhered.c
     pzlag2c.c
     pzgered.c
     pzgerst.c
     ###
+    zhered.c
     zgered.c
     zgerst.c
     #zcgels.c
diff --git a/compute/pzhered.c b/compute/pzhered.c
new file mode 100644
index 000000000..97e171b3f
--- /dev/null
+++ b/compute/pzhered.c
@@ -0,0 +1,288 @@
+/**
+ *
+ * @file pzhered.c
+ *
+ * @copyright 2009-2014 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zhered parallel algorithm
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @author Ana Hourcau
+ * @date 2024-07-17
+ * @precisions normal z -> z d
+ *
+ */
+// ALLOC_WS :  A->mb
+// ALLOC_WS :  A->nb
+// WS_ADD :  A->mb + A->nb
+#include "control/common.h"
+#include <coreblas/lapacke.h>
+
+#define A(m, n) A, (m), (n)
+#define W(desc, m, n) (desc), (m), (n)
+
+static inline void
+chameleon_pzhered_frb( cham_trans_t trans, cham_uplo_t uplo,
+                       CHAM_desc_t *A, CHAM_desc_t *Wnorm, CHAM_desc_t *Welt,
+                       RUNTIME_option_t *options )
+{
+    double alpha = 1.0;
+    double beta = 0.0;
+
+    int m, n;
+    int MT = A->mt;
+    int NT = A->nt;
+    int M  = A->m;
+    int N  = A->n;
+    int P  = Welt->p;
+    int Q  = Welt->q;
+
+    /* Initialize workspaces for tile norms */
+    for (m = 0; m < Wnorm->mt; m++)
+    {
+        for (n = 0; n < NT; n++)
+        {
+            INSERT_TASK_dlaset(
+                options,
+                ChamUpperLower, Wnorm->mb, Wnorm->nb,
+                alpha, beta,
+                W(Wnorm, m, n));
+        }
+    }
+
+    /* Initialize workspaces */
+    for (m = 0; m < Welt->mt; m++)
+    {
+        for (n = 0; n < Welt->nt; n++)
+        {
+            INSERT_TASK_dlaset(
+                options,
+                ChamUpperLower, Welt->mb, Welt->nb,
+                alpha, beta,
+                W(Welt, m, n));
+        }
+    }
+
+    /**
+     * Step 1:
+     *  For j in [1,Q], Welt(m, j) = reduce( A(m, j+k*Q) )
+     */
+    for (m = 0; m < MT; m++)
+    {
+        int nmin = (uplo == ChamUpper) ? m : 0;
+        int nmax = (uplo == ChamLower) ? chameleon_min(m + 1, NT) : NT;
+
+        int tempmm = (m == (MT - 1)) ? M - m * A->mb : A->mb;
+
+        for (n = nmin; n < nmax; n++)
+        {
+            int tempnn = (n == (NT - 1)) ? N - n * A->nb : A->nb;
+
+            if (n == m)
+            {
+                if ( trans == ChamConjTrans ) {
+                    INSERT_TASK_zhessq(
+                        options, ChamEltwise, uplo, tempmm,
+                        A(m, n), W( Wnorm, m, n) );
+                }
+                else {
+                    INSERT_TASK_zsyssq(
+                        options, ChamEltwise, uplo, tempmm,
+                        A(m, n), W( Wnorm, m, n) );
+                }
+            }
+            else
+            {
+                INSERT_TASK_zgessq(
+                    options, ChamEltwise, tempmm, tempnn,
+                    A(m, n), W( Wnorm, m, n ));
+                INSERT_TASK_zgessq(
+                    options, ChamEltwise, tempmm, tempnn,
+                    A(m, n), W( Wnorm, n, m ));
+            }
+        }
+    }
+
+    for(m = 0; m < MT; m++) {
+        for(n = Q; n < NT; n++) {
+            INSERT_TASK_dplssq(
+                options, ChamEltwise, 1, 1, W( Wnorm, m, n), W( Welt, m, n%Q) );
+        }
+
+        /**
+         * Step 2:
+         *  For each j, W(m, j) = reduce( W( Welt, m, 0..Q-1) )
+         */
+        for(n = 1; n < Q; n++) {
+            INSERT_TASK_dplssq(
+                options, ChamEltwise, 1, 1, W( Welt, m, n), W( Welt, m, 0) );
+        }
+    }
+
+    /**
+     * Step 3:
+     *  For m in 0..P-1, Welt(m, n) = max( Welt(m..mt[P], n ) )
+     */
+    for(m = P; m < MT; m++) {
+        INSERT_TASK_dplssq(
+            options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, m%P, 0) );
+    }
+
+    /**
+     * Step 4:
+     *  For each i, Welt(i, n) = max( Welt(0..P-1, n) )
+     */
+    for(m = 1; m < P; m++) {
+        INSERT_TASK_dplssq(
+            options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, 0, 0) );
+    }
+
+    /* Compute the norm of each tile, and the full norm */
+    for (m = 0; m < MT; m++)
+    {
+        int nmin = (uplo == ChamUpper) ? m : 0;
+        int nmax = (uplo == ChamLower) ? chameleon_min(m + 1, NT) : NT;
+
+        for (n = nmin; n < nmax; n++)
+        {
+            /* Compute the final norm of the tile */
+            INSERT_TASK_dplssq2(
+                options, 1, W( Wnorm, m, n ) );
+        }
+    }
+    INSERT_TASK_dplssq2(
+        options, 1, W( Welt, 0, 0) );
+
+    /**
+     * Broadcast the result
+     */
+    for (m = 0; m < A->p; m++)
+    {
+        for (n = 0; n < A->q; n++)
+        {
+            if ((m != 0) || (n != 0))
+            {
+                INSERT_TASK_dlacpy(
+                    options,
+                    ChamUpperLower, 1, 1,
+                    W(Welt, 0, 0), W(Welt, m, n));
+            }
+        }
+    }
+}
+
+/**
+ *
+ */
+void chameleon_pzhered( cham_trans_t trans, cham_uplo_t uplo, double prec, CHAM_desc_t *A,
+                        RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
+{
+    CHAM_context_t *chamctxt;
+    RUNTIME_option_t options;
+    CHAM_desc_t Wcol;
+    CHAM_desc_t Welt;
+    double gnorm, lnorm, threshold, eps;
+
+    int workmt, worknt;
+    int m, n;
+
+    chamctxt = chameleon_context_self();
+    if (sequence->status != CHAMELEON_SUCCESS)
+    {
+        return;
+    }
+    RUNTIME_options_init(&options, chamctxt, sequence, request);
+
+    workmt = chameleon_max(A->mt, A->p);
+    worknt = chameleon_max(A->nt, A->q);
+
+    RUNTIME_options_ws_alloc(&options, 1, 0);
+
+    /* Matrix to store the norm of each element */
+    chameleon_desc_init(&Wcol, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 2, 1, 2,
+                        A->mt * 2, A->nt, 0, 0, A->mt * 2, A->nt, A->p, A->q,
+                        NULL, NULL, A->get_rankof_init, A->get_rankof_init_arg);
+
+    /* Matrix to compute the global frobenius norm */
+    chameleon_desc_init(&Welt, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 2, 1, 2,
+                        workmt * 2, worknt, 0, 0, workmt * 2, worknt, A->p, A->q,
+                        NULL, NULL, NULL, NULL);
+
+    chameleon_pzhered_frb( trans, uplo, A, &Wcol, &Welt, &options );
+
+    CHAMELEON_Desc_Flush(&Wcol, sequence);
+    CHAMELEON_Desc_Flush(&Welt, sequence);
+    CHAMELEON_Desc_Flush(A, sequence);
+
+    RUNTIME_sequence_wait(chamctxt, sequence);
+
+    gnorm = *((double *)Welt.get_blkaddr(&Welt, A->myrank / A->q, A->myrank % A->q));
+    chameleon_desc_destroy(&Welt);
+
+    /**
+     * Reduce the precision of the tiles if possible
+     */
+    if (prec < 0.)
+    {
+#if !defined(CHAMELEON_SIMULATION)
+        eps = LAPACKE_dlamch_work('e');
+#else
+#if defined(PRECISION_z) || defined(PRECISION_d)
+        eps = 1.e-15;
+#else
+        eps = 1.e-7;
+#endif
+#endif
+    }
+    else
+    {
+        eps = prec;
+    }
+    threshold = (eps * gnorm) / (double)(chameleon_min(A->mt, A->nt));
+
+#if defined(CHAMELEON_DEBUG_GERED)
+    fprintf(stderr,
+            "[%2d] The norm of A is:           %e\n"
+            "[%2d] The requested precision is: %e\n"
+            "[%2d] The computed threshold is:  %e\n",
+            A->myrank, gnorm,
+            A->myrank, eps,
+            A->myrank, threshold);
+#endif
+    for (m = 0; m < A->mt; m++)
+    {
+        int tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb;
+        int nmin = (uplo == ChamUpper) ? m : 0;
+        int nmax = (uplo == ChamLower) ? chameleon_min(m + 1, A->nt) : A->nt;
+
+        for (n = nmin; n < nmax; n++)
+        {
+            CHAM_tile_t *tile = A->get_blktile(A, m, n);
+
+            int tempnn = (n == (A->nt - 1)) ? A->n - n * A->nb : A->nb;
+
+            /*
+                * u_{high} = 1e-16 (later should be application accuracy)
+                * u_{low} = 1e-8
+                * ||A_{i,j}||_F  < u_{high} * || A ||_F / (nt * u_{low})
+                * ||A_{i,j}||_F  < threshold / u_{low}
+                */
+
+            INSERT_TASK_zgered( &options, threshold,
+                                tempmm, tempnn, A( m, n ), W( &Wcol, m, n ) );
+        }
+    }
+
+    CHAMELEON_Desc_Flush(A, sequence);
+    RUNTIME_sequence_wait(chamctxt, sequence);
+
+    chameleon_desc_destroy(&Wcol);
+    RUNTIME_options_ws_free(&options);
+    RUNTIME_options_finalize(&options, chamctxt);
+}
diff --git a/compute/zhered.c b/compute/zhered.c
new file mode 100644
index 000000000..5603ea68b
--- /dev/null
+++ b/compute/zhered.c
@@ -0,0 +1,173 @@
+/**
+ *
+ * @file zhered.c
+ *
+ * @copyright 2009-2014 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zhered wrappers
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @author Ana Hourcau
+ * @date 2024-07-17
+ * @precisions normal z -> z d
+ *
+ */
+#include "control/common.h"
+
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t_Tile
+ *
+ * @brief Computes the Cholesky factorization of a symmetric positive definite
+ * or Hermitian positive definite matrix with mixed precision.
+ *
+ * This is the synchronous version of CHAMELEON_zheredinit_Tile_Async().  It
+ * operates on matrices stored by tiles with tiles of potentially different
+ * precisions.  All matrices are passed through descriptors.  All dimensions are
+ * taken from the descriptors.
+ *
+ *******************************************************************************
+ *
+ * @param[in] uplo
+ *          = ChamUpper: Upper triangle of A is stored;
+ *          = ChamLower: Lower triangle of A is stored.
+ *
+ * @param[in] A
+ *          On entry, the symmetric positive definite (or Hermitian) matrix A.
+ *          If uplo = ChamUpper, the leading N-by-N upper triangular part of A
+ *          contains the upper triangular part of the matrix A, and the strictly lower triangular
+ *          part of A is not referenced.
+ *          If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower
+ *          triangular part of the matrix A, and the strictly upper triangular part of A is not
+ *          referenced.
+ *          On exit, if return value = 0, the factor U or L from the Cholesky factorization
+ *          A = U^H*U or A = L*L^H.
+ *
+ *******************************************************************************
+ *
+ * @retval CHAMELEON_SUCCESS successful exit
+ * @retval >0 if i, the leading minor of order i of A is not positive definite, so the
+ *               factorization could not be completed, and the solution has not been computed.
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zhered
+ * @sa CHAMELEON_zhered_Tile_Async
+ * @sa CHAMELEON_cpotrfmp_Tile
+ * @sa CHAMELEON_dpotrfmp_Tile
+ * @sa CHAMELEON_spotrfmp_Tile
+ * @sa CHAMELEON_zpotrs_Tile
+ *
+ */
+int CHAMELEON_zhered_Tile( cham_uplo_t uplo, double precision, CHAM_desc_t *A )
+{
+    CHAM_context_t *chamctxt;
+    RUNTIME_sequence_t *sequence = NULL;
+    RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER;
+    int status;
+
+    chamctxt = chameleon_context_self();
+    if (chamctxt == NULL) {
+        chameleon_fatal_error("CHAMELEON_zheredinit_Tile", "CHAMELEON not initialized");
+        return CHAMELEON_ERR_NOT_INITIALIZED;
+    }
+    chameleon_sequence_create( chamctxt, &sequence );
+
+    CHAMELEON_zhered_Tile_Async( uplo, precision, A, sequence, &request );
+
+    CHAMELEON_Desc_Flush( A, sequence );
+
+    chameleon_sequence_wait( chamctxt, sequence );
+    status = sequence->status;
+    chameleon_sequence_destroy( chamctxt, sequence );
+    return status;
+}
+
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t_Tile_Async
+ *
+ * @brief Computes the Cholesky factorization of a symmetric positive definite
+ * or Hermitian positive definite matrix with mixed precision.
+ *
+ * This is the non-blocking equivalent of CHAMELEON_zhered_Tile().  It
+ * operates on matrices stored by tiles with tiles of potentially different
+ * precisions.  All matrices are passed through descriptors.  All dimensions are
+ * taken from the descriptors. It may return before the computation is
+ * finished. This function allows for pipelining operations at runtime.
+ *
+ *******************************************************************************
+ *
+ * @param[in] sequence
+ *          Identifies the sequence of function calls that this call belongs to
+ *          (for completion checks and exception handling purposes).
+ *
+ * @param[out] request
+ *          Identifies this function call (for exception handling purposes).
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zhered
+ * @sa CHAMELEON_zhered_Tile
+ * @sa CHAMELEON_cpotrfmp_Tile_Async
+ * @sa CHAMELEON_dpotrfmp_Tile_Async
+ * @sa CHAMELEON_spotrfmp_Tile_Async
+ * @sa CHAMELEON_zpotrs_Tile_Async
+ *
+ */
+int CHAMELEON_zhered_Tile_Async( cham_uplo_t uplo, double precision, CHAM_desc_t *A,
+                                 RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
+{
+    CHAM_context_t *chamctxt;
+
+    chamctxt = chameleon_context_self();
+    if (chamctxt == NULL) {
+        chameleon_fatal_error("CHAMELEON_zhered_Tile_Async", "CHAMELEON not initialized");
+        return CHAMELEON_ERR_NOT_INITIALIZED;
+    }
+    if (sequence == NULL) {
+        chameleon_fatal_error("CHAMELEON_zhered_Tile_Async", "NULL sequence");
+        return CHAMELEON_ERR_UNALLOCATED;
+    }
+    if (request == NULL) {
+        chameleon_fatal_error("CHAMELEON_zhered_Tile_Async", "NULL request");
+        return CHAMELEON_ERR_UNALLOCATED;
+    }
+    /* Check sequence status */
+    if (sequence->status == CHAMELEON_SUCCESS) {
+        request->status = CHAMELEON_SUCCESS;
+    }
+    else {
+        return chameleon_request_fail(sequence, request, CHAMELEON_ERR_SEQUENCE_FLUSHED);
+    }
+
+    /* Check descriptors for correctness */
+    if (chameleon_desc_check(A) != CHAMELEON_SUCCESS) {
+        chameleon_error("CHAMELEON_zhered_Tile_Async", "invalid descriptor");
+        return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE);
+    }
+    /* Check input arguments */
+    if (A->nb != A->mb) {
+        chameleon_error("CHAMELEON_zhered_Tile_Async", "only square tiles supported");
+        return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE);
+    }
+
+    /*
+     * Quick return
+     */
+    if ( chameleon_max( A->m, A->n ) == 0 ) {
+        return CHAMELEON_SUCCESS;
+    }
+
+    chameleon_pzhered( ChamConjTrans, uplo, precision, A, sequence, request );
+
+    return CHAMELEON_SUCCESS;
+}
diff --git a/control/compute_z.h b/control/compute_z.h
index 645018f83..088e03140 100644
--- a/control/compute_z.h
+++ b/control/compute_z.h
@@ -22,7 +22,8 @@
  * @author Alycia Lisito
  * @author Matthieu Kuhn
  * @author Lionel Eyraud-Dubois
- * @date 2023-09-08
+ * @author Ana Hourcau
+ * @date 2024-07-17
  * @precisions normal z -> c d s
  *
  */
@@ -81,6 +82,8 @@ int chameleon_zshift(CHAM_context_t *chamctxt, int m, int n, CHAMELEON_Complex64
 #if defined(PRECISION_z) || defined(PRECISION_d)
 void chameleon_pzgered( cham_uplo_t uplo, double prec, CHAM_desc_t *A,
                         RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
+void chameleon_pzhered( cham_trans_t trans, cham_uplo_t uplo, double prec, CHAM_desc_t *A,
+                        RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
 void chameleon_pzgerst( cham_uplo_t uplo, CHAM_desc_t *A,
                         RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
 #endif
diff --git a/include/chameleon/chameleon_z.h b/include/chameleon/chameleon_z.h
index 75e214843..3f33260f4 100644
--- a/include/chameleon/chameleon_z.h
+++ b/include/chameleon/chameleon_z.h
@@ -170,6 +170,7 @@ int CHAMELEON_zpoinv_Tile(cham_uplo_t uplo, CHAM_desc_t *A);
 int CHAMELEON_zposv_Tile(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B);
 int CHAMELEON_zpotrf_Tile(cham_uplo_t uplo, CHAM_desc_t *A);
 int CHAMELEON_zgered_Tile( cham_uplo_t uplo, double prec, CHAM_desc_t *A );
+int CHAMELEON_zhered_Tile( cham_uplo_t uplo, double prec, CHAM_desc_t *A );
 int CHAMELEON_zgerst_Tile( cham_uplo_t uplo, CHAM_desc_t *A );
 int CHAMELEON_zsytrf_Tile(cham_uplo_t uplo, CHAM_desc_t *A);
 int CHAMELEON_zpotri_Tile(cham_uplo_t uplo, CHAM_desc_t *A);
@@ -249,6 +250,7 @@ int CHAMELEON_zpoinv_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequen
 int CHAMELEON_zposv_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_zpotrf_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_zgered_Tile_Async(cham_uplo_t uplo, double prec, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
+int CHAMELEON_zhered_Tile_Async(cham_uplo_t uplo, double prec, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_zgerst_Tile_Async( cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
 int CHAMELEON_zsytrf_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_zpotri_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
-- 
GitLab