From 141d0f6e4aa5bdad67e5df24c3e351ab45d08495 Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Tue, 5 Sep 2023 10:50:50 +0200
Subject: [PATCH] zgetrf: Add blocked version algorithm of the getrf

---
 compute/pzgetrf.c   | 70 +++++++++++++++++++++++++++++++++++++++++++--
 compute/zgetrf.c    | 29 ++++++++++++++++---
 control/compute_z.h |  3 +-
 3 files changed, 95 insertions(+), 7 deletions(-)

diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c
index 94dee2b5b..ccd05fbcc 100644
--- a/compute/pzgetrf.c
+++ b/compute/pzgetrf.c
@@ -16,7 +16,7 @@
  * @author Mathieu Faverge
  * @author Emmanuel Agullo
  * @author Matthieu Kuhn
- * @date 2023-08-31
+ * @date 2023-09-08
  * @precisions normal z -> s d c
  *
  */
@@ -24,6 +24,7 @@
 
 #define A(m,n)  A,        m, n
 #define U(m,n)  &(ws->U), m, n
+#define Up(m,n)  &(ws->Up), m, n
 
 /*
  * All the functions below are panel factorization variant.
@@ -158,6 +159,71 @@ chameleon_pzgetrf_panel_facto_percol( struct chameleon_pzgetrf_s *ws,
     RUNTIME_ipiv_flushk( options->sequence, ipiv, k );
 }
 
+static inline void
+chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws,
+                                       CHAM_desc_t                *A,
+                                       CHAM_ipiv_t                *ipiv,
+                                       int                         k,
+                                       RUNTIME_option_t           *options )
+{
+    int m, h, b, nbblock;
+    int tempkm, tempkn, minmn;
+
+    tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
+    tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
+    minmn  = chameleon_min( tempkm, tempkn );
+
+    /* Update the number of column */
+    ipiv->n = minmn;
+    nbblock = chameleon_ceil( minmn, ws->ib );
+
+    /*
+     * Algorithm per column with pivoting
+     */
+    for (b=0; b<nbblock; b++){
+        int hmax = b == nbblock-1 ? minmn + 1 - b * ws->ib : ws->ib;
+
+        for (h=0; h<hmax; h++){
+            int j =  h + b * ws->ib;
+
+            INSERT_TASK_zgetrf_blocked_diag(
+                options,
+                j, k * A->mb, ws->ib,
+                A(k, k), Up(k, k),
+                ipiv );
+
+            for (m = k+1; m < A->mt; m++) {
+
+                INSERT_TASK_zgetrf_blocked_offdiag(
+                    options,
+                    j, m * A->mb, ws->ib,
+                    A(m, k), Up(k, k),
+                    ipiv );
+            }
+
+
+            if ( (b < (nbblock-1)) && (h == hmax-1) ) {
+                INSERT_TASK_zgetrf_blocked_trsm(
+                    options,
+                    ws->ib, tempkn, b * ws->ib + hmax, ws->ib,
+                    Up(k, k),
+                    ipiv );
+            }
+
+            assert( j<= minmn );
+            if ( j < minmn ) {
+                /* Reduce globally (between MPI processes) */
+                RUNTIME_ipiv_reducek( options, ipiv, k, j );
+            }
+        }
+    }
+    RUNTIME_data_flush( options->sequence, Up(k, k) );
+
+    /* Flush temporary data used for the pivoting */
+    INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k );
+    RUNTIME_ipiv_flushk( options->sequence, ipiv, k );
+}
+
 static inline void
 chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws,
                                CHAM_desc_t                *A,
@@ -176,7 +242,7 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws,
         break;
 
     case ChamGetrfPPiv:
-        chameleon_pzgetrf_panel_facto_percol( ws, A, ipiv, k, options );
+        chameleon_pzgetrf_panel_facto_blocked( ws, A, ipiv, k, options );
         break;
 
     case ChamGetrfNoPiv:
diff --git a/compute/zgetrf.c b/compute/zgetrf.c
index 38ef81719..845b127b1 100644
--- a/compute/zgetrf.c
+++ b/compute/zgetrf.c
@@ -19,7 +19,7 @@
  * @author Florent Pruvost
  * @author Matthieu Kuhn
  * @author Lionel Eyraud-Dubois
- * @date 2023-08-31
+ * @date 2023-09-08
  *
  * @precisions normal z -> s d c
  *
@@ -88,6 +88,7 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A )
         chameleon_cleanenv( algostr );
     }
 
+    /* Allocation of U for permutation of the panels */
     if ( ws->alg == ChamGetrfNoPivPerColumn ) {
         chameleon_desc_init( &(ws->U), CHAMELEON_MAT_ALLOC_TILE,
                              ChamComplexDouble, 1, A->nb, A->nb,
@@ -112,6 +113,17 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A )
         ws->ib = 1;
     }
 
+    /* Allocation of Up for the permutation of the diagonal panel per block */
+    if ( ws->alg == ChamGetrfPPiv ) {
+        /* TODO: Should be restricted to diagonal tiles */
+        /* Possibly to a single handle with a permutation of the ownership */
+        chameleon_desc_init( &(ws->Up), CHAMELEON_MAT_ALLOC_TILE,
+                             ChamComplexDouble, ws->ib, A->nb, ws->ib * A->nb,
+                             A->mt * ws->ib, A->nt * A->nb, 0, 0,
+                             A->mt * ws->ib, A->nt * A->nb, A->p, A->q,
+                             NULL, NULL, A->get_rankof_init, A->get_rankof_init_arg );
+    }
+
     return ws;
 }
 
@@ -145,6 +157,9 @@ CHAMELEON_zgetrf_WS_Free( void *user_ws )
     {
         chameleon_desc_destroy( &(ws->U) );
     }
+    if ( ws->alg == ChamGetrfPPiv ) {
+        chameleon_desc_destroy( &(ws->Up) );
+    }
     free( ws );
 }
 
@@ -249,7 +264,9 @@ CHAMELEON_zgetrf( int M, int N, CHAMELEON_Complex64_t *A, int LDA, int *IPIV )
     /* Allocate workspace for partial pivoting */
     ws = CHAMELEON_zgetrf_WS_Alloc( &descAt );
 
-    if ( ws->alg == ChamGetrfPPivPerColumn ) {
+    if ( ( ws->alg == ChamGetrfPPivPerColumn ) ||
+         ( ws->alg == ChamGetrfPPiv ) )
+    {
         chameleon_ipiv_init( &descIPIV, &descAt, IPIV );
     }
 
@@ -260,13 +277,17 @@ CHAMELEON_zgetrf( int M, int N, CHAMELEON_Complex64_t *A, int LDA, int *IPIV )
     chameleon_ztile2lap( chamctxt, &descAl, &descAt,
                          ChamDescInout, ChamUpperLower, sequence, &request );
 
-    if ( ws->alg == ChamGetrfPPivPerColumn ) {
+    if ( ( ws->alg == ChamGetrfPPivPerColumn ) ||
+         ( ws->alg == ChamGetrfPPiv ) )
+    {
         RUNTIME_ipiv_gather( &descIPIV, IPIV, 0 );
     }
     chameleon_sequence_wait( chamctxt, sequence );
 
     /* Cleanup the temporary data */
-    if ( ws->alg == ChamGetrfPPivPerColumn ) {
+    if ( ( ws->alg == ChamGetrfPPivPerColumn ) ||
+         ( ws->alg == ChamGetrfPPiv ) )
+    {
         chameleon_ipiv_destroy( &descIPIV );
     }
     CHAMELEON_zgetrf_WS_Free( ws );
diff --git a/control/compute_z.h b/control/compute_z.h
index 9032c20f2..22b9f06f0 100644
--- a/control/compute_z.h
+++ b/control/compute_z.h
@@ -22,7 +22,7 @@
  * @author Alycia Lisito
  * @author Matthieu Kuhn
  * @author Lionel Eyraud-Dubois
- * @date 2023-08-22
+ * @date 2023-09-08
  * @precisions normal z -> c d s
  *
  */
@@ -45,6 +45,7 @@ struct chameleon_pzgetrf_s {
     cham_getrf_t alg;
     int          ib; /* Internal blocking parameter */
     CHAM_desc_t  U;
+    CHAM_desc_t  Up;
 };
 
 /**
-- 
GitLab