From 0d58113d1dc30324b806f489c98be586aa5882c5 Mon Sep 17 00:00:00 2001
From: Matthieu KUHN <bkuhnm@l0.spartan.bench.local>
Date: Thu, 31 Mar 2022 14:46:46 +0200
Subject: [PATCH] getrf: Add a version with no pivoting per column

---
 compute/pzgetrf.c   | 88 ++++++++++++++++++++++++++++++++-------------
 compute/zgetrf.c    | 31 ++++++++++------
 control/compute_z.h | 12 +++++--
 3 files changed, 94 insertions(+), 37 deletions(-)

diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c
index 7e6237c7e..967e83f2f 100644
--- a/compute/pzgetrf.c
+++ b/compute/pzgetrf.c
@@ -4,7 +4,7 @@
  *
  * @copyright 2009-2014 The University of Tennessee and The University of
  *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2022 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
  *                      Univ. Bordeaux. All rights reserved.
  *
  ***
@@ -16,13 +16,14 @@
  * @author Mathieu Faverge
  * @author Emmanuel Agullo
  * @author Matthieu Kuhn
- * @date 2022-02-22
+ * @date 2023-02-21
  * @precisions normal z -> s d c
  *
  */
 #include "control/common.h"
 
 #define A(m,n) A,  m,  n
+#define U(m,n) &(ws->U),  m,  n /* expands with the 'ws' variable in scope at the call site */
 
 /*
  * All the functions below are panel factorization variant.
@@ -44,10 +45,10 @@
  *      The runtime options data structure to pass through all insert_task calls.
  */
 static inline void
-chameleon_pzgetrf_panel_facto_nopiv( void             *ws,
-                                     CHAM_desc_t      *A,
-                                     int               k,
-                                     RUNTIME_option_t *options )
+chameleon_pzgetrf_panel_facto_nopiv( struct chameleon_pzgetrf_s *ws,
+                                     CHAM_desc_t                *A,
+                                     int                         k,
+                                     RUNTIME_option_t           *options )
 {
     const CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t) 1.0;
     int m, tempkm, tempkn, tempmm;
@@ -60,7 +61,7 @@ chameleon_pzgetrf_panel_facto_nopiv( void             *ws,
      */
     INSERT_TASK_zgetrf_nopiv(
         options,
-        tempkm, tempkn, 32, A->mb,
+        tempkm, tempkn, ws->ib, A->mb,
          A(k, k), 0);
 
     for (m = k+1; m < A->mt; m++) {
@@ -73,24 +74,61 @@ chameleon_pzgetrf_panel_facto_nopiv( void             *ws,
                   A(m, k) );
     }
 }
+
+static inline void
+chameleon_pzgetrf_panel_facto_nopiv_percol( struct chameleon_pzgetrf_s *ws,
+                                            CHAM_desc_t                *A,
+                                            int                         k,
+                                            RUNTIME_option_t           *options )
+{
+    int m, h;
+    int tempkm, tempkn, tempmm, minmn;
+
+    tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; /* rows of tile row k (the last one may be partial) */
+    tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; /* columns of tile column k (the last one may be partial) */
+    minmn  = chameleon_min( tempkm, tempkn );     /* number of per-column steps on the diagonal tile */
+
+    /*
+     * Per column h, without pivoting: one diag task on A(k,k), then one trsm task per tile A(m,k), m > k
+     */
+    for(h=0; h<minmn; h++){
+        INSERT_TASK_zgetrf_panel_nopiv_percol_diag(
+            options, tempkm, tempkn, h,
+            A( k, k ), U( k, k ), A->mb * k );
+
+        for (m = k+1; m < A->mt; m++) {
+            tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb; /* rows of tile row m */
+            INSERT_TASK_zgetrf_panel_nopiv_percol_trsm(
+                options, tempmm, tempkn, h,
+                A( m, k ), U( k, k ) );
+        }
+    }
+
+    RUNTIME_data_flush( options->sequence, U(k, k) ); /* issued once every task of panel k is inserted */
+}
+
 static inline void
-chameleon_pzgetrf_panel_facto( void             *ws,
-                               CHAM_desc_t      *A,
-                               int               k,
-                               RUNTIME_option_t *options )
+chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws,
+                               CHAM_desc_t                *A,
+                               int                         k,
+                               RUNTIME_option_t           *options )
 {
+#if defined(GETRF_NOPIV_PER_COLUMN) /* compile-time selection of the panel variant */
+    chameleon_pzgetrf_panel_facto_nopiv_percol( ws, A, k, options );
+#else /* default: tile-based panel without pivoting */
     chameleon_pzgetrf_panel_facto_nopiv( ws, A, k, options );
+#endif /* GETRF_NOPIV_PER_COLUMN */
 }
 
 /**
  *  Permutation of the panel n at step k
  */
 static inline void
-chameleon_pzgetrf_panel_permute( void             *ws,
-                                 CHAM_desc_t      *A,
-                                 int               k,
-                                 int               n,
-                                 RUNTIME_option_t *options )
+chameleon_pzgetrf_panel_permute( struct chameleon_pzgetrf_s *ws,
+                                 CHAM_desc_t                *A,
+                                 int                         k,
+                                 int                         n,
+                                 RUNTIME_option_t           *options )
 {
     (void)ws;
     (void)A;
@@ -100,11 +138,11 @@ chameleon_pzgetrf_panel_permute( void             *ws,
 }
 
 static inline void
-chameleon_pzgetrf_panel_update( void             *ws,
-                                CHAM_desc_t      *A,
-                                int               k,
-                                int               n,
-                                RUNTIME_option_t *options )
+chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws,
+                                CHAM_desc_t                *A,
+                                int                         k,
+                                int                         n,
+                                RUNTIME_option_t           *options )
 {
     const CHAMELEON_Complex64_t zone  = (CHAMELEON_Complex64_t) 1.0;
     const CHAMELEON_Complex64_t mzone = (CHAMELEON_Complex64_t)-1.0;
@@ -141,10 +179,10 @@ chameleon_pzgetrf_panel_update( void             *ws,
 /**
  *  Parallel tile LU factorization with no pivoting - dynamic scheduling
  */
-void chameleon_pzgetrf( void               *ws,
-                        CHAM_desc_t        *A,
-                        RUNTIME_sequence_t *sequence,
-                        RUNTIME_request_t  *request )
+void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws,
+                        CHAM_desc_t              *A,
+                        RUNTIME_sequence_t       *sequence,
+                        RUNTIME_request_t        *request )
 {
     CHAM_context_t  *chamctxt;
     RUNTIME_option_t options;
diff --git a/compute/zgetrf.c b/compute/zgetrf.c
index 52c33a740..c99ae3f80 100644
--- a/compute/zgetrf.c
+++ b/compute/zgetrf.c
@@ -4,7 +4,7 @@
  *
  * @copyright 2009-2014 The University of Tennessee and The University of
  *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2022 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
  *                      Univ. Bordeaux. All rights reserved.
  *
  ***
@@ -18,7 +18,7 @@
  * @author Cedric Castagnede
  * @author Florent Pruvost
  * @author Matthieu Kuhn
- * @date 2022-09-19
+ * @date 2023-02-21
  *
  * @precisions normal z -> s d c
  *
@@ -52,14 +52,30 @@
 void *
 CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A )
 {
-    CHAM_context_t *chamctxt;
+    CHAM_context_t             *chamctxt;
+    struct chameleon_pzgetrf_s *options;
 
     chamctxt = chameleon_context_self();
     if ( chamctxt == NULL ) {
         return NULL;
     }
 
-    return NULL;
+    /* calloc: every field starts zeroed; give up if the allocation fails */
+    options = calloc( 1, sizeof( *options ) );
+    if ( options == NULL ) {
+        return NULL;
+    }
+    options->ib = CHAMELEON_IB;
+
+#if defined(GETRF_NOPIV_PER_COLUMN)
+    chameleon_desc_init( &(options->U), CHAMELEON_MAT_ALLOC_TILE,
+                         ChamComplexDouble, 1, A->nb, A->nb,
+                         A->mt, A->nt * A->nb, 0, 0,
+                         A->mt, A->nt * A->nb, A->p, A->q,
+                         NULL, NULL, A->get_rankof_init );
+#endif
+
+    return options;
 }
 
 /**
@@ -84,14 +100,18 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A )
 void
 CHAMELEON_zgetrf_WS_Free( const CHAM_desc_t *A, void *user_ws )
 {
-    CHAM_context_t *chamctxt;
+    struct chameleon_pzgetrf_s *ws = (struct chameleon_pzgetrf_s *)user_ws;
 
-    chamctxt = chameleon_context_self();
-    if ( chamctxt == NULL ) {
-        return;
-    }
+    /* CHAMELEON_zgetrf_WS_Alloc() can return NULL: nothing to release then */
+    if ( ws == NULL ) {
+        return;
+    }
+
+#if defined(GETRF_NOPIV_PER_COLUMN)
+    chameleon_desc_destroy( &(ws->U) );
+#endif
 
-    (void)user_ws;
+    free( ws );
 }
 
 #if defined(NOT_AVAILABLE_YET)
diff --git a/control/compute_z.h b/control/compute_z.h
index b938b178f..bf9fdc8dc 100644
--- a/control/compute_z.h
+++ b/control/compute_z.h
@@ -21,7 +21,7 @@
  * @author Florent Pruvost
  * @author Alycia Lisito
  * @author Matthieu Kuhn
- * @date 2022-09-19
+ * @date 2023-02-21
  * @precisions normal z -> c d s
  *
  */
@@ -37,6 +37,14 @@ struct chameleon_pzgemm_s {
     CHAM_desc_t WB;
 };
 
+/**
+ * @brief Data structure to handle the GETRF workspaces with partial pivoting
+ */
+struct chameleon_pzgetrf_s {
+    int         ib; /* Internal blocking parameter */
+    CHAM_desc_t U;  /* Workspace passed to the per-column panel tasks; only initialized if GETRF_NOPIV_PER_COLUMN is defined */
+};
+
 /**
  * @brief Data structure to handle the Centering-Scaled workspaces
  */
@@ -78,7 +86,7 @@ void chameleon_pzgepdf_qdwh( cham_mtxtype_t trans, CHAM_desc_t *descU, CHAM_desc
 void chameleon_pzgepdf_qr( int genD, int doqr, int optid, const libhqr_tree_t *qrtreeT, const libhqr_tree_t *qrtreeB, CHAM_desc_t *A1, CHAM_desc_t *TS1, CHAM_desc_t *TT1, CHAM_desc_t *D1, CHAM_desc_t *Q1, CHAM_desc_t *A2, CHAM_desc_t *TS2, CHAM_desc_t *TT2, CHAM_desc_t *D2, CHAM_desc_t *Q2, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
 void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
-void chameleon_pzgetrf( void *ws, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
+void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
 void chameleon_pzgetrf_incpiv(CHAM_desc_t *A, CHAM_desc_t *L, CHAM_desc_t *D, int *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 void chameleon_pzgetrf_nopiv(CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 void chameleon_pzgetrf_reclap(CHAM_desc_t *A, int *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
-- 
GitLab