From fb6ccd8c0fde2b3abf284d518e93fb7511303edb Mon Sep 17 00:00:00 2001
From: Alycia Lisito <alycia.lisito@inria.fr>
Date: Wed, 14 Feb 2024 16:17:40 +0100
Subject: [PATCH] zgetrf batched: add batched percol algorithm

---
 compute/pzgetrf.c        | 58 +++++++++++++++++++++++++++++++++++++++-
 testing/CTestLists.cmake |  6 +++++
 2 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c
index 99152bc04..000209624 100644
--- a/compute/pzgetrf.c
+++ b/compute/pzgetrf.c
@@ -157,6 +157,57 @@ chameleon_pzgetrf_panel_facto_percol( struct chameleon_pzgetrf_s *ws,
     RUNTIME_ipiv_flushk( options->sequence, ipiv, k );
 }
 
+/**
+ * Factorization of panel k - dynamic scheduling - per-column algorithm, batched
+ * version: each column step goes through the *_offdiag_batched insertion for the
+ * tiles below the diagonal, followed by one *_offdiag_batched_flush call.
+ *
+ * ipiv->n is overwritten with the number of columns of the panel. ws is passed
+ * as-is to the batched insertion (presumably it carries the batch size).
+ */
+static inline void
+chameleon_pzgetrf_panel_facto_percol_batched( struct chameleon_pzgetrf_s *ws,
+                                              CHAM_desc_t                *A,
+                                              CHAM_ipiv_t                *ipiv,
+                                              int                         k,
+                                              RUNTIME_option_t           *options )
+{
+    int m, h;
+    int tempkm, tempkn, tempmm, minmn;
+    /* Slot shared by the batched insert/flush calls: starts NULL, only needed during this call */
+    void  *batch_slot = NULL;
+    void **clargs     = &batch_slot;
+
+    tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
+    tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
+    minmn  = chameleon_min( tempkm, tempkn );
+
+    /* Update the number of columns */
+    ipiv->n = minmn;
+
+    /* Algorithm per column with pivoting (no recursion): iterate on the panel columns. */
+    /* Since index h scales column h-1, we need to iterate up to minmn (included) */
+    for ( h = 0; h <= minmn; h++ ) {
+        INSERT_TASK_zgetrf_percol_diag( options, tempkm, tempkn, h, k * A->mb, A(k, k), ipiv );
+
+        for ( m = k+1; m < A->mt; m++ ) {
+            tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb;
+            INSERT_TASK_zgetrf_panel_offdiag_batched( options, tempmm, tempkn, h, m * A->mb,
+                                                      (void *)ws, A(m, k), clargs, ipiv );
+        }
+        INSERT_TASK_zgetrf_panel_offdiag_batched_flush( options, A, k, clargs, ipiv );
+
+        if ( h < minmn ) {
+            /* Reduce globally (between MPI processes) */
+            INSERT_TASK_ipiv_reducek( options, ipiv, k, h );
+        }
+    }
+
+    /* Flush temporary data used for the pivoting */
+    INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k );
+    RUNTIME_ipiv_flushk( options->sequence, ipiv, k );
+}
+
 static inline void
 chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws,
                                        CHAM_desc_t                *A,
@@ -235,7 +286,12 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws,
         break;
 
     case ChamGetrfPPivPerColumn:
-        chameleon_pzgetrf_panel_facto_percol( ws, A, ipiv, k, options );
+        if ( ws->batch_size > 1 ) {
+            chameleon_pzgetrf_panel_facto_percol_batched( ws, A, ipiv, k, options );
+        }
+        else {
+            chameleon_pzgetrf_panel_facto_percol( ws, A, ipiv, k, options );
+        }
         break;
 
     case ChamGetrfPPiv:
diff --git a/testing/CTestLists.cmake b/testing/CTestLists.cmake
index b429b5c02..98bdb1939 100644
--- a/testing/CTestLists.cmake
+++ b/testing/CTestLists.cmake
@@ -94,6 +94,12 @@ if (NOT CHAMELEON_SIMULATION)
             set_tests_properties( test_${cat}_${prec}getrf_ppivpercol
                                 PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=1" )
 
+            if ( "${cat}" STREQUAL "shm" ) # batched per-column test is only registered for the shm category
+                add_test( test_${cat}_${prec}getrf_ppivpercol_batch ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf_nopiv.in )
+                set_tests_properties( test_${cat}_${prec}getrf_ppivpercol_batch
+                                    PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=6" )
+            endif()
+
             add_test( test_${cat}_${prec}getrf_ppiv ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf.in )
             set_tests_properties( test_${cat}_${prec}getrf_ppiv
                                 PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=1" )
-- 
GitLab