From 6696f4ca85968edaba82a1fba576cd3ebe02d88f Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Mon, 17 Mar 2025 17:00:02 +0100
Subject: [PATCH] laswp: Add pzlaswp

---
 compute/CMakeLists.txt |   4 +-
 compute/pzlaswp.c      | 146 +++++++++++++++++++++++++++++++++++++++++
 control/compute_z.h    |   4 +-
 3 files changed, 150 insertions(+), 4 deletions(-)
 create mode 100644 compute/pzlaswp.c

diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt
index 5626c253c..a5d95ed43 100644
--- a/compute/CMakeLists.txt
+++ b/compute/CMakeLists.txt
@@ -28,7 +28,7 @@
 #  @author Loris Lucido
 #  @author Matthieu Kuhn
 #  @author Ana Hourcau
-#  @date 2024-09-18
+#  @date 2025-03-24
 #
 ###
 
@@ -219,7 +219,7 @@ set(ZSRC
     #pzhetrd_hb2ht.c
     pzhetrd_he2hb.c
     #pzlarft_blgtrd.c
-    #pzlaswp.c
+    pzlaswp.c
     #pzlaswpc.c
     #pztrsmrv.c
     #pzunmqr_blgtrd.c
diff --git a/compute/pzlaswp.c b/compute/pzlaswp.c
new file mode 100644
index 000000000..0d4aa8694
--- /dev/null
+++ b/compute/pzlaswp.c
@@ -0,0 +1,146 @@
+/**
+ *
+ * @file pzlaswp.c
+ *
+ * @copyright 2025-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zlaswp parallel algorithm
+ *
+ * @version 1.3.0
+ * @comment This file has been automatically generated
+ *          from Plasma 2.5.0 for CHAMELEON 0.9.2
+ * @author Alycia Lisito
+ * @author Matteo Marcos
+ * @date 2025-03-24
+ * @precisions normal z -> s d c
+ *
+ */
+#include "control/common.h"
+
+#define A(m,n)   A,         m, n /* expands to three call arguments: A, m, n */
+#define Wu(m,n)  &(ws->Wu), m, n /* same triple for ws->Wu; needs a variable named ws in scope */
+
+/**
+ *  Submit the tasks permuting the rows of tile column n at step k (tile rows k to A->mt-1)
+ */
+static inline void
+chameleon_pzlaswp_panel_permute( struct chameleon_pzgetrf_s *ws,
+                                 cham_dir_t                  dir,
+                                 CHAM_desc_t                *A,
+                                 CHAM_ipiv_t                *ipiv,
+                                 int                         k,
+                                 int                         n,
+                                 RUNTIME_option_t           *options )
+{
+    int m;
+    int tempkm, tempnn;
+    int withlacpy;
+
+    tempkm = A->get_blkdim( A, k, DIM_m, A->m ); /* DIM_m block dimension at index k */
+    tempnn = A->get_blkdim( A, n, DIM_n, A->n ); /* DIM_n block dimension at index n */
+
+    /* Copy A(k, n) into Wu, with options->withlacpy forced to 1 for this task */
+    withlacpy = options->withlacpy;
+    options->withlacpy = 1;
+    INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn,
+                       A(k, n), Wu(A->myrank, n) );
+    options->withlacpy = withlacpy; /* restore the value saved above */
+
+    INSERT_TASK_zlaswp_get( options, dir, k*A->mb, tempkm,
+                           ipiv, k, A(k, n), Wu(A->myrank, n) );
+
+    for ( m = k + 1; m < A->mt; m++ ) {
+        /* Extract selected rows of A(m, n) into the Wu workspace tile */
+        INSERT_TASK_zlaswp_get( options, dir, m*A->mb, tempkm,
+                               ipiv, k, A(m, n), Wu(A->myrank, n) );
+        /* Copy rows from A(k, n) into their final position in A(m, n) */
+        INSERT_TASK_zlaswp_set( options, dir, m*A->mb, tempkm,
+                               ipiv, k, A(k, n), A(m, n) );
+    }
+
+    INSERT_TASK_zperm_allreduce( options, dir, A, Wu(A->myrank, n), ipiv, k, k, n, ws );
+}
+
+static inline void
+chameleon_pzlaswp_panel( struct chameleon_pzgetrf_s *ws,
+                         cham_dir_t                  dir,
+                         CHAM_desc_t                *A,
+                         CHAM_ipiv_t                *ipiv,
+                         int                         k,
+                         int                         n,
+                         RUNTIME_option_t           *options,
+                         RUNTIME_sequence_t         *sequence )
+{
+    int tempkm, tempnn;
+    /* Flow: (MPI builds only) send tasks, then the permutation tasks, then the Wu -> A(k, n) copy */
+#if defined(CHAMELEON_USE_MPI)
+    chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws );
+    if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { /* local rank is the rank of tile (k, k) */
+        INSERT_TASK_zperm_allreduce_send_perm( options, dir, ipiv, k, A->myrank, ws->np_involved, ws->proc_involved );
+        INSERT_TASK_zperm_allreduce_send_invp( options, dir, ipiv, k, A, k, n );
+    }
+    if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { /* local rank is the rank of tile (k, n) */
+        INSERT_TASK_zperm_allreduce_send_A( options, A, k, n, A->myrank, ws->np_involved, ws->proc_involved );
+    }
+
+    if ( !ws->involved ) { /* presumably set by the 2dbc call above - TODO confirm */
+        return;
+    }
+#endif /* defined(CHAMELEON_USE_MPI) */
+
+    chameleon_pzlaswp_panel_permute( ws, dir, A, ipiv, k, n, options );
+
+    if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) {
+        /* Copy Wu(A->myrank, n) back into A(k, n), then flush that tile */
+        tempkm = A->get_blkdim( A, k, DIM_m, A->m );
+        tempnn = A->get_blkdim( A, n, DIM_n, A->n );
+        INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn,
+                            Wu(A->myrank, n), A(k, n) );
+        RUNTIME_data_flush( sequence, A(k, n) );
+    }
+}
+
+/**
+ *  Submit the row-swap tasks of every tile column of A, one step k at a time.
+ *
+ *  k runs from 0 to A->mt-1 when dir is ChamDirForward, and from A->mt-1 down
+ *  to 0 for any other value of dir. Nothing is submitted when the sequence
+ *  status is not CHAMELEON_SUCCESS.
+ */
+void
+chameleon_pzlaswp( struct chameleon_pzgetrf_s *ws,
+                   cham_dir_t                  dir,
+                   CHAM_desc_t                *A,
+                   CHAM_ipiv_t                *IPIV,
+                   RUNTIME_sequence_t         *sequence,
+                   RUNTIME_request_t          *request )
+{
+    CHAM_context_t   *chamctxt = chameleon_context_self();
+    RUNTIME_option_t  options;
+    int               step, k, n;
+
+    if ( sequence->status != CHAMELEON_SUCCESS ) {
+        return;
+    }
+    RUNTIME_options_init( &options, chamctxt, sequence, request );
+
+    for ( step = 0; step < A->mt; step++ ) {
+        /* Step index: ascending in forward direction, descending otherwise */
+        k = ( dir == ChamDirForward ) ? step : ( A->mt - 1 - step );
+
+        for ( n = 0; n < A->nt; n++ ) {
+            /* The priority value decreases with the tile column index */
+            options.priority = A->nt - n;
+            chameleon_pzlaswp_panel( ws, dir, A, IPIV, k, n, &options, sequence );
+        }
+
+        /* All tile columns of step k have been submitted */
+        RUNTIME_perm_flushk( sequence, IPIV, k );
+    }
+
+    RUNTIME_options_finalize( &options, chamctxt );
+}
+
diff --git a/control/compute_z.h b/control/compute_z.h
index 812af3dce..72f4504a4 100644
--- a/control/compute_z.h
+++ b/control/compute_z.h
@@ -24,7 +24,7 @@
  * @author Lionel Eyraud-Dubois
  * @author Ana Hourcau
  * @author Pierre Esterie
- * @date 2024-12-09
+ * @date 2025-03-24
  * @precisions normal z -> c d s
  *
  */
@@ -172,7 +172,7 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra
 void chameleon_pzlascal(cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 void chameleon_pzlaset( cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 void chameleon_pzlaset2(cham_uplo_t uplo, CHAMELEON_Complex64_t alpha,                          CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
-void chameleon_pzlaswp(CHAM_desc_t *B, int *IPIV, int inc, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
+void chameleon_pzlaswp( struct chameleon_pzgetrf_s *ws, cham_dir_t dir, CHAM_desc_t *A, CHAM_ipiv_t *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
 void chameleon_pzlaswpc(CHAM_desc_t *B, int *IPIV, int inc, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 void chameleon_pzlatms( cham_dist_t idist, unsigned long long int seed, cham_sym_t sym, double *D, int mode, double cond, double dmax, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
 void chameleon_pzlauum(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
-- 
GitLab