diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h
index f444409b8279007d43d810f6b603595c414f9819..90b4578d47eadeab248d9d47cb45a4a93b74a1b2 100644
--- a/include/chameleon/tasks_z.h
+++ b/include/chameleon/tasks_z.h
@@ -25,7 +25,7 @@
  * @author Romain Peressoni
  * @author Matthieu Kuhn
  * @author Ana Hourcau
- * @date 2024-11-12
+ * @date 2025-03-24
  * @precisions normal z -> c d s
  *
  */
@@ -188,12 +188,12 @@ void INSERT_TASK_zlaset( const RUNTIME_option_t *options,
 void INSERT_TASK_zlaset2( const RUNTIME_option_t *options,
                           cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha,
                           const CHAM_desc_t *tileA, int tileAm, int tileAn );
-void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
+void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, cham_dir_t dir,
                              int m0, int k,
                              const CHAM_ipiv_t *tIPIV, int tIPIVk,
                              const CHAM_desc_t *tileA, int tileAm, int tileAn,
                              const CHAM_desc_t *tileB, int tileBm, int tileBn );
-void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
+void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, cham_dir_t dir,
                              int m0, int k,
                              const CHAM_ipiv_t *tIPIV, int tIPIVk,
                              const CHAM_desc_t *tileA, int tileAm, int tileAn,
@@ -588,15 +588,20 @@ void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options,
  *
  * @ingroup CHAMELEON_Complex64_t
  *
- *  INSERT_TASK_zperm_allreduce - Perfoms an allreduce operation on the tile
- * U(Um, Un) according to the permutation ipiv. This task is used in the LU
- * factorization with partial pivoting.
+ *  @brief Performs an allreduce operation on the tile
+ *  U(Um, Un) according to the permutation ipiv. This task is used in the LU
+ *  factorization with partial pivoting.
  *
  *******************************************************************************
  *
  * @param[in] options
  *          The runtime options data structure to pass through all insert_task calls.
  *
+ * @param[in] dir
+ *          Specifies the order of the permutation.
+ *          = ChamDirForward:  Natural order
+ *          = ChamDirBackward: Reverse order
+ *
  * @param[in] A
  *          The descriptor of the matrix A.
  *
@@ -630,6 +635,7 @@ void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options,
  *******************************************************************************
  */
 void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
+                                  cham_dir_t              dir,
                                   const CHAM_desc_t      *A,
                                   CHAM_desc_t            *U,
                                   int                     Um,
@@ -645,9 +651,9 @@ void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
  *
  * @ingroup CHAMELEON_Complex64_t
  *
- *  INSERT_TASK_zperm_allreduce_send_A - Sends the tile A(Am, An) to the processus
- * involved in the permutation. This task is used in the LU factorization with
- * partial pivoting.
+ *  @brief Sends the tile A(Am, An) to the processes
+ *  involved in the permutation. This task is used in the LU factorization with
+ *  partial pivoting.
  *
  *******************************************************************************
  *
@@ -687,15 +693,20 @@ void INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options,
  *
  * @ingroup CHAMELEON_Complex64_t
  *
- *  INSERT_TASK_zperm_allreduce_send_perm - Sends the permutation ipivk to the
- * processus involved in the permutation. This task is used in the LU
- * factorization with partial pivoting.
+ *  @brief Sends the permutation ipivk to the
+ *  processes involved in the permutation. This task is used in the LU
+ *  factorization with partial pivoting.
  *
  *******************************************************************************
  *
  * @param[in] options
  *          The runtime options data structure to pass through all insert_task calls.
  *
+ * @param[in] dir
+ *          Specifies the order of the permutation.
+ *          = ChamDirForward:  Natural order
+ *          = ChamDirBackward: Reverse order
+ *
  * @param[in] ipiv
  *          The pivot structure that contains the informations for the LU
  *          factorization with partial pivoting.
@@ -715,6 +726,7 @@ void INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options,
  *******************************************************************************
  */
 void INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options,
+                                            cham_dir_t              dir,
                                             CHAM_ipiv_t            *ipiv,
                                             int                     ipivk,
                                             int                     myrank,
@@ -726,15 +738,20 @@ void INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options,
  *
  * @ingroup CHAMELEON_Complex64_t
  *
- *  INSERT_TASK_zperm_allreduce_send_invp - Sends the inverse permutation ipivk
- * to the processus involved in the permutation. This task is used in the LU
- * factorization with partial pivoting.
+ *  @brief Sends the inverse permutation ipivk
+ *  to the processes involved in the permutation. This task is used in the LU
+ *  factorization with partial pivoting.
  *
  *******************************************************************************
  *
  * @param[in] options
  *          The runtime options data structure to pass through all insert_task calls.
  *
+ * @param[in] dir
+ *          Specifies the order of the permutation.
+ *          = ChamDirForward:  Natural order
+ *          = ChamDirBackward: Reverse order
+ *
  * @param[in] ipiv
  *          The pivot structure that contains the informations for the LU
  *          factorization with partial pivoting.
@@ -754,6 +771,7 @@ void INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options,
  *******************************************************************************
  */
 void INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
+                                            cham_dir_t              dir,
                                             CHAM_ipiv_t            *ipiv,
                                             int                     ipivk,
                                             const CHAM_desc_t      *A,
@@ -761,3 +779,4 @@ void INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
                                             int                     n );
 
 #endif /* _chameleon_tasks_z_h_ */
+
diff --git a/runtime/openmp/codelets/codelet_zlaswp.c b/runtime/openmp/codelets/codelet_zlaswp.c
index bce58c771ef3052ce4d20d16232082cd9a746f66..93bf20aef11964fa548adb7739b000af575b04ba 100644
--- a/runtime/openmp/codelets/codelet_zlaswp.c
+++ b/runtime/openmp/codelets/codelet_zlaswp.c
@@ -11,7 +11,7 @@
  *
  * @version 1.3.0
  * @author Mathieu Faverge
- * @date 2024-02-18
+ * @date 2025-03-24
  * @precisions normal z -> c d s
  *
  */
@@ -20,7 +20,7 @@
 #include "coreblas/coreblas_ztile.h"
 
 void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
-                             int m0, int k,
+                             cham_dir_t dir, int m0, int k,
                              const CHAM_ipiv_t *ipiv, int ipivk,
                              const CHAM_desc_t *A, int Am, int An,
                              const CHAM_desc_t *U, int Um, int Un )
@@ -38,10 +38,11 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
     }
 
     (void)options;
+    (void)dir;
 }
 
 void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
-                             int m0, int k,
+                             cham_dir_t dir, int m0, int k,
                              const CHAM_ipiv_t *ipiv, int ipivk,
                              const CHAM_desc_t *A, int Am, int An,
                              const CHAM_desc_t *B, int Bm, int Bn )
@@ -59,4 +60,5 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
     }
 
     (void)options;
+    (void)dir;
 }
diff --git a/runtime/openmp/codelets/codelet_zperm_allreduce.c b/runtime/openmp/codelets/codelet_zperm_allreduce.c
index eac34fdfd1f8a0814c277f7acb8a9b85cb594ec7..8b20a60fd43332dac7373edcb2de40ee552d050a 100644
--- a/runtime/openmp/codelets/codelet_zperm_allreduce.c
+++ b/runtime/openmp/codelets/codelet_zperm_allreduce.c
@@ -11,7 +11,7 @@
  *
  * @version 1.3.0
  * @author Alycia Lisito
- * @date 2024-11-12
+ * @date 2025-03-24
  * @precisions normal z -> c d s
  *
  */
@@ -38,6 +38,7 @@ INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options,
 
 void
 INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options,
+                                       cham_dir_t              dir,
                                        CHAM_ipiv_t            *ipiv,
                                        int                     ipivk,
                                        int                     myrank,
@@ -45,6 +46,7 @@ INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options,
                                        int                    *proc_involved  )
 {
     (void)options;
+    (void)dir;
     (void)ipiv;
     (void)ipivk;
     (void)myrank;
@@ -54,6 +56,7 @@ INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options,
 
 void
 INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
+                                       cham_dir_t              dir,
                                        CHAM_ipiv_t            *ipiv,
                                        int                     ipivk,
                                        const CHAM_desc_t      *A,
@@ -61,6 +64,7 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
                                        int                     n )
 {
     (void)options;
+    (void)dir;
     (void)ipiv;
     (void)ipivk;
     (void)A;
@@ -70,6 +74,7 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
 
 void
 INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
+                             cham_dir_t              dir,
                              const CHAM_desc_t      *A,
                              CHAM_desc_t            *U,
                              int                     Um,
@@ -81,6 +86,7 @@ INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
                              void                   *ws )
 {
     (void)options;
+    (void)dir;
     (void)A;
     (void)U;
     (void)Um;
diff --git a/runtime/parsec/codelets/codelet_zlaswp.c b/runtime/parsec/codelets/codelet_zlaswp.c
index 12aaf7089ff41f4e4090e0fb6f18e518c9813fd3..65849c96d1aae96cc1000dd93e5efbebe481c7d9 100644
--- a/runtime/parsec/codelets/codelet_zlaswp.c
+++ b/runtime/parsec/codelets/codelet_zlaswp.c
@@ -11,7 +11,7 @@
  *
  * @version 1.3.0
  * @author Mathieu Faverge
- * @date 2024-02-18
+ * @date 2025-03-24
  * @precisions normal z -> c d s
  *
  */
@@ -33,7 +33,7 @@ CORE_zlaswp_get_parsec( parsec_execution_stream_t *context,
 }
 
 void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
-                             int m0, int k,
+                             cham_dir_t dir, int m0, int k,
                              const CHAM_ipiv_t *ipiv, int ipivk,
                              const CHAM_desc_t *A, int Am, int An,
                              const CHAM_desc_t *U, int Um, int Un )
@@ -54,6 +54,8 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
         sizeof(int),         &(tileU->ld), VALUE,
         PASSED_BY_REF, RUNTIME_perm_getaddr( ipiv, ipivk ),     chameleon_parsec_get_arena_index_perm( ipiv ) | INPUT,
         PARSEC_DTD_ARG_END );
+
+    (void)dir;
 }
 
 static inline int
@@ -70,7 +72,7 @@ CORE_zlaswp_set_parsec( parsec_execution_stream_t *context,
 }
 
 void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
-                             int m0, int k,
+                             cham_dir_t dir, int m0, int k,
                              const CHAM_ipiv_t *ipiv, int ipivk,
                              const CHAM_desc_t *A, int Am, int An,
                              const CHAM_desc_t *B, int Bm, int Bn )
@@ -91,4 +93,6 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
         sizeof(int),         &(tileB->ld), VALUE,
         PASSED_BY_REF, RUNTIME_invp_getaddr( ipiv, ipivk ),     chameleon_parsec_get_arena_index_invp( ipiv ) | INPUT,
         PARSEC_DTD_ARG_END );
+
+    (void)dir;
 }
diff --git a/runtime/parsec/codelets/codelet_zperm_allreduce.c b/runtime/parsec/codelets/codelet_zperm_allreduce.c
index 9ceb440c8a4e677630a68355daa7defda7f904fa..f68148e24b5e4c4e7d42d7248c8bf4a9948477c6 100644
--- a/runtime/parsec/codelets/codelet_zperm_allreduce.c
+++ b/runtime/parsec/codelets/codelet_zperm_allreduce.c
@@ -11,7 +11,7 @@
  *
  * @version 1.3.0
  * @author Alycia Lisito
- * @date 2024-11-12
+ * @date 2025-03-24
  * @precisions normal z -> c d s
  *
  */
@@ -38,6 +38,7 @@ INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options,
 
 void
 INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options,
+                                       cham_dir_t              dir,
                                        CHAM_ipiv_t            *ipiv,
                                        int                     ipivk,
                                        int                     myrank,
@@ -45,6 +46,7 @@ INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options,
                                        int                    *proc_involved  )
 {
     (void)options;
+    (void)dir;
     (void)ipiv;
     (void)ipivk;
     (void)myrank;
@@ -54,6 +56,7 @@ INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options,
 
 void
 INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
+                                       cham_dir_t              dir,
                                        CHAM_ipiv_t            *ipiv,
                                        int                     ipivk,
                                        const CHAM_desc_t      *A,
@@ -61,6 +64,7 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
                                        int                     n )
 {
     (void)options;
+    (void)dir;
     (void)ipiv;
     (void)ipivk;
     (void)A;
@@ -70,6 +74,7 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
 
 void
 INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
+                             cham_dir_t              dir,
                              const CHAM_desc_t      *A,
                              CHAM_desc_t            *U,
                              int                     Um,
@@ -81,6 +86,7 @@ INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
                              void                   *ws )
 {
     (void)options;
+    (void)dir;
     (void)A;
     (void)U;
     (void)Um;
diff --git a/runtime/quark/codelets/codelet_zlaswp.c b/runtime/quark/codelets/codelet_zlaswp.c
index 176dd16916eb51e1b698ad0d17dbd0d37c1a1d61..8f5a1b57fd52bd2e401273171584ebcca1478e50 100644
--- a/runtime/quark/codelets/codelet_zlaswp.c
+++ b/runtime/quark/codelets/codelet_zlaswp.c
@@ -11,7 +11,7 @@
  *
  * @version 1.3.0
  * @author Mathieu Faverge
- * @date 2024-02-18
+ * @date 2025-03-24
  * @precisions normal z -> c d s
  *
  */
@@ -30,7 +30,7 @@ static void CORE_zlaswp_get_quark( Quark *quark )
 }
 
 void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
-                             int m0, int k,
+                             cham_dir_t dir, int m0, int k,
                              const CHAM_ipiv_t *ipiv, int ipivk,
                              const CHAM_desc_t *A, int Am, int An,
                              const CHAM_desc_t *U, int Um, int Un )
@@ -46,6 +46,8 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
         sizeof(CHAM_tile_t*), RTBLKADDR(A, ChamComplexDouble, Am, An), INPUT,
         sizeof(CHAM_tile_t*), RTBLKADDR(U, ChamComplexDouble, Um, Un), INOUT,
         0 );
+
+    (void)dir;
 }
 
 static void CORE_zlaswp_set_quark( Quark *quark )
@@ -59,7 +61,7 @@ static void CORE_zlaswp_set_quark( Quark *quark )
 }
 
 void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
-                             int m0, int k,
+                             cham_dir_t dir, int m0, int k,
                              const CHAM_ipiv_t *ipiv, int ipivk,
                              const CHAM_desc_t *A, int Am, int An,
                              const CHAM_desc_t *B, int Bm, int Bn )
@@ -75,4 +77,6 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
         sizeof(CHAM_tile_t*), RTBLKADDR(A, ChamComplexDouble, Am, An), INPUT,
         sizeof(CHAM_tile_t*), RTBLKADDR(B, ChamComplexDouble, Bm, Bn), INOUT,
         0 );
+
+    (void)dir;
 }
diff --git a/runtime/quark/codelets/codelet_zperm_allreduce.c b/runtime/quark/codelets/codelet_zperm_allreduce.c
index f297d343b33455ba6340f0b81c45e8d01d29600f..1a2a7089c8addc5715d074a6c04bc5e8732aed1b 100644
--- a/runtime/quark/codelets/codelet_zperm_allreduce.c
+++ b/runtime/quark/codelets/codelet_zperm_allreduce.c
@@ -11,7 +11,7 @@
  *
  * @version 1.3.0
  * @author Alycia Lisito
- * @date 2024-11-12
+ * @date 2025-03-24
  * @precisions normal z -> c d s
  *
  */
@@ -38,6 +38,7 @@ INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options,
 
 void
 INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options,
+                                       cham_dir_t              dir,
                                        CHAM_ipiv_t            *ipiv,
                                        int                     ipivk,
                                        int                     myrank,
@@ -45,6 +46,7 @@ INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options,
                                        int                    *proc_involved  )
 {
     (void)options;
+    (void)dir;
     (void)ipiv;
     (void)ipivk;
     (void)myrank;
@@ -54,6 +56,7 @@ INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options,
 
 void
 INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
+                                       cham_dir_t              dir,
                                        CHAM_ipiv_t            *ipiv,
                                        int                     ipivk,
                                        const CHAM_desc_t      *A,
@@ -61,6 +64,7 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
                                        int                     n )
 {
     (void)options;
+    (void)dir;
     (void)ipiv;
     (void)ipivk;
     (void)A;
@@ -70,6 +74,7 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
 
 void
 INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
+                             cham_dir_t              dir,
                              const CHAM_desc_t      *A,
                              CHAM_desc_t            *U,
                              int                     Um,
@@ -81,6 +86,7 @@ INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
                              void                   *ws )
 {
     (void)options;
+    (void)dir;
     (void)A;
     (void)U;
     (void)Um;
diff --git a/runtime/starpu/codelets/codelet_zlaswp.c b/runtime/starpu/codelets/codelet_zlaswp.c
index 81c28d92f05d6c23e85e743b8402b79db31815b1..3829763abd896ca9db917a9d0573ac4d9b9b5255 100644
--- a/runtime/starpu/codelets/codelet_zlaswp.c
+++ b/runtime/starpu/codelets/codelet_zlaswp.c
@@ -13,7 +13,7 @@
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
  * @author Alycia Lisito
- * @date 2024-11-12
+ * @date 2025-03-24
  * @precisions normal z -> c d s
  *
  */
@@ -48,11 +48,12 @@ CODELETS_CPU( zlaswp_get, cl_zlaswp_get_cpu_func )
 #if defined(CHAMELEON_STARPU_USE_INSERT)
 
 void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
-                             int m0, int k,
+                             cham_dir_t dir, int m0, int k,
                              const CHAM_ipiv_t *ipiv, int ipivk,
                              const CHAM_desc_t *A, int Am, int An,
                              const CHAM_desc_t *U, int Um, int Un )
 {
+    void                  *ipiv_handle;
     struct starpu_codelet *codelet = &cl_zlaswp_get;
     if ( A->get_rankof( A, Am, An) != A->myrank ) {
         return;
@@ -63,12 +64,18 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
     clargs->m0 = m0;
     clargs->k  = k;
 
+    if ( dir == ChamDirForward ) {
+        ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk );
+    }
+    else {
+        ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk );
+    }
     //void (*callback)(void*) = options->profiling ? cl_zlaswp_get_callback : NULL;
 
     rt_starpu_insert_task(
         codelet,
         STARPU_CL_ARGS,             clargs, sizeof(struct cl_zlaswp_args_s),
-        STARPU_R,                   RUNTIME_perm_getaddr( ipiv, ipivk ),
+        STARPU_R,                   ipiv_handle,
         STARPU_R,                   RTBLKADDR(A, ChamComplexDouble, Am, An),
         STARPU_RW | STARPU_COMMUTE, RTBLKADDR(U, ChamComplexDouble, Um, Un),
         STARPU_PRIORITY,            options->priority,
@@ -80,18 +87,26 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
 #else /* defined(CHAMELEON_STARPU_USE_INSERT) */
 
 void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
-                             int m0, int k,
+                             cham_dir_t dir, int m0, int k,
                              const CHAM_ipiv_t *ipiv, int ipivk,
                              const CHAM_desc_t *A, int Am, int An,
                              const CHAM_desc_t *U, int Um, int Un )
 {
-    int ret;
+    int                 ret;
     struct starpu_task *task;
+    void               *ipiv_handle;
 
     if ( A->get_rankof( A, Am, An) != A->myrank ) {
         return;
     }
 
+    if ( dir == ChamDirForward ) {
+        ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk );
+    }
+    else {
+        ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk );
+    }
+
     INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zlaswp_get, zlaswp_get, zlaswp, 3);
 
     /*
@@ -99,8 +114,7 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
      */
     starpu_cham_exchange_init_params( options, &params, U->get_rankof( U, Um, Un ) );
     starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs,
-                                                  RUNTIME_perm_getaddr( ipiv, ipivk ),
-                                                  STARPU_R );
+                                                  ipiv_handle, STARPU_R );
     starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( A, ChamComplexDouble, Am, An ), STARPU_R );
     starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( U, ChamComplexDouble, Um, Un ),
                                 STARPU_RW | STARPU_COMMUTE );
@@ -157,12 +171,14 @@ static void cl_zlaswp_set_cpu_func( void *descr[], void *cl_arg )
 CODELETS_CPU( zlaswp_set, cl_zlaswp_set_cpu_func )
 
 #if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
-                             int m0, int k,
+                             cham_dir_t dir, int m0, int k,
                              const CHAM_ipiv_t *ipiv, int ipivk,
                              const CHAM_desc_t *A, int Am, int An,
                              const CHAM_desc_t *B, int Bm, int Bn )
 {
+    void                  *ipiv_handle;
     struct starpu_codelet *codelet = &cl_zlaswp_set;
     if ( B->get_rankof( B, Bm, Bn) != A->myrank ) {
         return;
@@ -173,12 +189,19 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
     clargs->m0 = m0;
     clargs->k  = k;
 
+    if ( dir == ChamDirForward ) {
+        ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk );
+    }
+    else {
+        ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk );
+    }
+
     //void (*callback)(void*) = options->profiling ? cl_zlaswp_set_callback : NULL;
 
     rt_starpu_insert_task(
         codelet,
         STARPU_CL_ARGS,           clargs, sizeof(struct cl_zlaswp_args_s),
-        STARPU_R,                 RUNTIME_invp_getaddr( ipiv, ipivk ),
+        STARPU_R,                 ipiv_handle,
         STARPU_R,                 RTBLKADDR(A, ChamComplexDouble, Am, An),
         STARPU_RW,                RTBLKADDR(B, ChamComplexDouble, Bm, Bn),
         STARPU_PRIORITY,          options->priority,
@@ -186,20 +209,30 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
         0 );
 }
-#else
+
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
+
 void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
-                             int m0, int k,
+                             cham_dir_t dir, int m0, int k,
                              const CHAM_ipiv_t *ipiv, int ipivk,
                              const CHAM_desc_t *A, int Am, int An,
                              const CHAM_desc_t *B, int Bm, int Bn )
 {
-    int ret;
+    int                 ret;
     struct starpu_task *task;
+    void               *ipiv_handle;
 
     if ( B->get_rankof( B, Bm, Bn) != A->myrank ) {
         return;
     }
 
+    if( dir == ChamDirForward ) {
+        ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk );
+    }
+    else {
+        ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk );
+    }
+
     INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zlaswp_set, zlaswp_set, zlaswp, 3);
 
     /*
@@ -207,8 +240,7 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
      */
     starpu_cham_exchange_init_params( options, &params, B->get_rankof( B, Bm, Bn ) );
     starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs,
-                                                  RUNTIME_invp_getaddr( ipiv, ipivk ),
-                                                  STARPU_R );
+                                                  ipiv_handle, STARPU_R );
     starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( A, ChamComplexDouble, Am, An ), STARPU_R );
     starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( B, ChamComplexDouble, Bm, Bn ), STARPU_RW );
 
@@ -242,4 +274,5 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
     }
     starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
 }
-#endif
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
+
diff --git a/runtime/starpu/codelets/codelet_zperm_allreduce.c b/runtime/starpu/codelets/codelet_zperm_allreduce.c
index e32b7ad9c46a2303eb1c4c6a18d442935fca6d3a..a479056c5f9321b75cd89a99349fd1ef1c3f3976 100644
--- a/runtime/starpu/codelets/codelet_zperm_allreduce.c
+++ b/runtime/starpu/codelets/codelet_zperm_allreduce.c
@@ -12,7 +12,7 @@
  * @version 1.3.0
  * @author Alycia Lisito
  * @author Pierre Esterie
- * @date 2024-11-14
+ * @date 2025-03-24
  * @precisions normal z -> c d s
  *
  */
@@ -21,6 +21,7 @@
 #include <coreblas/cblas_wrapper.h>
 
 #if defined(CHAMELEON_USE_MPI)
+
 struct cl_redux_args_s {
     int tempmm;
     int mb;
@@ -91,6 +92,7 @@ INSERT_TASK_zperm_allreduce_send( const RUNTIME_option_t *options,
 
 static void
 INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options,
+                                  cham_dir_t              dir,
                                   CHAM_desc_t            *U,
                                   CHAM_ipiv_t            *ipiv,
                                   int                     ipivk,
@@ -105,6 +107,15 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options,
                                   int                     p_first )
 {
     struct cl_redux_args_s *clargs;
+    void                   *ipiv_handle;
+
+    if ( dir == ChamDirForward ) {
+        ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk );
+    }
+    else {
+        ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk );
+    }
+
     clargs = malloc( sizeof( struct cl_redux_args_s ) );
     clargs->tempmm  = tempmm;
     clargs->mb      = U->mb;
@@ -121,7 +132,7 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options,
         STARPU_CL_ARGS,           clargs, sizeof(struct cl_redux_args_s),
         STARPU_RW,                RTBLKADDR(U, CHAMELEON_Complex64_t, me,  n),
         STARPU_R,                 RTBLKADDR(U, CHAMELEON_Complex64_t, src, n),
-        STARPU_R,                 RUNTIME_perm_getaddr( ipiv, ipivk ),
+        STARPU_R,                 ipiv_handle,
         STARPU_EXECUTE_ON_NODE,   me,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
         STARPU_PRIORITY,          options->priority,
@@ -151,6 +162,7 @@ INSERT_TASK_zperm_allreduce_send( const RUNTIME_option_t *options,
 
 static void
 INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options,
+                                  cham_dir_t              dir,
                                   CHAM_desc_t            *U,
                                   CHAM_ipiv_t            *ipiv,
                                   int                     ipivk,
@@ -164,8 +176,16 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options,
                                   int                     np,
                                   int                     p_first )
 {
-    int ret;
+    int                 ret;
     struct starpu_task *task;
+    void               *ipiv_handle;
+
+    if ( dir == ChamDirForward ) {
+        ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk );
+    }
+    else {
+        ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk );
+    }
 
     INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zperm_allreduce_send, zperm_allreduce, redux, 3 );
 
@@ -176,7 +196,7 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options,
     starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs,
                                                   RTBLKADDR( U, ChamComplexDouble, src, n ),
                                                   STARPU_R );
-    starpu_cham_register_descr( &nbdata, descrs, RUNTIME_perm_getaddr( ipiv, ipivk ), STARPU_R );
+    starpu_cham_register_descr( &nbdata, descrs, ipiv_handle, STARPU_R );
 
     task = starpu_task_create();
     task->cl = cl;
@@ -221,6 +241,7 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options,
 
 static void
 zperm_allreduce_chameleon_starpu_task( const RUNTIME_option_t     *options,
+                                       cham_dir_t                  dir,
                                        const CHAM_desc_t          *A,
                                        CHAM_desc_t                *U,
                                        int                         Um,
@@ -229,10 +250,10 @@ zperm_allreduce_chameleon_starpu_task( const RUNTIME_option_t     *options,
                                        int                         ipivk,
                                        int                         k,
                                        int                         n,
-                                       struct chameleon_pzgetrf_s *ws)
+                                       struct chameleon_pzgetrf_s *ws )
 {
     int *proc_involved = ws->proc_involved;
-    int  np_involved   = chameleon_min( chameleon_desc_datadist_get_iparam(A, 0), A->mt - k);
+    int  np_involved   = chameleon_min( chameleon_desc_datadist_get_iparam(A, 0), A->mt - k );
     int  np_iter       = np_involved;
     int  p_recv, p_send, me, p_first;
     int  shift = 1;
@@ -253,9 +274,11 @@ zperm_allreduce_chameleon_starpu_task( const RUNTIME_option_t     *options,
             p_recv = proc_involved[ ( me - shift + np_involved ) % np_involved ];
 
             INSERT_TASK_zperm_allreduce_send( options, U, A->myrank, p_send, n );
-            INSERT_TASK_zperm_allreduce_recv( options, U, ipiv, ipivk, A->myrank, p_recv,
+            INSERT_TASK_zperm_allreduce_recv( options, dir, U, ipiv, ipivk, A->myrank, p_recv,
                                               n, k == (A->mt-1) ? A->m - k * A->mb : A->mb,
-                                              chameleon_desc_datadist_get_iparam(A, 0), chameleon_desc_datadist_get_iparam(A, 1), shift, np_involved, p_first );
+                                              chameleon_desc_datadist_get_iparam(A, 0),
+                                              chameleon_desc_datadist_get_iparam(A, 1),
+                                              shift, np_involved, p_first );
 
             shift   = shift << 1;
             np_iter = chameleon_ceil( np_iter, 2 );
@@ -265,6 +288,7 @@ zperm_allreduce_chameleon_starpu_task( const RUNTIME_option_t     *options,
 
 void
 INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
+                             cham_dir_t              dir, /* passed through unchanged to zperm_allreduce_chameleon_starpu_task() */
                              const CHAM_desc_t      *A,
                              CHAM_desc_t            *U,
                              int                     Um,
@@ -280,7 +304,7 @@ INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
     switch( alg ) {
     case ChamStarPUTasks:
     default:
-        zperm_allreduce_chameleon_starpu_task( options, A, U, Um, Un, ipiv, ipivk, k, n, tmp );
+        zperm_allreduce_chameleon_starpu_task( options, dir, A, U, Um, Un, ipiv, ipivk, k, n, tmp );
     }
 }
 
@@ -307,33 +331,51 @@ INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options,
 
 void
 INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options,
+                                       cham_dir_t              dir, /* selects which ipiv handle is exchanged (see below) */
                                        CHAM_ipiv_t            *ipiv,
                                        int                     ipivk,
                                        int                     myrank,
                                        int                     np,
                                        int                    *proc_involved )
 {
-    int p;
+    int   p;
+    void *ipiv_handle; /* handle given to every starpu_mpi_get_data_on_node_detached() call below */
+
+    if ( dir == ChamDirForward ) { /* forward direction: handle returned by RUNTIME_perm_getaddr() */
+        ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk );
+    }
+    else { /* any other direction: handle returned by RUNTIME_invp_getaddr() */
+        ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk );
+    }
 
     for ( p = 0; p < np; p++ ) {
         if ( proc_involved[ p ] == myrank ) {
             continue;
         }
         starpu_mpi_get_data_on_node_detached( options->sequence->comm,
-                                              RUNTIME_perm_getaddr( ipiv, ipivk ),
+                                              ipiv_handle, /* looked up once before the loop rather than per iteration */
                                               proc_involved[ p ], NULL, NULL );
     }
 }
 
 void
 INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
+                                       cham_dir_t              dir, /* selects which ipiv handle is exchanged (see below) */
                                        CHAM_ipiv_t            *ipiv,
                                        int                     ipivk,
                                        const CHAM_desc_t      *A,
                                        int                     k,
                                        int                     n )
 {
-    int b, rank;
+    int   b, rank;
+    void *ipiv_handle; /* handle given to every starpu_mpi_get_data_on_node_detached() call below */
+
+    if ( dir == ChamDirForward ) { /* forward: invp handle -- the reverse of the choice in INSERT_TASK_zperm_allreduce_send_perm() */
+        ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk );
+    }
+    else { /* any other direction: handle returned by RUNTIME_perm_getaddr() */
+        ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk );
+    }
 
     for ( b = k+1; (b < A->mt) && ((b-(k+1)) < chameleon_desc_datadist_get_iparam(A, 0)); b ++ ) {
         rank = A->get_rankof( A, b, n );
@@ -341,10 +383,11 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
             continue;
         }
         starpu_mpi_get_data_on_node_detached( options->sequence->comm,
-                                              RUNTIME_invp_getaddr( ipiv, ipivk ),
+                                              ipiv_handle, /* looked up once before the loop rather than per iteration */
                                               rank, NULL, NULL );
     }
 }
+
 #else
 void
 INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options,
@@ -353,7 +396,7 @@ INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options,
                                     int                     An,
                                     int                     myrank,
                                     int                     np,
-                                    int                    *proc_involved  )
+                                    int                    *proc_involved )
 {
     (void)options;
     (void)A;
@@ -366,11 +409,12 @@ INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options,
 
 void
 INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options,
+                                       cham_dir_t              dir,
                                        CHAM_ipiv_t            *ipiv,
                                        int                     ipivk,
                                        int                     myrank,
                                        int                     np,
-                                       int                    *proc_involved  )
+                                       int                    *proc_involved )
 {
     (void)options;
     (void)ipiv;
@@ -382,6 +426,7 @@ INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options,
 
 void
 INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
+                                       cham_dir_t              dir,
                                        CHAM_ipiv_t            *ipiv,
                                        int                     ipivk,
                                        const CHAM_desc_t      *A,
@@ -398,6 +443,7 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
 
 void
 INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
+                             cham_dir_t              dir,
                              const CHAM_desc_t      *A,
                              CHAM_desc_t            *U,
                              int                     Um,
@@ -419,4 +465,5 @@ INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
     (void)n;
     (void)ws;
 }
+
 #endif