diff --git a/cmake_modules/local_subs.py b/cmake_modules/local_subs.py
index afd17c16f2a60d1b5cb35616151072872fdb3de2..892e1405401236a94252ff1d3281d65a571e0880 100644
--- a/cmake_modules/local_subs.py
+++ b/cmake_modules/local_subs.py
@@ -52,6 +52,7 @@ _extra_blas = [
     ('',                     'sgered',               'dgered',               'cgered',               'zgered'              ),
     ('',                     'sgerst',               'dgerst',               'cgerst',               'zgerst'              ),
     ('',                     'sipiv_allreduce',      'dipiv_allreduce',      'cipiv_allreduce',      'zipiv_allreduce'     ),
+    ('',                     'sperm_allreduce',      'dperm_allreduce',      'cperm_allreduce',      'zperm_allreduce'     ),
 ]
 
 _extra_BLAS = [ [ x.upper() for x in row ] for row in _extra_blas ]
diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c
index d95b415d9bbe5473f38be0d3b304c7c3c898adf1..5b63d1e010f2d12dab9eef167f12cf94b8e0e4a4 100644
--- a/compute/pzgetrf.c
+++ b/compute/pzgetrf.c
@@ -26,6 +26,7 @@
 #define A(m,n)  A,        m, n
 #define U(m,n)  &(ws->U), m, n
 #define Up(m,n)  &(ws->Up), m, n
+#define Wu(m,n)  &(ws->Wu), m, n
 
 /*
  * All the functions below are panel factorization variant.
@@ -214,10 +215,6 @@ chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws,
     int m, h, b, nbblock;
     int tempkm, tempkn, tempmm, minmn;
 
-    if ( ! ws->involved ) {
-        return;
-    }
-
     tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
     tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
     minmn  = chameleon_min( tempkm, tempkn );
@@ -340,25 +337,10 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws,
                                int                         k,
                                RUNTIME_option_t           *options )
 {
-#if defined ( CHAMELEON_USE_MPI )
-    int *proc_involved = malloc( sizeof( int ) * chameleon_min( A->p, A->mt - k) );
-    int  b;
-
-    /* 2DBC only */
-    ws->involved = 0;
-    for ( b = k; (b < A->mt) && ((b-k) < A->p); b ++ ) {
-        int rank = chameleon_getrankof_2d( A, b, k );
-        proc_involved[ b-k ] = rank;
-        if ( rank == A->myrank ) {
-            ws->involved = 1;
-        }
-    }
-    ws->proc_involved = proc_involved;
-    if ( ws->involved == 0 ) {
-	free( proc_involved );
+    chameleon_get_proc_involved_in_panelk_2dbc( A, k, k, ws );
+    if ( !ws->involved ) {
         return;
     }
-#endif
 
     /* TODO: Should be replaced by a function pointer */
     switch( ws->alg ) {
@@ -388,9 +370,6 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws,
     default:
         chameleon_pzgetrf_panel_facto_nopiv( ws, A, ipiv, k, options );
     }
-#if defined ( CHAMELEON_USE_MPI )
-    free( proc_involved );
-#endif
 }
 
 /**
@@ -411,6 +390,19 @@ chameleon_pzgetrf_panel_permute( struct chameleon_pzgetrf_s *ws,
         int m;
         int tempkm, tempkn, tempnn, minmn;
 
+        chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws );
+        if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) {
+            INSERT_TASK_zperm_allreduce_send_perm( options, ipiv, k, A->myrank, ws->np_involved, ws->proc_involved );
+            INSERT_TASK_zperm_allreduce_send_invp( options, ipiv, k, A, k, n );
+        }
+        if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) {
+            INSERT_TASK_zperm_allreduce_send_A( options, A, k, n, A->myrank, ws->np_involved, ws->proc_involved );
+        }
+
+        if ( !ws->involved ) {
+            return;
+        }
+
         tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
         tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
         tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
@@ -418,28 +410,26 @@ chameleon_pzgetrf_panel_permute( struct chameleon_pzgetrf_s *ws,
 
         /* Extract selected rows into U */
         INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn,
-                            A(k, n), U(k, n) );
+                            A(k, n), Wu(A->myrank, n) );
 
         /*
          * perm array is made of size tempkm for the first row especially.
          * Otherwise, the final copy back to the tile may copy only a partial tile
          */
         INSERT_TASK_zlaswp_get( options, k*A->mb, tempkm,
-                                ipiv, k, A(k, n), U(k, n) );
+                                ipiv, k, A(k, n), Wu(A->myrank, n) );
 
         for(m=k+1; m<A->mt; m++){
             /* Extract selected rows into A(k, n) */
             INSERT_TASK_zlaswp_get( options, m*A->mb, minmn,
-                                    ipiv, k, A(m, n), U(k, n) );
+                                    ipiv, k, A(m, n), Wu(A->myrank, n) );
             /* Copy rows from A(k,n) into their final position */
             INSERT_TASK_zlaswp_set( options, m*A->mb, minmn,
                                     ipiv, k, A(k, n), A(m, n) );
         }
 
-        INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn,
-                            U(k, n), A(k, n) );
-
-        RUNTIME_data_flush( options->sequence, U(k, n) );
+        INSERT_TASK_zperm_allreduce( options, A, ipiv, k, k, n,
+                                     Wu(A->myrank, n), ws );
     }
     break;
     default:
@@ -462,6 +452,20 @@ chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws,
     {
         int m;
         int tempkm, tempkn, tempnn, minmn;
+
+        chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws );
+        if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) {
+            INSERT_TASK_zperm_allreduce_send_perm( options, ipiv, k, A->myrank, ws->np_involved, ws->proc_involved );
+            INSERT_TASK_zperm_allreduce_send_invp( options, ipiv, k, A, k, n );
+        }
+        if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) {
+            INSERT_TASK_zperm_allreduce_send_A( options, A, k, n, A->myrank, ws->np_involved, ws->proc_involved );
+        }
+
+        if ( !ws->involved ) {
+            return;
+        }
+
         void **clargs = malloc( sizeof(char *) );
         *clargs = NULL;
 
@@ -472,25 +476,23 @@ chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws,
 
         /* Extract selected rows into U */
         INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn,
-                            A(k, n), U(k, n) );
+                            A(k, n), Wu(A->myrank, n) );
 
         /*
          * perm array is made of size tempkm for the first row especially.
          * Otherwise, the final copy back to the tile may copy only a partial tile
          */
         INSERT_TASK_zlaswp_get( options, k*A->mb, tempkm,
-                                ipiv, k, A(k, n), U(k, n) );
+                                ipiv, k, A(k, n), Wu(A->myrank, n) );
 
         for(m=k+1; m<A->mt; m++){
-            INSERT_TASK_zlaswp_batched( options, m*A->mb, minmn, k, m, n, (void *)ws,
-                                        ipiv, k, A, &(ws->U), clargs );
+            INSERT_TASK_zlaswp_batched( options, m*A->mb, minmn, (void *)ws, ipiv, k,
+                                        A(m, n), A(k, n), Wu(A->myrank, n), clargs );
         }
-        INSERT_TASK_zlaswp_batched_flush( options, k, n, ipiv, k, A, &(ws->U), clargs );
+        INSERT_TASK_zlaswp_batched_flush( options, ipiv, k, A(k, n), Wu(A->myrank, n), clargs );
 
-        INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn,
-                            U(k, n), A(k, n) );
+        INSERT_TASK_zperm_allreduce( options, A, ipiv, k, k, n, Wu(A->myrank, n), ws );
 
-        RUNTIME_data_flush( options->sequence, U(k, n) );
         free( clargs );
     }
     break;
@@ -510,7 +512,7 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws,
     const CHAMELEON_Complex64_t zone  = (CHAMELEON_Complex64_t) 1.0;
     const CHAMELEON_Complex64_t mzone = (CHAMELEON_Complex64_t)-1.0;
 
-    int m, tempkm, tempmm, tempnn;
+    int m, tempkm, tempmm, tempnn, rankAmn, p;
 
     tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
     tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
@@ -522,25 +524,44 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws,
         chameleon_pzgetrf_panel_permute( ws, A, ipiv, k, n, options );
     }
 
-    INSERT_TASK_ztrsm(
-        options,
-        ChamLeft, ChamLower, ChamNoTrans, ChamUnit,
-        tempkm, tempnn, A->mb,
-        zone, A(k, k),
-              A(k, n) );
+    if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) {
+        for ( p = 0; p < ws->np_involved; p++ ) {
+            INSERT_TASK_ztrsm(
+                options,
+                ChamLeft, ChamLower, ChamNoTrans, ChamUnit,
+                tempkm, tempnn, A->mb,
+                zone, A(k, k),
+                      Wu(ws->proc_involved[p], n) );
+        }
+    }
+    else if ( ws->involved ) {
+        INSERT_TASK_ztrsm(
+            options,
+            ChamLeft, ChamLower, ChamNoTrans, ChamUnit,
+            tempkm, tempnn, A->mb,
+            zone, A(k, k),
+                  Wu(A->myrank, n) );
+    }
 
     for (m = k+1; m < A->mt; m++) {
         tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
+        rankAmn = A->get_rankof( A, m, n );
 
         INSERT_TASK_zgemm(
             options,
             ChamNoTrans, ChamNoTrans,
             tempmm, tempnn, A->mb, A->mb,
             mzone, A(m, k),
-                   A(k, n),
+                   Wu(rankAmn, n),
             zone,  A(m, n) );
     }
 
+    if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) {
+        INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn,
+                            Wu(A->myrank, n), A(k, n) );
+    }
+
+    RUNTIME_data_flush( options->sequence, Wu(A->myrank, n) );
     RUNTIME_data_flush( options->sequence, A(k, n) );
 }
 
@@ -556,7 +577,7 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws,
     CHAM_context_t  *chamctxt;
     RUNTIME_option_t options;
 
-    int k, m, n;
+    int k, m, n, tempkm, tempnn;
     int min_mnt = chameleon_min( A->mt, A->nt );
 
     chamctxt = chameleon_context_self();
@@ -581,7 +602,11 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws,
 
         for (n = k+1; n < A->nt; n++) {
             options.priority = A->nt-n;
-            chameleon_pzgetrf_panel_update( ws, A, IPIV, k, n, &options );
+            if ( chameleon_involved_in_panelk_2dbc( A, k ) ||
+                 chameleon_involved_in_panelk_2dbc( A, n ) )
+            {
+                chameleon_pzgetrf_panel_update( ws, A, IPIV, k, n, &options );
+            }
         }
 
         /* Flush panel k */
@@ -596,7 +621,19 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws,
     if ( ws->batch_size > 0 ) {
         for (k = 1; k < min_mnt; k++) {
             for (n = 0; n < k; n++) {
-                chameleon_pzgetrf_panel_permute_batched( ws, A, IPIV, k, n, &options );
+                if ( chameleon_involved_in_panelk_2dbc( A, k ) ||
+                    chameleon_involved_in_panelk_2dbc( A, n ) )
+                {
+                    chameleon_pzgetrf_panel_permute_batched( ws, A, IPIV, k, n, &options );
+                    if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) {
+                        tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
+                        tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
+                        INSERT_TASK_zlacpy( &options, ChamUpperLower, tempkm, tempnn,
+                                            Wu(A->myrank, n), A(k, n) );
+                        RUNTIME_data_flush( sequence, A(k, n) );
+                    }
+                }
+                RUNTIME_data_flush( sequence, Wu(A->myrank, n) );
             }
             RUNTIME_perm_flushk( sequence, IPIV, k );
         }
@@ -604,7 +641,19 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws,
     else {
         for (k = 1; k < min_mnt; k++) {
             for (n = 0; n < k; n++) {
-                chameleon_pzgetrf_panel_permute( ws, A, IPIV, k, n, &options );
+                if ( chameleon_involved_in_panelk_2dbc( A, k ) ||
+                    chameleon_involved_in_panelk_2dbc( A, n ) )
+                {
+                    chameleon_pzgetrf_panel_permute( ws, A, IPIV, k, n, &options );
+                    if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) {
+                        tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
+                        tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
+                        INSERT_TASK_zlacpy( &options, ChamUpperLower, tempkm, tempnn,
+                                            Wu(A->myrank, n), A(k, n) );
+                        RUNTIME_data_flush( sequence, A(k, n) );
+                    }
+                }
+                RUNTIME_data_flush( sequence, Wu(A->myrank, n) );
             }
             RUNTIME_perm_flushk( sequence, IPIV, k );
         }
diff --git a/compute/zgetrf.c b/compute/zgetrf.c
index 8fb6734d3e15fe2cc25fb9c1664db8bc9a0f6987..b7e8f87b622c35d68f557f9c59393eabc017c679 100644
--- a/compute/zgetrf.c
+++ b/compute/zgetrf.c
@@ -67,6 +67,12 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A )
     ws->alg = ChamGetrfPPiv;
     ws->ib  = CHAMELEON_IB;
 
+#if defined (CHAMELEON_USE_MPI)
+    ws->proc_involved = malloc( sizeof( int ) * A->p );
+    ws->involved      = 0;
+    ws->np_involved   = 0;
+#endif
+
     {
         char *algostr = chameleon_getenv( "CHAMELEON_GETRF_ALGO" );
 
@@ -112,6 +118,11 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A )
                              A->m, A->n, 0, 0,
                              A->m, A->n, A->p, A->q,
                              NULL, NULL, A->get_rankof_init, A->get_rankof_init_arg );
+        chameleon_desc_init( &(ws->Wu), CHAMELEON_MAT_ALLOC_TILE,
+                             ChamComplexDouble, A->mb, A->nb, A->mb*A->nb,
+                             A->mb * A->p * A->q, A->n, 0, 0,
+                             A->mb * A->p * A->q, A->n, A->p * A->q, 1,
+                             NULL, NULL, NULL, A->get_rankof_init_arg );
     }
 
     /* Set ib to 1 if per column algorithm */
@@ -160,6 +171,10 @@ CHAMELEON_zgetrf_WS_Free( void *user_ws )
 {
     struct chameleon_pzgetrf_s *ws = (struct chameleon_pzgetrf_s *)user_ws;
 
+#if defined (CHAMELEON_USE_MPI)
+    free( ws->proc_involved );
+#endif
+
     if ( ( ws->alg == ChamGetrfNoPivPerColumn ) ||
          ( ws->alg == ChamGetrfPPiv           ) ||
          ( ws->alg == ChamGetrfPPivPerColumn  ) )
@@ -170,6 +185,11 @@ CHAMELEON_zgetrf_WS_Free( void *user_ws )
     {
         chameleon_desc_destroy( &(ws->Up) );
     }
+    if ( ( ws->alg == ChamGetrfPPiv           ) ||
+         ( ws->alg == ChamGetrfPPivPerColumn  ) )
+    {
+        chameleon_desc_destroy( &(ws->Wu) );
+    }
     free( ws );
 }
 
diff --git a/control/compute_z.h b/control/compute_z.h
index 65d580ad9b0b10e40bd8ffdbc296fe833aa590fb..acb9599f295c77774acccef98c88d7db22c59362 100644
--- a/control/compute_z.h
+++ b/control/compute_z.h
@@ -43,13 +43,15 @@ struct chameleon_pzgemm_s {
  * @brief Data structure to handle the GETRF workspaces with partial pivoting
  */
 struct chameleon_pzgetrf_s {
-    cham_getrf_t alg;
-    int          ib;         /**< Internal blocking parameter */
-    int          batch_size; /**< Batch size for the panel    */
-    CHAM_desc_t  U;
-    CHAM_desc_t  Up;
-    int         *proc_involved;
-    unsigned int involved:1;
+    cham_getrf_t   alg;        /**< Algorithm variant selected for the factorization */
+    int            ib;         /**< Internal blocking parameter */
+    int            batch_size; /**< Batch size for the panel    */
+    CHAM_desc_t    U;
+    CHAM_desc_t    Up; /**< Workspace used for the panel factorization    */
+    CHAM_desc_t    Wu; /**< Workspace used for the permutation and update */
+    int           *proc_involved; /**< Ranks of the processes involved in the current panel */
+    unsigned int   involved;      /**< Set to 1 when the current process is involved, 0 otherwise */
+    int            np_involved;   /**< Number of ranks stored in proc_involved */
 };
 
 /**
diff --git a/control/descriptor_helpers.c b/control/descriptor_helpers.c
index 9cae1883552fc8f418aca49140cf904dbcdcbed8..b49cb69e9b751e4494ae01de14571010c64e980c 100644
--- a/control/descriptor_helpers.c
+++ b/control/descriptor_helpers.c
@@ -100,6 +100,52 @@ int chameleon_involved_in_panelk_2dbc( const CHAM_desc_t *A, int k ) {
     return ( myrank % A->q == k % A->q );
 }
 
+/**
+ * @brief Get the MPI processes involved in the panel k of column n for 2DBC distributions.
+ *
+ * @param[in] A
+ *        The matrix descriptor.
+ *
+ * @param[in] k
+ *        The index of the first tile row of the panel.
+ *
+ * @param[in] n
+ *        The index of the tile column of the panel.
+ *
+ * @param[inout] ws_getrf
+ *        The getrf workspace in which proc_involved, np_involved and involved are stored.
+ *
+ */
+void chameleon_get_proc_involved_in_panelk_2dbc( const CHAM_desc_t *A,
+                                                 int                k,
+                                                 int                n,
+                                                 void              *ws_getrf )
+{
+    struct chameleon_pzgetrf_s *ws = (struct chameleon_pzgetrf_s *)ws_getrf;
+#if defined (CHAMELEON_USE_MPI)
+    int *proc_involved = ws->proc_involved;
+    int  b, rank, np = 0;
+
+    /* Register the owner of each tile (b, n), b >= k, for at most A->p rows */
+    ws->involved = 0;
+    for ( b = k; (b < A->mt) && ((b-k) < A->p); b ++ ) {
+        rank = chameleon_getrankof_2d( A, b, n );
+        proc_involved[ b-k ] = rank;
+        np ++;
+        if ( rank == A->myrank ) {
+            ws->involved = 1;
+        }
+    }
+    ws->np_involved = np;
+#else
+    /* Without MPI, callers still test ws->involved: the process always works */
+    ws->involved = 1;
+    (void)A;
+    (void)k;
+    (void)n;
+#endif
+}
+
 /**
  * @brief Initializes a custom distribution based on an external file.
  *
diff --git a/include/chameleon/descriptor_helpers.h b/include/chameleon/descriptor_helpers.h
index da79d04863f4180e6c6ce929fee6b33235998fc3..7bfdeb77ba565c2da0f9668f5d7d347e6e44112c 100644
--- a/include/chameleon/descriptor_helpers.h
+++ b/include/chameleon/descriptor_helpers.h
@@ -64,6 +64,10 @@ int chameleon_getrankof_custom        ( const CHAM_desc_t *A, int m, int n );
  */
 
 int chameleon_involved_in_panelk_2dbc( const CHAM_desc_t *A, int An );
+void chameleon_get_proc_involved_in_panelk_2dbc( const CHAM_desc_t *A,
+                                                 int                k,
+                                                 int                n,
+                                                 void              *ws_getrf );
 
 /**
  * @}
diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h
index 236482682266032654bcc6a8e6050b617134fa98..5f1bbcd322293e3104fca313974096bcf711de71 100644
--- a/include/chameleon/tasks_z.h
+++ b/include/chameleon/tasks_z.h
@@ -199,17 +199,17 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
                              const CHAM_desc_t *tileA, int tileAm, int tileAn,
                              const CHAM_desc_t *tileB, int tileBm, int tileBn );
 void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options,
-                                 int m0, int minmn, int k, int m, int n,
+                                 int m0, int minmn,
                                  void *ws,
                                  const CHAM_ipiv_t *ipiv, int ipivk,
-                                 const CHAM_desc_t *A,
-                                 const CHAM_desc_t *U,
+                                 const CHAM_desc_t *Am, int Amm, int Amn,
+                                 const CHAM_desc_t *Ak, int Akm, int Akn,
+                                 const CHAM_desc_t *U,  int Um,  int Un,
                                  void **clargs_ptr );
 void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options,
-                                       int k, int n,
                                        const CHAM_ipiv_t *ipiv, int ipivk,
-                                       const CHAM_desc_t *A,
-                                       const CHAM_desc_t *U,
+                                       const CHAM_desc_t *Ak, int Akm, int Akn,
+                                       const CHAM_desc_t *U,  int Um,  int Un,
                                        void **clargs_ptr );
 void INSERT_TASK_zlatro( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, cham_trans_t trans, int m, int n, int mb,
@@ -583,4 +583,181 @@ void INSERT_TASK_zipiv_allreduce( CHAM_desc_t            *A,
                                   int                     h,
                                   int                     n );
 
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t
+ *
+ *  INSERT_TASK_zperm_allreduce - Performs an allreduce operation on the tile
+ * U(Um, Un) according to the permutation ipiv. This task is used in the LU
+ * factorization with partial pivoting.
+ *
+ *******************************************************************************
+ *
+ * @param[in] options
+ *          The runtime options data structure to pass through all insert_task calls.
+ *
+ * @param[in] A
+ *          The descriptor of the matrix A.
+ *
+ * @param[in] ipiv
+ *          The pivot structure that contains the information for the LU
+ *          factorization with partial pivoting.
+ *
+ * @param[in] ipivk
+ *          The index of the permutation.
+ *
+ * @param[in] k
+ *          The index of the panel factorized.
+ *
+ * @param[in] n
+ *          The index of the panel to permute.
+ *
+ * @param[inout] U
+ *          The descriptor of the workspace used for the permutation in the LU
+ *          factorization with partial pivoting.
+ *
+ * @param[in] Um
+ *          The row index of the tile used in U.
+ *
+ * @param[in] Un
+ *          The column index of the tile used in U.
+ *
+ * @param[in] ws
+ *          The workspace to handle the data in the LU factorization with
+ *          partial pivoting.
+ *
+ *******************************************************************************
+ */
+void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
+                                  const CHAM_desc_t      *A,
+                                  CHAM_ipiv_t            *ipiv,
+                                  int                     ipivk,
+                                  int                     k,
+                                  int                     n,
+                                  CHAM_desc_t            *U,
+                                  int                     Um,
+                                  int                     Un,
+                                  void                   *ws );
+
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t
+ *
+ *  INSERT_TASK_zperm_allreduce_send_A - Sends the tile A(Am, An) to the processes
+ * involved in the permutation. This task is used in the LU factorization with
+ * partial pivoting.
+ *
+ *******************************************************************************
+ *
+ * @param[in] options
+ *          The runtime options data structure to pass through all insert_task calls.
+ *
+ * @param[in] A
+ *          The descriptor of the matrix A.
+ *
+ * @param[in] Am
+ *          The row index of the tile used in A.
+ *
+ * @param[in] An
+ *          The column index of the tile used in A.
+ *
+ * @param[in] myrank
+ *          The rank of the current process.
+ *
+ * @param[in] np
+ *          The number of processes involved in the permutation.
+ *
+ * @param[in] proc_involved
+ *          The list of the processes involved in the permutation.
+ *
+ *******************************************************************************
+ */
+void INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options,
+                                         CHAM_desc_t            *A,
+                                         int                     Am,
+                                         int                     An,
+                                         int                     myrank,
+                                         int                     np,
+                                         int                    *proc_involved );
+
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t
+ *
+ *  INSERT_TASK_zperm_allreduce_send_perm - Sends the permutation ipivk to the
+ * processes involved in the permutation. This task is used in the LU
+ * factorization with partial pivoting.
+ *
+ *******************************************************************************
+ *
+ * @param[in] options
+ *          The runtime options data structure to pass through all insert_task calls.
+ *
+ * @param[in] ipiv
+ *          The pivot structure that contains the information for the LU
+ *          factorization with partial pivoting.
+ *
+ * @param[in] ipivk
+ *          The index of the permutation.
+ *
+ * @param[in] myrank
+ *          The rank of the current process.
+ *
+ * @param[in] np
+ *          The number of processes involved in the permutation.
+ *
+ * @param[in] proc_involved
+ *          The list of the processes involved in the permutation.
+ *
+ *******************************************************************************
+ */
+void INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options,
+                                            CHAM_ipiv_t            *ipiv,
+                                            int                     ipivk,
+                                            int                     myrank,
+                                            int                     np,
+                                            int                    *proc_involved );
+
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t
+ *
+ *  INSERT_TASK_zperm_allreduce_send_invp - Sends the inverse permutation ipivk
+ * to the processes involved in the permutation. This task is used in the LU
+ * factorization with partial pivoting.
+ *
+ *******************************************************************************
+ *
+ * @param[in] options
+ *          The runtime options data structure to pass through all insert_task calls.
+ *
+ * @param[in] ipiv
+ *          The pivot structure that contains the information for the LU
+ *          factorization with partial pivoting.
+ *
+ * @param[in] ipivk
+ *          The index of the permutation.
+ *
+ * @param[in] A
+ *          The descriptor of the matrix A.
+ *
+ * @param[in] k
+ *          The index of the panel factorized.
+ *
+ * @param[in] n
+ *          The index of the panel to permute.
+ *
+ *******************************************************************************
+ */
+void INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
+                                            CHAM_ipiv_t            *ipiv,
+                                            int                     ipivk,
+                                            const CHAM_desc_t      *A,
+                                            int                     k,
+                                            int                     n );
+
 #endif /* _chameleon_tasks_z_h_ */
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index 6b24081b2bd7f58f330e28b142f8c714ba208009..e46fd45b105edfcf96d1bccb2a6780f481a1a9a7 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -86,6 +86,7 @@ set(CODELETS_ZSRC
     codelets/codelet_zlaswp_batched.c
     codelets/codelet_zlatro.c
     codelets/codelet_zlauum.c
+    codelets/codelet_zperm_allreduce.c
     codelets/codelet_zplghe.c
     codelets/codelet_zplgsy.c
     codelets/codelet_zplrnt.c
diff --git a/runtime/openmp/codelets/codelet_zlaswp_batched.c b/runtime/openmp/codelets/codelet_zlaswp_batched.c
index 49ac5381ca1d1e4fe3bfb562811675b3d909765b..07fd1eab85abeb6913936ef7980fec919fb03443 100644
--- a/runtime/openmp/codelets/codelet_zlaswp_batched.c
+++ b/runtime/openmp/codelets/codelet_zlaswp_batched.c
@@ -21,45 +21,57 @@
 void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options,
                                  int                     m0,
                                  int                     minmn,
-                                 int                     k,
-                                 int                     m,
-                                 int                     n,
                                  void                   *ws,
                                  const CHAM_ipiv_t      *ipiv,
                                  int                     ipivk,
-                                 const CHAM_desc_t      *A,
-                                 const CHAM_desc_t      *Wu,
+                                 const CHAM_desc_t      *Am,
+                                 int                     Amm,
+                                 int                     Amn,
+                                 const CHAM_desc_t      *Ak,
+                                 int                     Akm,
+                                 int                     Akn,
+                                 const CHAM_desc_t      *U,
+                                 int                     Um,
+                                 int                     Un,
                                  void                  **clargs_ptr )
 {
     (void)options;
     (void)m0;
     (void)minmn;
-    (void)k;
-    (void)m;
-    (void)n;
     (void)ws;
     (void)ipiv;
     (void)ipivk;
-    (void)A;
-    (void)Wu;
+    (void)Am;
+    (void)Amm;
+    (void)Amn;
+    (void)Ak;
+    (void)Akm;
+    (void)Akn;
+    (void)U;
+    (void)Um;
+    (void)Un;
     (void)clargs_ptr;
 }
 
 void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options,
-                                       int                     k,
-                                       int                     n,
                                        const CHAM_ipiv_t      *ipiv,
                                        int                     ipivk,
-                                       const CHAM_desc_t      *A,
+                                       const CHAM_desc_t      *Ak,
+                                       int                     Akm,
+                                       int                     Akn,
                                        const CHAM_desc_t      *U,
+                                       int                     Um,
+                                       int                     Un,
                                        void                  **clargs_ptr )
 {
     (void)options;
-    (void)k;
-    (void)n;
     (void)ipiv;
     (void)ipivk;
-    (void)A;
+    (void)Ak;
+    (void)Akm;
+    (void)Akn;
     (void)U;
+    (void)Um;
+    (void)Un;
     (void)clargs_ptr;
 }
diff --git a/runtime/openmp/codelets/codelet_zperm_allreduce.c b/runtime/openmp/codelets/codelet_zperm_allreduce.c
new file mode 100644
index 0000000000000000000000000000000000000000..cb77c806bcb8ce47a62e7b4e19b2dad3dafc8218
--- /dev/null
+++ b/runtime/openmp/codelets/codelet_zperm_allreduce.c
@@ -0,0 +1,93 @@
+/**
+ *
+ * @file openmp/codelet_zperm_allreduce.c
+ *
+ * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon OpenMP zperm_allreduce codelets (no-op implementations)
+ *
+ * @version 1.3.0
+ * @author Alycia Lisito
+ * @date 2024-06-11
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_openmp.h"
+#include "chameleon/tasks_z.h"
+
+/**
+ * OpenMP version of INSERT_TASK_zperm_allreduce_send_A.
+ * It performs no operation: every argument is discarded.
+ */
+void
+INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options,
+                                    CHAM_desc_t *A, int Am, int An,
+                                    int myrank, int np,
+                                    int *proc_involved )
+{
+    /* The (void) casts mark the parameters as intentionally unused. */
+    (void)options;
+    (void)A; (void)Am; (void)An;
+    (void)myrank;
+    (void)np;
+    (void)proc_involved;
+}
+
+/**
+ * OpenMP version: performs no operation, every argument is discarded.
+ */
+void
+INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options,
+                                       CHAM_ipiv_t *ipiv, int ipivk,
+                                       int myrank, int np,
+                                       int *proc_involved )
+{
+    /* The (void) casts mark the parameters as intentionally unused. */
+    (void)options;
+    (void)ipiv; (void)ipivk;
+    (void)myrank; (void)np;
+    (void)proc_involved;
+}
+
+/**
+ * OpenMP version: performs no operation, every argument is discarded.
+ */
+void
+INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
+                                       CHAM_ipiv_t *ipiv, int ipivk,
+                                       const CHAM_desc_t *A,
+                                       int k, int n )
+{
+    /* The (void) casts mark the parameters as intentionally unused. */
+    (void)options;
+    (void)ipiv; (void)ipivk;
+    (void)A;
+    (void)k; (void)n;
+}
+
+/**
+ * OpenMP version of INSERT_TASK_zperm_allreduce.
+ * It performs no operation: every argument is discarded.
+ */
+void
+INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
+                             const CHAM_desc_t      *A,
+                             CHAM_ipiv_t            *ipiv,
+                             int                     ipivk,
+                             int k, int n,
+                             CHAM_desc_t            *U,
+                             int Um, int Un,
+                             void                   *ws )
+{
+    /* The (void) casts mark the parameters as intentionally unused. */
+    (void)options;
+    (void)A;
+    (void)ipiv; (void)ipivk;
+    (void)k; (void)n;
+    (void)U;
+    (void)Um; (void)Un;
+    (void)ws;
+}
diff --git a/runtime/parsec/codelets/codelet_zlaswp_batched.c b/runtime/parsec/codelets/codelet_zlaswp_batched.c
index aa8726690b25d23b6cdd3ea6ff525b9c36be12d3..011d42e8b2359ba7ffbfb9a8022b2c18c9b8e8e0 100644
--- a/runtime/parsec/codelets/codelet_zlaswp_batched.c
+++ b/runtime/parsec/codelets/codelet_zlaswp_batched.c
@@ -21,45 +21,57 @@
 void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options,
                                  int                     m0,
                                  int                     minmn,
-                                 int                     k,
-                                 int                     m,
-                                 int                     n,
                                  void                   *ws,
                                  const CHAM_ipiv_t      *ipiv,
                                  int                     ipivk,
-                                 const CHAM_desc_t      *A,
-                                 const CHAM_desc_t      *Wu,
+                                 const CHAM_desc_t      *Am,
+                                 int                     Amm,
+                                 int                     Amn,
+                                 const CHAM_desc_t      *Ak,
+                                 int                     Akm,
+                                 int                     Akn,
+                                 const CHAM_desc_t      *U,
+                                 int                     Um,
+                                 int                     Un,
                                  void                  **clargs_ptr )
 {
     (void)options;
     (void)m0;
     (void)minmn;
-    (void)k;
-    (void)m;
-    (void)n;
     (void)ws;
     (void)ipiv;
     (void)ipivk;
-    (void)A;
-    (void)Wu;
+    (void)Am;
+    (void)Amm;
+    (void)Amn;
+    (void)Ak;
+    (void)Akm;
+    (void)Akn;
+    (void)U;
+    (void)Um;
+    (void)Un;
     (void)clargs_ptr;
 }
 
 void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options,
-                                       int                     k,
-                                       int                     n,
                                        const CHAM_ipiv_t      *ipiv,
                                        int                     ipivk,
-                                       const CHAM_desc_t      *A,
+                                       const CHAM_desc_t      *Ak,
+                                       int                     Akm,
+                                       int                     Akn,
                                        const CHAM_desc_t      *U,
+                                       int                     Um,
+                                       int                     Un,
                                        void                  **clargs_ptr )
 {
     (void)options;
-    (void)k;
-    (void)n;
     (void)ipiv;
     (void)ipivk;
-    (void)A;
+    (void)Ak;
+    (void)Akm;
+    (void)Akn;
     (void)U;
+    (void)Um;
+    (void)Un;
     (void)clargs_ptr;
 }
diff --git a/runtime/parsec/codelets/codelet_zperm_allreduce.c b/runtime/parsec/codelets/codelet_zperm_allreduce.c
new file mode 100644
index 0000000000000000000000000000000000000000..30890f8114b857b7c12804c526f4aa4c875b63a1
--- /dev/null
+++ b/runtime/parsec/codelets/codelet_zperm_allreduce.c
@@ -0,0 +1,93 @@
+/**
+ *
+ * @file parsec/codelet_zperm_allreduce.c
+ *
+ * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon PaRSEC zperm_allreduce codelets (no-op implementations)
+ *
+ * @version 1.3.0
+ * @author Alycia Lisito
+ * @date 2024-06-11
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_parsec.h"
+#include "chameleon/tasks_z.h"
+
+/**
+ * PaRSEC version of INSERT_TASK_zperm_allreduce_send_A.
+ * It performs no operation: every argument is discarded.
+ */
+void
+INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options,
+                                    CHAM_desc_t *A, int Am, int An,
+                                    int myrank, int np,
+                                    int *proc_involved )
+{
+    /* The (void) casts mark the parameters as intentionally unused. */
+    (void)options;
+    (void)A; (void)Am; (void)An;
+    (void)myrank;
+    (void)np;
+    (void)proc_involved;
+}
+
+/**
+ * PaRSEC version: performs no operation, every argument is discarded.
+ */
+void
+INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options,
+                                       CHAM_ipiv_t *ipiv, int ipivk,
+                                       int myrank, int np,
+                                       int *proc_involved )
+{
+    /* The (void) casts mark the parameters as intentionally unused. */
+    (void)options;
+    (void)ipiv; (void)ipivk;
+    (void)myrank; (void)np;
+    (void)proc_involved;
+}
+
+/**
+ * PaRSEC version: performs no operation, every argument is discarded.
+ */
+void
+INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
+                                       CHAM_ipiv_t *ipiv, int ipivk,
+                                       const CHAM_desc_t *A,
+                                       int k, int n )
+{
+    /* The (void) casts mark the parameters as intentionally unused. */
+    (void)options;
+    (void)ipiv; (void)ipivk;
+    (void)A;
+    (void)k; (void)n;
+}
+
+/**
+ * PaRSEC version of INSERT_TASK_zperm_allreduce.
+ * It performs no operation: every argument is discarded.
+ */
+void
+INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
+                             const CHAM_desc_t      *A,
+                             CHAM_ipiv_t            *ipiv,
+                             int                     ipivk,
+                             int k, int n,
+                             CHAM_desc_t            *U,
+                             int Um, int Un,
+                             void                   *ws )
+{
+    /* The (void) casts mark the parameters as intentionally unused. */
+    (void)options;
+    (void)A;
+    (void)ipiv; (void)ipivk;
+    (void)k; (void)n;
+    (void)U;
+    (void)Um; (void)Un;
+    (void)ws;
+}
diff --git a/runtime/quark/codelets/codelet_zlaswp_batched.c b/runtime/quark/codelets/codelet_zlaswp_batched.c
index f96414f27d29f448b7856d1e913e42cc4e15fcff..9ec2148fbe51cbf9cd168c033ac673add97141a2 100644
--- a/runtime/quark/codelets/codelet_zlaswp_batched.c
+++ b/runtime/quark/codelets/codelet_zlaswp_batched.c
@@ -21,45 +21,57 @@
 void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options,
                                  int                     m0,
                                  int                     minmn,
-                                 int                     k,
-                                 int                     m,
-                                 int                     n,
                                  void                   *ws,
                                  const CHAM_ipiv_t      *ipiv,
                                  int                     ipivk,
-                                 const CHAM_desc_t      *A,
-                                 const CHAM_desc_t      *Wu,
+                                 const CHAM_desc_t      *Am,
+                                 int                     Amm,
+                                 int                     Amn,
+                                 const CHAM_desc_t      *Ak,
+                                 int                     Akm,
+                                 int                     Akn,
+                                 const CHAM_desc_t      *U,
+                                 int                     Um,
+                                 int                     Un,
                                  void                  **clargs_ptr )
 {
     (void)options;
     (void)m0;
     (void)minmn;
-    (void)k;
-    (void)m;
-    (void)n;
     (void)ws;
     (void)ipiv;
     (void)ipivk;
-    (void)A;
-    (void)Wu;
+    (void)Am;
+    (void)Amm;
+    (void)Amn;
+    (void)Ak;
+    (void)Akm;
+    (void)Akn;
+    (void)U;
+    (void)Um;
+    (void)Un;
     (void)clargs_ptr;
 }
 
 void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options,
-                                       int                     k,
-                                       int                     n,
                                        const CHAM_ipiv_t      *ipiv,
                                        int                     ipivk,
-                                       const CHAM_desc_t      *A,
+                                       const CHAM_desc_t      *Ak,
+                                       int                     Akm,
+                                       int                     Akn,
                                        const CHAM_desc_t      *U,
+                                       int                     Um,
+                                       int                     Un,
                                        void                  **clargs_ptr )
 {
     (void)options;
-    (void)k;
-    (void)n;
     (void)ipiv;
     (void)ipivk;
-    (void)A;
+    (void)Ak;
+    (void)Akm;
+    (void)Akn;
     (void)U;
+    (void)Um;
+    (void)Un;
     (void)clargs_ptr;
 }
diff --git a/runtime/quark/codelets/codelet_zperm_allreduce.c b/runtime/quark/codelets/codelet_zperm_allreduce.c
new file mode 100644
index 0000000000000000000000000000000000000000..52281451dd038a9276a2040b9f4c08f7effa63f7
--- /dev/null
+++ b/runtime/quark/codelets/codelet_zperm_allreduce.c
@@ -0,0 +1,93 @@
+/**
+ *
+ * @file quark/codelet_zperm_allreduce.c
+ *
+ * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon Quark zperm_allreduce codelets (no-op implementations)
+ *
+ * @version 1.3.0
+ * @author Alycia Lisito
+ * @date 2024-06-11
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_quark.h"
+#include "chameleon/tasks_z.h"
+
+/**
+ * Quark version of INSERT_TASK_zperm_allreduce_send_A.
+ * It performs no operation: every argument is discarded.
+ */
+void
+INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options,
+                                    CHAM_desc_t *A, int Am, int An,
+                                    int myrank, int np,
+                                    int *proc_involved )
+{
+    /* The (void) casts mark the parameters as intentionally unused. */
+    (void)options;
+    (void)A; (void)Am; (void)An;
+    (void)myrank;
+    (void)np;
+    (void)proc_involved;
+}
+
+/**
+ * Quark version: performs no operation, every argument is discarded.
+ */
+void
+INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options,
+                                       CHAM_ipiv_t *ipiv, int ipivk,
+                                       int myrank, int np,
+                                       int *proc_involved )
+{
+    /* The (void) casts mark the parameters as intentionally unused. */
+    (void)options;
+    (void)ipiv; (void)ipivk;
+    (void)myrank; (void)np;
+    (void)proc_involved;
+}
+
+/**
+ * Quark version: performs no operation, every argument is discarded.
+ */
+void
+INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
+                                       CHAM_ipiv_t *ipiv, int ipivk,
+                                       const CHAM_desc_t *A,
+                                       int k, int n )
+{
+    /* The (void) casts mark the parameters as intentionally unused. */
+    (void)options;
+    (void)ipiv; (void)ipivk;
+    (void)A;
+    (void)k; (void)n;
+}
+
+/**
+ * Quark version of INSERT_TASK_zperm_allreduce.
+ * It performs no operation: every argument is discarded.
+ */
+void
+INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
+                             const CHAM_desc_t      *A,
+                             CHAM_ipiv_t            *ipiv,
+                             int                     ipivk,
+                             int k, int n,
+                             CHAM_desc_t            *U,
+                             int Um, int Un,
+                             void                   *ws )
+{
+    /* The (void) casts mark the parameters as intentionally unused. */
+    (void)options;
+    (void)A;
+    (void)ipiv; (void)ipivk;
+    (void)k; (void)n;
+    (void)U;
+    (void)Um; (void)Un;
+    (void)ws;
+}
diff --git a/runtime/starpu/codelets/codelet_zlaswp.c b/runtime/starpu/codelets/codelet_zlaswp.c
index ade365c68ff52757a11b9c8077d14ce28e7208d0..96d3108a89b74e67fb31d892b974b0d2d1d7e3a7 100644
--- a/runtime/starpu/codelets/codelet_zlaswp.c
+++ b/runtime/starpu/codelets/codelet_zlaswp.c
@@ -47,6 +47,9 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
                              const CHAM_desc_t *U, int Um, int Un )
 {
     struct starpu_codelet *codelet = &cl_zlaswp_get;
+    if ( A->get_rankof( A, Am, An) != A->myrank ) {
+        return;
+    }
 
     //void (*callback)(void*) = options->profiling ? cl_zlaswp_get_callback : NULL;
 
@@ -91,6 +94,9 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
                              const CHAM_desc_t *B, int Bm, int Bn )
 {
     struct starpu_codelet *codelet = &cl_zlaswp_set;
+    if ( B->get_rankof( B, Bm, Bn) != B->myrank ) { /* owner computed with B's own mapping */
+        return; /* B(Bm, Bn) is not owned by this rank */
+    }
 
     //void (*callback)(void*) = options->profiling ? cl_zlaswp_set_callback : NULL;
 
diff --git a/runtime/starpu/codelets/codelet_zlaswp_batched.c b/runtime/starpu/codelets/codelet_zlaswp_batched.c
index 6af43659c2768c2443684de411297fab9a68e003..b17f26a486dc87e5d8dcb807369bfa431e809b06 100644
--- a/runtime/starpu/codelets/codelet_zlaswp_batched.c
+++ b/runtime/starpu/codelets/codelet_zlaswp_batched.c
@@ -57,21 +57,25 @@ CODELETS_CPU( zlaswp_batched, cl_zlaswp_batched_cpu_func )
 void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options,
                                  int                     m0,
                                  int                     minmn,
-                                 int                     k,
-                                 int                     m,
-                                 int                     n,
                                  void                   *ws,
                                  const CHAM_ipiv_t      *ipiv,
                                  int                     ipivk,
-                                 const CHAM_desc_t      *A,
-                                 const CHAM_desc_t      *Wu,
+                                 const CHAM_desc_t      *Am,
+                                 int                     Amm,
+                                 int                     Amn,
+                                 const CHAM_desc_t      *Ak,
+                                 int                     Akm,
+                                 int                     Akn,
+                                 const CHAM_desc_t      *U,
+                                 int                     Um,
+                                 int                     Un,
                                  void                  **clargs_ptr )
 {
     int task_num   = 0;
     int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size;
     int nhandles;
     struct cl_laswp_batched_args_t *clargs = *clargs_ptr;
-    if ( A->get_rankof( A, m, n) != A->myrank ) {
+    if ( Am->get_rankof( Am, Amm, Amn) != Am->myrank ) {
         return;
     }
 
@@ -84,7 +88,7 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options,
 
     task_num               = clargs->tasks_nbr;
     clargs->m0[ task_num ] = m0;
-    clargs->handle_mode[ task_num ].handle = RTBLKADDR(A, CHAMELEON_Complex64_t, m, n);
+    clargs->handle_mode[ task_num ].handle = RTBLKADDR(Am, CHAMELEON_Complex64_t, Amm, Amn);
     clargs->handle_mode[ task_num ].mode   = STARPU_RW;
     clargs->tasks_nbr ++;
 
@@ -95,8 +99,8 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options,
             STARPU_CL_ARGS,             clargs, sizeof(struct cl_laswp_batched_args_t),
             STARPU_R,                   RUNTIME_perm_getaddr( ipiv, ipivk ),
             STARPU_R,                   RUNTIME_invp_getaddr( ipiv, ipivk ),
-            STARPU_RW | STARPU_COMMUTE, RTBLKADDR(Wu, ChamComplexDouble, A->myrank, n),
-            STARPU_R,                   RTBLKADDR(A, ChamComplexDouble, k, n),
+            STARPU_RW | STARPU_COMMUTE, RTBLKADDR(U, ChamComplexDouble, Um, Un),
+            STARPU_R,                   RTBLKADDR(Ak, ChamComplexDouble, Akm, Akn),
             STARPU_DATA_MODE_ARRAY,     clargs->handle_mode, nhandles,
             STARPU_PRIORITY,            options->priority,
             STARPU_EXECUTE_ON_WORKER,   options->workerid,
@@ -108,12 +112,14 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options,
 }
 
 void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options,
-                                       int                     k,
-                                       int                     n,
                                        const CHAM_ipiv_t      *ipiv,
                                        int                     ipivk,
-                                       const CHAM_desc_t      *A,
+                                       const CHAM_desc_t      *Ak,
+                                       int                     Akm,
+                                       int                     Akn,
                                        const CHAM_desc_t      *U,
+                                       int                     Um,
+                                       int                     Un,
                                        void                  **clargs_ptr )
 {
     struct cl_laswp_batched_args_t *clargs   = *clargs_ptr;
@@ -129,8 +135,8 @@ void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options,
         STARPU_CL_ARGS,             clargs, sizeof(struct cl_laswp_batched_args_t),
         STARPU_R,                   RUNTIME_perm_getaddr( ipiv, ipivk ),
         STARPU_R,                   RUNTIME_invp_getaddr( ipiv, ipivk ),
-        STARPU_RW | STARPU_COMMUTE, RTBLKADDR(U, ChamComplexDouble, k, n),
-        STARPU_R,                   RTBLKADDR(A, ChamComplexDouble, k, n),
+        STARPU_RW | STARPU_COMMUTE, RTBLKADDR(U, ChamComplexDouble, Um, Un),
+        STARPU_R,                   RTBLKADDR(Ak, ChamComplexDouble, Akm, Akn),
         STARPU_DATA_MODE_ARRAY,     clargs->handle_mode, nhandles,
         STARPU_PRIORITY,            options->priority,
         STARPU_EXECUTE_ON_WORKER,   options->workerid,
diff --git a/runtime/starpu/codelets/codelet_zperm_allreduce.c b/runtime/starpu/codelets/codelet_zperm_allreduce.c
new file mode 100644
index 0000000000000000000000000000000000000000..ab9cf702294f7a54348b8d7995f45aca5afc32e3
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_zperm_allreduce.c
@@ -0,0 +1,307 @@
+/**
+ *
+ * @file starpu/codelet_zperm_allreduce.c
+ *
+ * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon StarPU codelets for the zperm_allreduce reduction
+ *
+ * @version 1.3.0
+ * @author Alycia Lisito
+ * @date 2024-06-11
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_starpu_internal.h"
+#include "runtime_codelet_z.h"
+#include <coreblas/cblas_wrapper.h>
+
+#if defined(CHAMELEON_USE_MPI)
+struct cl_redux_args_t {
+    int tempmm;  /* number of perm entries scanned by the CPU kernel */
+    int n;       /* tile column index n given to INSERT_TASK_zperm_allreduce */
+    int p;       /* A->p of the factorized matrix descriptor */
+    int q;       /* A->q of the factorized matrix descriptor */
+    int p_first; /* proc_involved[0]; the kernel divides it by q */
+    int me;      /* A->myrank; the kernel divides it by q */
+    int shift;   /* shift of the current all-reduce round */
+    int np_inv;  /* number of involved ranks (np_involved) */
+};
+
+/* CPU kernel: copies strided vector i of descr[1] into descr[0] when the owner of perm[i] is in the window. */
+static void
+cl_zperm_allreduce_cpu_func( void *descr[], void *cl_arg )
+{
+    struct cl_redux_args_t      *args = (struct cl_redux_args_t *) cl_arg;
+    const CHAM_tile_t           *tdst = cti_interface_get( descr[0] );
+    const CHAM_tile_t           *tsrc = cti_interface_get( descr[1] );
+    const int                   *perm = (int *)STARPU_VECTOR_GET_PTR( descr[2] );
+    CHAMELEON_Complex64_t       *dst  = CHAM_tile_get_ptr( tdst );
+    const CHAMELEON_Complex64_t *src  = CHAM_tile_get_ptr( tsrc );
+    int p = args->p, q = args->q, np = args->np_inv;
+    int p_first = args->p_first / q;
+    /* When p > np, indices are shifted by -p_first modulo p. */
+    int wrap = ( p > np );
+    int me   = args->me / q;
+    int first, last, i;
+
+    if ( wrap ) {
+        me = ( me - p_first + p ) % p;
+    }
+    first = me - 2 * args->shift + 1;
+    last  = me -     args->shift;
+
+    for ( i = 0; i < args->tempmm; i++ ) {
+        int m      = perm[ i ] / tdst->m;
+        int ownerp = ( (m % p) * q + (args->n % q) ) / q;
+        if ( wrap ) {
+            ownerp = ( ownerp - p_first + p ) % p;
+        }
+        /* Skip entries whose owner is outside both [first, last] and [first+np, last+np]. */
+        if ( ( (ownerp < first) || (last < ownerp) ) && ( (ownerp < first+np) || (last+np < ownerp) ) ) {
+            continue;
+        }
+        cblas_zcopy( tdst->n, src + i, tsrc->ld, dst + i, tdst->ld );
+    }
+}
+
+CODELETS_CPU( zperm_allreduce, cl_zperm_allreduce_cpu_func )
+
+/* Inserts a codelet-less StarPU task reading U(me, n) with dst as its execution node. */
+static void
+INSERT_TASK_zperm_allreduce_send( const RUNTIME_option_t *options,
+                                  CHAM_desc_t *U, int me, int dst, int n )
+{
+    /*
+     * NOTE(review): no codelet is given (NULL); presumably the task only makes
+     * StarPU-MPI send U(me, n) to node dst - to be confirmed.
+     */
+    rt_starpu_insert_task( NULL,
+                           STARPU_EXECUTE_ON_NODE, dst,
+                           STARPU_R,               RTBLKADDR(U, CHAMELEON_Complex64_t, me, n),
+                           STARPU_PRIORITY,        options->priority, 0 );
+}
+
+/*
+ * Inserts the cl_zperm_allreduce task that accesses U(me, n) in read-write
+ * mode, U(src, n) and the perm data of panel ipivk in read-only mode, with
+ * me as execution node. The StarPU-MPI communication cache is then flushed
+ * for U(src, n).
+ */
+static void
+INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options,
+                                  CHAM_desc_t *U, CHAM_ipiv_t *ipiv, int ipivk,
+                                  int me, int src, int n, int tempmm,
+                                  int p, int q, int shift, int np, int p_first )
+{
+    /* Heap copy of the scalar arguments. NOTE(review): the malloc result is not checked. */
+    struct cl_redux_args_t *clargs = malloc( sizeof( struct cl_redux_args_t ) );
+
+    clargs->me      = me;
+    clargs->n       = n;
+    clargs->tempmm  = tempmm;
+    clargs->p       = p;
+    clargs->q       = q;
+    clargs->p_first = p_first;
+    clargs->shift   = shift;
+    clargs->np_inv  = np;
+
+    rt_starpu_insert_task(
+        &cl_zperm_allreduce,
+        STARPU_CL_ARGS,           clargs, sizeof(struct cl_redux_args_t),
+        STARPU_RW,                RTBLKADDR(U, CHAMELEON_Complex64_t, me,  n),
+        STARPU_R,                 RTBLKADDR(U, CHAMELEON_Complex64_t, src, n),
+        STARPU_R,                 RUNTIME_perm_getaddr( ipiv, ipivk ),
+        STARPU_EXECUTE_ON_NODE,   me,
+        STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_PRIORITY,          options->priority,
+        0 );
+
+    /* Flush the communication cache entry of the remote tile U(src, n). */
+    starpu_mpi_cache_flush( options->sequence->comm, RTBLKADDR(U, CHAMELEON_Complex64_t, src, n) );
+}
+
+/**
+ * Inserts the tasks of the all-reduce on U(myrank, n): at each round, one send towards
+ * proc_involved[me + shift] and one receive from proc_involved[me - shift] (modulo np_involved).
+ */
+void
+INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
+                             const CHAM_desc_t      *A,
+                             CHAM_ipiv_t            *ipiv,
+                             int                     ipivk,
+                             int                     k,
+                             int                     n,
+                             CHAM_desc_t            *U,
+                             int                     Um,
+                             int                     Un,
+                             void                   *ws )
+{
+    struct chameleon_pzgetrf_s *tmp = (struct chameleon_pzgetrf_s *)ws;
+    int *proc_involved = tmp->proc_involved;
+    int  np_involved   = chameleon_min( A->p, A->mt - k);
+    int  np_iter = np_involved, shift = 1;
+    int  p_recv, p_send, me, p_first = proc_involved[0];
+
+    (void)Um; (void)Un; /* Um and Un are not used by this implementation. */
+    if ( np_involved == 1 ) {
+        assert( proc_involved[0] == A->myrank );
+        return;
+    }
+    for( me = 0; me < np_involved; me++ ) {
+        if ( proc_involved[me] == A->myrank ) {
+            break;
+        }
+    }
+    assert( me < np_involved );
+    while ( np_iter > 1 ) {
+        p_send = proc_involved[ ( me + shift               ) % np_involved ];
+        p_recv = proc_involved[ ( me - shift + np_involved ) % np_involved ];
+        INSERT_TASK_zperm_allreduce_send( options, U, A->myrank, p_send, n );
+        INSERT_TASK_zperm_allreduce_recv( options, U, ipiv, ipivk, A->myrank, p_recv,
+                                          n, k == (A->mt-1) ? A->m - k * A->mb : A->mb,
+                                          A->p, A->q, shift, np_involved, p_first );
+        shift   = shift << 1;
+        np_iter = chameleon_ceil( np_iter, 2 );
+    }
+}
+
+/**
+ * Calls starpu_mpi_get_data_on_node_detached() on tile A(Am, An) once for each
+ * rank of proc_involved[0..np-1] other than myrank, with that rank as target node.
+ */
+void
+INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options,
+                                    CHAM_desc_t *A, int Am, int An,
+                                    int myrank, int np, int *proc_involved )
+{
+    int p;
+
+    for ( p = 0; p < np; p ++ ) {
+        if ( proc_involved[ p ] == myrank ) {
+            continue;
+        }
+        starpu_mpi_get_data_on_node_detached( options->sequence->comm,
+                                              RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
+                                              proc_involved[ p ], NULL, NULL );
+    }
+}
+
+/**
+ * Calls starpu_mpi_get_data_on_node_detached() on the perm data of panel ipivk once
+ * for each rank of proc_involved[0..np-1] other than myrank, with that rank as target.
+ */
+void
+INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options,
+                                       CHAM_ipiv_t *ipiv, int ipivk,
+                                       int myrank, int np, int *proc_involved )
+{
+    int idx;
+
+    for ( idx = 0; idx < np; idx++ ) {
+        int target = proc_involved[ idx ];
+        if ( target != myrank ) {
+            starpu_mpi_get_data_on_node_detached( options->sequence->comm,
+                                                  RUNTIME_perm_getaddr( ipiv, ipivk ), target, NULL, NULL );
+        }
+    }
+}
+
+/**
+ * Calls starpu_mpi_get_data_on_node_detached() on the invp data of panel ipivk for the
+ * owner of each tile A(b, n), b in [k+1, min(A->mt, k+1+A->p)), skipping the local rank.
+ */
+void
+INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
+                                       CHAM_ipiv_t *ipiv, int ipivk,
+                                       const CHAM_desc_t *A, int k, int n )
+{
+    int last = chameleon_min( A->mt, k + 1 + A->p ); /* exclusive upper bound on b */
+    int b;
+
+    for ( b = k + 1; b < last; b++ ) {
+        int owner = A->get_rankof( A, b, n );
+        if ( owner != A->myrank ) {
+            starpu_mpi_get_data_on_node_detached( options->sequence->comm,
+                                                  RUNTIME_invp_getaddr( ipiv, ipivk ), owner, NULL, NULL );
+        }
+    }
+}
+#else
+/**
+ * Version compiled when CHAMELEON_USE_MPI is not defined.
+ * It performs no operation: every argument is discarded.
+ */
+void
+INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options,
+                                    CHAM_desc_t *A, int Am, int An,
+                                    int myrank, int np,
+                                    int *proc_involved )
+{
+    /* The (void) casts mark the parameters as intentionally unused. */
+    (void)options;
+    (void)A; (void)Am; (void)An;
+    (void)myrank;
+    (void)np;
+    (void)proc_involved;
+}
+
+/**
+ * Non-MPI version: performs no operation, every argument is discarded.
+ */
+void
+INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options,
+                                       CHAM_ipiv_t *ipiv, int ipivk,
+                                       int myrank, int np,
+                                       int *proc_involved )
+{
+    /* The (void) casts mark the parameters as intentionally unused. */
+    (void)options;
+    (void)ipiv; (void)ipivk;
+    (void)myrank; (void)np;
+    (void)proc_involved;
+}
+
+/**
+ * Non-MPI version: performs no operation, every argument is discarded.
+ */
+void
+INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
+                                       CHAM_ipiv_t *ipiv, int ipivk,
+                                       const CHAM_desc_t *A,
+                                       int k, int n )
+{
+    /* The (void) casts mark the parameters as intentionally unused. */
+    (void)options;
+    (void)ipiv; (void)ipivk;
+    (void)A;
+    (void)k; (void)n;
+}
+
+/**
+ * Version compiled when CHAMELEON_USE_MPI is not defined.
+ * It performs no operation: every argument is discarded.
+ */
+void
+INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
+                             const CHAM_desc_t      *A,
+                             CHAM_ipiv_t            *ipiv,
+                             int                     ipivk,
+                             int k, int n,
+                             CHAM_desc_t            *U,
+                             int Um, int Un,
+                             void                   *ws )
+{
+    /* The (void) casts mark the parameters as intentionally unused. */
+    (void)options;
+    (void)A;
+    (void)ipiv; (void)ipivk;
+    (void)k; (void)n;
+    (void)U;
+    (void)Um; (void)Un;
+    (void)ws;
+}
+#endif