diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h
index 795ebd2d186f9c1e88a44ab6312d40583b1a4d5d..93a5f6e303c8b00076e78fbe7faf58fc59dfe4f7 100644
--- a/include/chameleon/tasks_z.h
+++ b/include/chameleon/tasks_z.h
@@ -562,4 +562,12 @@ void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options,
                                       CHAM_desc_t *U, int Um, int Un,
                                       CHAM_ipiv_t *ws );
 
+void INSERT_TASK_zipiv_allreduce( CHAM_desc_t            *A,             /* descriptor; the StarPU implementation reads myrank, and p/mt in the MPI build */
+                                  const RUNTIME_option_t *options,       /* forwarded to the send/recv task insertions in the MPI build, unused otherwise */
+                                  CHAM_ipiv_t            *ipiv,          /* pivot structure whose per-(rank, k, h) handles are accessed */
+                                  int                    *proc_involved, /* ranks taking part in the exchange; only read in the MPI build */
+                                  int                     k,             /* first index used to select the pivot handle */
+                                  int                     h,             /* second index used to select the pivot handle; the handle of h-1 is invalidated when h > 0 */
+                                  int                     n );           /* element count handed to the reduction kernel in the MPI build */
+
 #endif /* _chameleon_tasks_z_h_ */
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index f7203fbe500d517ea64251ea198600944ce9291c..08279345b7f5d95ba633f3143f7c1b39fe2d6352 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -24,6 +24,7 @@
 #  @author Florent Pruvost
 #  @author Philippe Virouleau
 #  @author Matthieu Kuhn
+#  @author Alycia Lisito
 #  @date 2024-03-16
 #
 ###
@@ -73,6 +74,7 @@ set(CODELETS_ZSRC
     codelets/codelet_zhe2ge.c
     codelets/codelet_zherfb.c
     codelets/codelet_zhessq.c
+    codelets/codelet_zipiv_allreduce.c
     codelets/codelet_zlacpy.c
     codelets/codelet_zlange.c
     codelets/codelet_zlanhe.c
diff --git a/runtime/starpu/codelets/codelet_zipiv_allreduce.c b/runtime/starpu/codelets/codelet_zipiv_allreduce.c
new file mode 100644
index 0000000000000000000000000000000000000000..9856258bba33499b06156fa83c2ceea00e0f6868
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_zipiv_allreduce.c
@@ -0,0 +1,169 @@
+/**
+ *
+ * @file starpu/codelet_zipiv_allreduce.c
+ *
+ * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon StarPU codelets to do the all-reduce of the pivot structure
+ *
+ * @version 1.3.0
+ * @author Alycia Lisito
+ * @date 2024-06-11
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_starpu.h"
+#include "runtime_codelet_z.h"
+#include <coreblas/cblas_wrapper.h>
+
+#if defined ( CHAMELEON_USE_MPI )
+struct cl_redux_args_t { /* Arguments handed to cl_zipiv_allreduce_cpu_func through cl_arg */
+    int h; /* index in the pivot rows of the two entries whose magnitudes are compared */
+    int n; /* number of elements copied by the cblas_zcopy calls */
+    int k; /* filled by the recv task insertion; not read by the CPU kernel in this file */
+};
+
+static void cl_zipiv_allreduce_cpu_func( void *descr[], void *cl_arg )
+{   /* Merges the pivot candidate of descr[1] (input) into the one of descr[0] (updated in place); cl_arg is a struct cl_redux_args_t. */
+    struct cl_redux_args_t *clargs      = (struct cl_redux_args_t *) cl_arg;
+    cppi_interface_t       *cppi_me     = ((cppi_interface_t *) descr[0]);
+    cppi_interface_t       *cppi_src    = ((cppi_interface_t *) descr[1]);
+    CHAM_pivot_t           *nextpiv_me  = &(cppi_me->pivot);
+    CHAM_pivot_t           *nextpiv_src = &(cppi_src->pivot);
+    CHAMELEON_Complex64_t  *pivrow_me   = (CHAMELEON_Complex64_t *)(nextpiv_me->pivrow);
+    CHAMELEON_Complex64_t  *pivrow_src  = (CHAMELEON_Complex64_t *)(nextpiv_src->pivrow);
+
+    cppi_display_dbg( cppi_me,  stderr, "Global redux Inout: ");
+    cppi_display_dbg( cppi_src, stderr, "Global redux Input: ");
+    /* Both interfaces must agree on n, h, element type and array size before being merged */
+    assert( cppi_me->n         == cppi_src->n         );
+    assert( cppi_me->h         == cppi_src->h         );
+    assert( cppi_me->flttype   == cppi_src->flttype   );
+    assert( cppi_me->arraysize == cppi_src->arraysize );
+    /* Keep the candidate whose entry at index h has the larger modulus, along with its blkm0/blkidx */
+    if ( cabs( pivrow_src[ clargs->h ] ) > cabs( pivrow_me[ clargs->h ] ) ) {
+        nextpiv_me->blkm0  = nextpiv_src->blkm0;
+        nextpiv_me->blkidx = nextpiv_src->blkidx;
+        cblas_zcopy( clargs->n, pivrow_src, 1, pivrow_me, 1 );
+    }
+
+    /* Let's copy the diagonal row if needed: the source has it (1) and the destination does not yet (-1) */
+    if ( ( cppi_src->has_diag == 1 ) &&
+         ( cppi_me->has_diag  == -1 ) )
+    {
+        assert( cppi_src->arraysize == clargs->n * sizeof(CHAMELEON_Complex64_t) ); /* check the size before copying, not after */
+        cblas_zcopy( clargs->n, nextpiv_src->diagrow, 1, nextpiv_me->diagrow, 1 );
+        cppi_me->has_diag = 1;
+    }
+
+    cppi_display_dbg( cppi_me,  stderr, "Global redux Inout(After): ");
+}
+
+CODELETS_CPU( zipiv_allreduce, cl_zipiv_allreduce_cpu_func ) /* presumably defines the cl_zipiv_allreduce codelet referenced by the recv task insertion - TODO confirm against the macro */
+
+void
+INSERT_TASK_zipiv_allreduce_send( CHAM_ipiv_t *ipiv,
+                                  int          me,
+                                  int          dst,
+                                  int          k,
+                                  int          h,
+                                  const RUNTIME_option_t *options )
+{   /* Inserts a codelet-less task that reads the (me, k, h) pivot handle and is set to execute on node dst. */
+    rt_starpu_insert_task(
+        NULL, /* no codelet - NOTE(review): presumably only triggers the transfer of the handle to dst, confirm */
+        STARPU_EXECUTE_ON_NODE, dst,
+        STARPU_R,               RUNTIME_pivot_getaddr( ipiv, me, k, h ),
+        STARPU_PRIORITY,        options->priority,
+        0 );
+} /* NOTE(review): no prototype for this helper is visible in this patch; check whether it should be static */
+
+void
+INSERT_TASK_zipiv_allreduce_recv( CHAM_ipiv_t *ipiv,
+                                  int          me,
+                                  int          src,
+                                  int          k,
+                                  int          h,
+                                  int          n,
+                                  const RUNTIME_option_t *options )
+{   /* Inserts the cl_zipiv_allreduce task that merges the (src, k, h) pivot (read) into the (me, k, h) pivot (read-write) on node me. */
+    struct cl_redux_args_t *clargs;
+    clargs = malloc( sizeof( struct cl_redux_args_t ) ); /* NOTE(review): the result is not checked against NULL before being written */
+    clargs->h = h;
+    clargs->n = n;
+    clargs->k = k;
+    /* NOTE(review): clargs is not freed in this function; presumably STARPU_CL_ARGS hands ownership to StarPU - confirm */
+    rt_starpu_insert_task(
+        &cl_zipiv_allreduce,
+        STARPU_CL_ARGS,           clargs, sizeof(struct cl_redux_args_t),
+        STARPU_RW,                RUNTIME_pivot_getaddr( ipiv, me,  k, h ),
+        STARPU_R,                 RUNTIME_pivot_getaddr( ipiv, src, k, h ),
+        STARPU_EXECUTE_ON_NODE,   me,
+        STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_PRIORITY,          options->priority,
+        0 );
+    starpu_mpi_cache_flush( options->sequence->comm, RUNTIME_pivot_getaddr( ipiv, src, k, h ) ); /* the src pivot handle is flushed from the StarPU-MPI cache once the task is inserted */
+}
+
+void INSERT_TASK_zipiv_allreduce( CHAM_desc_t            *A,
+                                  const RUNTIME_option_t *options,
+                                  CHAM_ipiv_t            *ipiv,
+                                  int                    *proc_involved,
+                                  int                     k,
+                                  int                     h,
+                                  int                     n )
+{   /* MPI variant: inserts the send/recv rounds exchanging the (k, h) pivot between this rank and the ranks listed in proc_involved. */
+    int np_involved   = chameleon_min( A->p, A->mt - k); /* number of entries of proc_involved that are used below */
+    int np_iter       = np_involved;
+    int p_recv, p_send, me;
+    int shift = 1;
+    /* Submit the invalidation of the local pivot handle of the previous index h-1 */
+    if ( h > 0 ) {
+        starpu_data_invalidate_submit( RUNTIME_pivot_getaddr( ipiv, A->myrank, k, h-1 ) );
+    }
+    if ( h >= ipiv->n ) { /* no exchange is inserted once h reaches ipiv->n */
+        return;
+    }
+
+    if ( np_involved == 1 ) {
+        assert( proc_involved[0] == A->myrank );
+    }
+    else {
+        for( me = 0; me < np_involved; me++ ) { /* find the position of this rank in proc_involved */
+            if ( proc_involved[me] == A->myrank ) {
+                break;
+            }
+        }
+        assert( me < np_involved );
+        while ( np_iter > 1 ) { /* one round per pass: shift doubles, np_iter is presumably halved rounded up by chameleon_ceil */
+            p_send = proc_involved[ ( me + shift               ) % np_involved ];
+            p_recv = proc_involved[ ( me - shift + np_involved ) % np_involved ];
+            /* Send the local pivot to p_send, then merge the pivot of p_recv into the local one */
+            INSERT_TASK_zipiv_allreduce_send( ipiv, A->myrank, p_send, k, h,    options );
+            INSERT_TASK_zipiv_allreduce_recv( ipiv, A->myrank, p_recv, k, h, n, options );
+
+            shift   = shift << 1;
+            np_iter = chameleon_ceil( np_iter, 2 );
+        }
+    }
+}
+#else
+void INSERT_TASK_zipiv_allreduce( CHAM_desc_t            *A,
+                                  const RUNTIME_option_t *options,
+                                  CHAM_ipiv_t            *ipiv,
+                                  int                    *proc_involved,
+                                  int                     k,
+                                  int                     h,
+                                  int                     n )
+{   /* Variant built without CHAMELEON_USE_MPI: only submits the invalidation of the local pivot handle of index h-1. */
+    if ( h > 0 ) {
+        starpu_data_invalidate_submit( RUNTIME_pivot_getaddr( ipiv, A->myrank, k, h-1 ) );
+    }
+    /* These parameters are not used in this variant; the casts silence unused-parameter warnings */
+    (void)options;
+    (void)proc_involved;
+    (void)n;
+}
+#endif
diff --git a/runtime/starpu/include/cppi_interface.h b/runtime/starpu/include/cppi_interface.h
index 7dbd10118c6bee637c9c49b6bc5bdf9d3fc008e2..8113c453fb2d344bda5972430e9e8337a3b0f880 100644
--- a/runtime/starpu/include/cppi_interface.h
+++ b/runtime/starpu/include/cppi_interface.h
@@ -12,6 +12,7 @@
  * @version 1.3.0
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
  * @date 2023-08-22
  *
  */
@@ -44,6 +45,7 @@ struct cppi_interface_s
 void cppi_interface_init();
 void cppi_interface_fini();
 
+CHAM_pivot_t *cppi_handle_get( starpu_data_handle_t handle );
 void cppi_register( starpu_data_handle_t *handleptr,
                     cham_flttype_t        flttype,
                     int                   n,
@@ -61,13 +63,14 @@ cppi_display_dbg( cppi_interface_t *cppi_interface, FILE *f, const char *title )
     diagrow = cppi_interface->pivot.diagrow;
     pivrow  = cppi_interface->pivot.pivrow;
 
-    fprintf( f, "%sn=%2d, h=%2d, has_diag=%2d, m0=%2d, idx=%2d\n",
+    fprintf( f, "%sn=%2d, h=%2d, has_diag=%2d, m0=%2d, idx=%2d, interf = %p\n",
              title,
              cppi_interface->n,
              cppi_interface->h,
              cppi_interface->has_diag,
              cppi_interface->pivot.blkm0,
-             cppi_interface->pivot.blkidx );
+             cppi_interface->pivot.blkidx,
+             cppi_interface );
 
     fprintf(stderr, "Diagonal row: " );
     for( i=0; i<cppi_interface->n; i++) {
diff --git a/runtime/starpu/interface/cppi_interface.c b/runtime/starpu/interface/cppi_interface.c
index 2d1754ec1cee030c040ee1c941cf283a6f58b284..6b1f8063180e78dbebf1ea443ee28f98920a7723 100644
--- a/runtime/starpu/interface/cppi_interface.c
+++ b/runtime/starpu/interface/cppi_interface.c
@@ -12,13 +12,14 @@
  * @version 1.3.0
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
  * @date 2023-08-22
  *
  */
 #include "chameleon_starpu.h"
 #undef HAVE_STARPU_REUSE_DATA_ON_NODE
 
-static inline CHAM_pivot_t *
+CHAM_pivot_t *
 cppi_handle_get( starpu_data_handle_t handle )
 {
     cppi_interface_t *cppi_interface = (cppi_interface_t *)
@@ -38,7 +39,7 @@ cppi_init( void *data_interface )
     cppi_interface_t *cppi_interface = (cppi_interface_t *)data_interface;
     cppi_interface->id = CPPI_INTERFACE_ID;
     cppi_interface->h  = -1;
-    cppi_interface->has_diag = 0;
+    cppi_interface->has_diag = -1;
 }
 
 static void
@@ -83,7 +84,7 @@ cppi_allocate_data_on_node( void *data_interface, unsigned node )
 
     /* update the data properly in consequence */
     cppi_interface->h = -1;
-    cppi_interface->has_diag = 0;
+    cppi_interface->has_diag = -1;
     cppi_interface->pivot.pivrow  = dataptr;
     cppi_interface->pivot.diagrow = ((char*)dataptr) + cppi_interface->arraysize;
 
@@ -279,8 +280,10 @@ cppi_describe( void *data_interface, char *buf, size_t size )
 {
     cppi_interface_t *cppi_interface = (cppi_interface_t *) data_interface;
 
-    return snprintf( buf, size, "Pivot structure, n %d, blkm0 %d, blkidx %d",
+    return snprintf( buf, size, "Pivot structure, n %d, h %d, has_diag = %d, blkm0 %d, blkidx %d",
                      cppi_interface->n,
+                     cppi_interface->h,
+                     cppi_interface->has_diag,
                      cppi_interface->pivot.blkm0,
                      cppi_interface->pivot.blkidx );
 }
@@ -298,6 +301,7 @@ cppi_copy_any_to_any( void *src_interface, unsigned src_node,
     STARPU_ASSERT( cppi_interface_src->flttype == cppi_interface_dst->flttype );
 
     cppi_interface_dst->h            = cppi_interface_src->h;
+    cppi_interface_dst->has_diag     = cppi_interface_src->has_diag;
     cppi_interface_dst->pivot.blkm0  = cppi_interface_src->pivot.blkm0;
     cppi_interface_dst->pivot.blkidx = cppi_interface_src->pivot.blkidx;
 
@@ -402,8 +406,8 @@ cl_cppi_redux_cpu_func(void *descr[], void *cl_arg)
     assert( cppi_redux->h == cppi_input->h );
 
     /* Let's copy the diagonal row if needed */
-    if ( cppi_input->has_diag ) {
-        assert( cppi_redux->has_diag == 0 );
+    if ( cppi_input->has_diag == 1 ) {
+        assert( cppi_redux->has_diag == -1 );
 
         memcpy( cppi_redux->pivot.diagrow,
                 cppi_input->pivot.diagrow,
@@ -449,7 +453,7 @@ cl_cppi_init_redux_cpu_func( void *descr[], void *cl_arg )
     cppi_interface_t *cppi_redux = ((cppi_interface_t *) descr[0]);
 
     /* Redux pivot never has diagonal at initialization */
-    cppi_redux->has_diag = 0;
+    cppi_redux->has_diag = -1;
     cppi_redux->h        = -1;
 
     size_t size = cppi_redux->arraysize;
@@ -497,7 +501,7 @@ cppi_register( starpu_data_handle_t *handleptr,
             .id = CPPI_INTERFACE_ID,
             .arraysize = n * CHAMELEON_Element_Size( flttype ),
             .flttype = flttype,
-            .has_diag = 0,
+            .has_diag = -1,
             .h  = -1,
             .n  = n,
         };