diff --git a/cmake_modules/local_subs.py b/cmake_modules/local_subs.py
index b90df480a365381097b434ad2d09ad414131b9af..afd17c16f2a60d1b5cb35616151072872fdb3de2 100644
--- a/cmake_modules/local_subs.py
+++ b/cmake_modules/local_subs.py
@@ -51,6 +51,7 @@ _extra_blas = [
     ('',                     'sprint',               'dprint',               'cprint',               'zprint'              ),
     ('',                     'sgered',               'dgered',               'cgered',               'zgered'              ),
     ('',                     'sgerst',               'dgerst',               'cgerst',               'zgerst'              ),
+    ('',                     'sipiv_allreduce',      'dipiv_allreduce',      'cipiv_allreduce',      'zipiv_allreduce'     ),
 ]
 
 _extra_BLAS = [ [ x.upper() for x in row ] for row in _extra_blas ]
diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c
index e6f3d107a911b0a29ed40fffe204f7f84ad5d259..a56e1c9a1c5220acf8ae09db0715d4d331e99ffe 100644
--- a/compute/pzgetrf.c
+++ b/compute/pzgetrf.c
@@ -16,6 +16,7 @@
  * @author Mathieu Faverge
  * @author Emmanuel Agullo
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
  * @date 2024-03-16
  * @precisions normal z -> s d c
  *
@@ -146,15 +147,13 @@ chameleon_pzgetrf_panel_facto_percol( struct chameleon_pzgetrf_s *ws,
                 ipiv );
         }
 
-        if ( h < minmn ) {
-            /* Reduce globally (between MPI processes) */
-            INSERT_TASK_ipiv_reducek( options, ipiv, k, h );
-        }
+        /* Reduce globally (between MPI processes) */
+        INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, h, tempkn );
     }
 
     /* Flush temporary data used for the pivoting */
     INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k );
-    RUNTIME_ipiv_flushk( options->sequence, ipiv, k );
+    RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank );
 }
 
 /*
@@ -195,17 +194,14 @@ chameleon_pzgetrf_panel_facto_percol_batched( struct chameleon_pzgetrf_s *ws,
         }
         INSERT_TASK_zgetrf_panel_offdiag_batched_flush( options, A, k, clargs, ipiv );
 
-        if ( h < minmn ) {
-            /* Reduce globally (between MPI processes) */
-            INSERT_TASK_ipiv_reducek( options, ipiv, k, h );
-        }
+        INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, h, tempkn );
     }
 
     free( clargs );
 
     /* Flush temporary data used for the pivoting */
     INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k );
-    RUNTIME_ipiv_flushk( options->sequence, ipiv, k );
+    RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank );
 }
 
 static inline void
@@ -218,6 +214,10 @@ chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws,
     int m, h, b, nbblock;
     int tempkm, tempkn, tempmm, minmn;
 
+    if ( ! ws->involved ) { /* NOTE(review): ws->involved is only assigned under CHAMELEON_USE_MPI */
+        return;             /* in chameleon_pzgetrf_panel_facto - confirm non-MPI builds initialise it */
+    }
+
     tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
     tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
     minmn  = chameleon_min( tempkm, tempkn );
@@ -233,7 +233,7 @@ chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws,
         int hmax = b == nbblock-1 ? minmn + 1 - b * ws->ib : ws->ib;
 
         for (h=0; h<hmax; h++){
-            int j =  h + b * ws->ib;
+            int j = h + b * ws->ib;
 
             INSERT_TASK_zgetrf_blocked_diag(
                 options,
@@ -250,26 +250,24 @@ chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws,
                     ipiv );
             }
 
-            if ( (b < (nbblock-1)) && (h == hmax-1) ) {
+            assert( j <= minmn );
+            /* Reduce globally (between MPI processes) */
+            INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, j, tempkn );
+
+            if ( ( b < (nbblock-1) ) && ( h == hmax-1 ) ) {
                 INSERT_TASK_zgetrf_blocked_trsm(
                     options,
-                    ws->ib, tempkn, b * ws->ib + hmax, ws->ib,
+                    ws->ib, tempkn, j+1, ws->ib,
                     Up(k, k),
                     ipiv );
             }
-
-            assert( j<= minmn );
-            if ( j < minmn ) {
-                /* Reduce globally (between MPI processes) */
-                INSERT_TASK_ipiv_reducek( options, ipiv, k, j );
-            }
         }
     }
     RUNTIME_data_flush( options->sequence, Up(k, k) );
 
     /* Flush temporary data used for the pivoting */
     INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k );
-    RUNTIME_ipiv_flushk( options->sequence, ipiv, k );
+    RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank );
 }
 
 /*
@@ -284,8 +282,8 @@ chameleon_pzgetrf_panel_facto_blocked_batched( struct chameleon_pzgetrf_s *ws,
 {
     int m, h, b, nbblock, hmax, j;
     int tempkm, tempkn, tempmm, minmn;
-    void **clargs = malloc( sizeof(char *) * A->p );
-    memset( clargs, 0, sizeof(char *) * A->p );
+    void **clargs = malloc( sizeof(char *) );
+    memset( clargs, 0, sizeof(char *) );
 
     tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
     tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
@@ -306,10 +304,7 @@ chameleon_pzgetrf_panel_facto_blocked_batched( struct chameleon_pzgetrf_s *ws,
         for ( h = 0; h < hmax; h++ ) {
             j =  h + b * ws->ib;
 
-            INSERT_TASK_zgetrf_panel_blocked_batched( options, tempkm, tempkn, j, k * A->mb, (void *)ws,
-                                                      A(k, k), Up(k, k), clargs, ipiv );
-
-            for ( m = k + 1; m < A->mt; m++ ) {
+            for ( m = k; m < A->mt; m++ ) {
                 tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb;
                 INSERT_TASK_zgetrf_panel_blocked_batched( options, tempmm, tempkn, j, m * A->mb,
                                                           (void *)ws, A(m, k), Up(k, k), clargs, ipiv );
@@ -317,6 +312,10 @@ chameleon_pzgetrf_panel_facto_blocked_batched( struct chameleon_pzgetrf_s *ws,
             INSERT_TASK_zgetrf_panel_blocked_batched_flush( options, A, k,
                                                             Up(k, k), clargs, ipiv );
 
+            assert( j <= minmn );
+            /* Reduce globally (between MPI processes) */
+            INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, j, tempkn );
+
             if ( (b < (nbblock-1)) && (h == hmax-1) ) {
                 INSERT_TASK_zgetrf_blocked_trsm(
                     options,
@@ -324,12 +323,6 @@ chameleon_pzgetrf_panel_facto_blocked_batched( struct chameleon_pzgetrf_s *ws,
                     Up(k, k),
                     ipiv );
             }
-
-            assert( j <= minmn );
-            if ( j < minmn ) {
-                /* Reduce globally (between MPI processes) */
-                INSERT_TASK_ipiv_reducek( options, ipiv, k, j );
-            }
         }
     }
 
@@ -337,7 +330,7 @@ chameleon_pzgetrf_panel_facto_blocked_batched( struct chameleon_pzgetrf_s *ws,
 
     /* Flush temporary data used for the pivoting */
     INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k );
-    RUNTIME_ipiv_flushk( options->sequence, ipiv, k );
+    RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank );
 }
 
 static inline void
@@ -347,6 +340,26 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws,
                                int                         k,
                                RUNTIME_option_t           *options )
 {
+#if defined ( CHAMELEON_USE_MPI )
+    int *proc_involved = malloc( sizeof( int ) * chameleon_min( A->p, A->mt - k) );
+    int  b;
+
+    /* List the owners of the first min(A->p, A->mt-k) tiles of panel k (2DBC only) */
+    ws->involved = 0;
+    for ( b = k; (b < A->mt) && ((b-k) < A->p); b++ ) {
+        int rank = chameleon_getrankof_2d( A, b, k );
+        proc_involved[ b-k ] = rank;
+        if ( rank == A->myrank ) {
+            ws->involved = 1;
+        }
+    }
+    ws->proc_involved = ws->involved ? proc_involved : NULL; /* never publish a list that is freed below */
+    if ( ws->involved == 0 ) {
+        free( proc_involved );
+        return;
+    }
+#endif
+
     /* TODO: Should be replaced by a function pointer */
     switch( ws->alg ) {
     case ChamGetrfNoPivPerColumn:
@@ -354,7 +367,7 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws,
         break;
 
     case ChamGetrfPPivPerColumn:
-        if ( ws->batch_size > 1 ) {
+        if ( ws->batch_size > 0 ) {
             chameleon_pzgetrf_panel_facto_percol_batched( ws, A, ipiv, k, options );
         }
         else {
@@ -363,7 +376,7 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws,
         break;
 
     case ChamGetrfPPiv:
-        if ( ws->batch_size > 1 ) {
+        if ( ws->batch_size > 0 ) {
             chameleon_pzgetrf_panel_facto_blocked_batched( ws, A, ipiv, k, options );
         }
         else {
@@ -376,6 +389,9 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws,
     default:
         chameleon_pzgetrf_panel_facto_nopiv( ws, A, ipiv, k, options );
     }
+#if defined ( CHAMELEON_USE_MPI )
+    free( proc_involved );
+#endif
 }
 
 /**
@@ -503,7 +519,9 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws,
          * block column k.
          */
         options.forcesub = chameleon_involved_in_panelk_2dbc( A, k );
-        chameleon_pzgetrf_panel_facto( ws, A, IPIV, k, &options );
+        if ( chameleon_involved_in_panelk_2dbc( A, k ) ) {
+            chameleon_pzgetrf_panel_facto( ws, A, IPIV, k, &options );
+        }
         options.forcesub = 0;
 
         for (n = k+1; n < A->nt; n++) {
diff --git a/compute/zgetrf.c b/compute/zgetrf.c
index a94a05551e554645e23311a20a0f1d2ef7794dd5..8fb6734d3e15fe2cc25fb9c1664db8bc9a0f6987 100644
--- a/compute/zgetrf.c
+++ b/compute/zgetrf.c
@@ -19,6 +19,8 @@
  * @author Florent Pruvost
  * @author Matthieu Kuhn
  * @author Lionel Eyraud-Dubois
+ * @author Alycia Lisito
+ * @author Xavier Lacoste
  * @date 2024-03-16
  *
  * @precisions normal z -> s d c
@@ -88,15 +90,11 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A )
         chameleon_cleanenv( algostr );
     }
 
-    ws->batch_size = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE", 1 );
+    ws->batch_size = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE", 0 );
     if ( ws->batch_size > CHAMELEON_BATCH_SIZE ) {
         chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE value\n" );
         ws->batch_size = CHAMELEON_BATCH_SIZE;
     }
-    if ( (ws->batch_size > 1) && (CHAMELEON_Comm_rank() > 1) ) {
-        chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE is unavailable in distributed, value forced to 1\n" );
-        ws->batch_size = 1;
-    }
 
     /* Allocation of U for permutation of the panels */
     if ( ws->alg == ChamGetrfNoPivPerColumn ) {
@@ -300,7 +298,7 @@ CHAMELEON_zgetrf( int M, int N, CHAMELEON_Complex64_t *A, int LDA, int *IPIV )
     if ( ( ws->alg == ChamGetrfPPivPerColumn ) ||
          ( ws->alg == ChamGetrfPPiv ) )
     {
-        chameleon_ipiv_destroy( &descIPIV );
+        chameleon_ipiv_destroy( &descIPIV, &descAt );
     }
     CHAMELEON_zgetrf_WS_Free( ws );
     chameleon_ztile2lap_cleanup( chamctxt, &descAl, &descAt );
diff --git a/control/compute_z.h b/control/compute_z.h
index 088e03140baff5b167727931a6fb9e6b7a1641f0..06c8854c1b05a7c1bffbca0fa8e615218363672d 100644
--- a/control/compute_z.h
+++ b/control/compute_z.h
@@ -48,6 +48,8 @@ struct chameleon_pzgetrf_s {
     int          batch_size; /**< Batch size for the panel    */
     CHAM_desc_t  U;
     CHAM_desc_t  Up;
+    int         *proc_involved; /**< Ranks owning the tiles of the current panel     */
+    unsigned int involved:1;    /**< Set when the local rank is in proc_involved     */
 };
 
 /**
diff --git a/control/descriptor.h b/control/descriptor.h
index 306abe6c5d320076eac9ed7c06aa82d10926aa46..1e0315fae2c70cdec40052e49a58b47c32a46ec9 100644
--- a/control/descriptor.h
+++ b/control/descriptor.h
@@ -20,6 +20,7 @@
  * @author Raphael Boucherie
  * @author Samuel Thibault
  * @author Lionel Eyraud-Dubois
+ * @author Alycia Lisito
  * @date 2023-08-22
  *
  */
@@ -77,7 +78,7 @@ void         chameleon_desc_destroy  ( CHAM_desc_t *desc );
 int          chameleon_desc_check    ( const CHAM_desc_t *desc );
 
 int chameleon_ipiv_init( CHAM_ipiv_t *ipiv, const CHAM_desc_t *desc, void *data );
-void chameleon_ipiv_destroy( CHAM_ipiv_t *ipiv );
+void chameleon_ipiv_destroy( CHAM_ipiv_t *ipiv, const CHAM_desc_t *desc );
 
 /**
  *  Internal function to return address of block (m,n) with m,n = block indices
diff --git a/control/descriptor_ipiv.c b/control/descriptor_ipiv.c
index e9631909b89689df5498c29da368298d8753bc40..c3369b7a4126ea0b245eb73ed2d3b547f7f11523 100644
--- a/control/descriptor_ipiv.c
+++ b/control/descriptor_ipiv.c
@@ -12,6 +12,8 @@
  * @version 1.3.0
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
+ * @author Florent Pruvost
  * @date 2024-03-16
  *
  ***
@@ -73,7 +75,7 @@ int chameleon_ipiv_init( CHAM_ipiv_t *ipiv, const CHAM_desc_t *desc, void *data
     ipiv->mt   = chameleon_ceil( ipiv->m, ipiv->mb );
 
     /* Create runtime specific structure like registering data */
-    RUNTIME_ipiv_create( ipiv );
+    RUNTIME_ipiv_create( ipiv, desc );
 
     return rc;
 }
@@ -91,9 +93,10 @@ int chameleon_ipiv_init( CHAM_ipiv_t *ipiv, const CHAM_desc_t *desc, void *data
  *          The pointer to the ipiv descriptor to destroy.
  *
  */
-void chameleon_ipiv_destroy( CHAM_ipiv_t *ipiv )
+void chameleon_ipiv_destroy( CHAM_ipiv_t       *ipiv,
+                             const CHAM_desc_t *desc )
 {
-    RUNTIME_ipiv_destroy( ipiv );
+    RUNTIME_ipiv_destroy( ipiv, desc );
 }
 
 /**
@@ -162,7 +165,8 @@ int CHAMELEON_Ipiv_Create( CHAM_ipiv_t **ipivptr, const CHAM_desc_t *desc, void
  * @retval CHAMELEON_SUCCESS successful exit
  *
  */
-int CHAMELEON_Ipiv_Destroy(CHAM_ipiv_t **ipivptr)
+int CHAMELEON_Ipiv_Destroy( CHAM_ipiv_t **ipivptr,
+                            const CHAM_desc_t *desc )
 {
     CHAM_context_t *chamctxt;
     CHAM_ipiv_t *ipiv;
@@ -179,7 +183,7 @@ int CHAMELEON_Ipiv_Destroy(CHAM_ipiv_t **ipivptr)
     }
 
     ipiv = *ipivptr;
-    chameleon_ipiv_destroy( ipiv );
+    chameleon_ipiv_destroy( ipiv, desc );
     free(ipiv);
     *ipivptr = NULL;
     return CHAMELEON_SUCCESS;
diff --git a/include/chameleon.h b/include/chameleon.h
index f1d33549595e475ee4bf60514bd08689d5416b40..12c295a7732ef73f1a1fac421bb38be6f0cdd9ea 100644
--- a/include/chameleon.h
+++ b/include/chameleon.h
@@ -18,6 +18,8 @@
  * @author Florent Pruvost
  * @author Philippe Virouleau
  * @author Lionel Eyraud-Dubois
+ * @author Alycia Lisito
+ * @author Loris Lucido
  * @date 2024-03-16
  *
  */
@@ -214,11 +216,16 @@ int  CHAMELEON_Recursive_Desc_Create( CHAM_desc_t **descptr, void *mat, cham_flt
                                       blkaddr_fct_t get_blkaddr, blkldd_fct_t get_blkldd,
                                       blkrankof_fct_t get_rankof, void* get_rankof_arg );
 
-int CHAMELEON_Ipiv_Create ( CHAM_ipiv_t **ipivptr, const CHAM_desc_t *desc, void *data );
-int CHAMELEON_Ipiv_Destroy( CHAM_ipiv_t **ipivptr );
+int CHAMELEON_Ipiv_Create ( CHAM_ipiv_t       **ipivptr,
+                            const CHAM_desc_t  *desc,
+                            void               *data );
+int CHAMELEON_Ipiv_Destroy( CHAM_ipiv_t       **ipivptr,
+                            const CHAM_desc_t  *desc );
 int CHAMELEON_Ipiv_Flush  ( const CHAM_ipiv_t        *ipiv,
                             const RUNTIME_sequence_t *sequence );
-int CHAMELEON_Ipiv_Gather( CHAM_ipiv_t *ipivdesc, int *ipiv, int root );
+int CHAMELEON_Ipiv_Gather( CHAM_ipiv_t *ipivdesc,
+                           int         *ipiv,
+                           int          root );
 void CHAMELEON_Ipiv_Print ( const CHAM_ipiv_t *ipiv );
 
 /**
diff --git a/include/chameleon/runtime.h b/include/chameleon/runtime.h
index e64390f6c2c16d3c6c730748be710075e6e70f21..52993c9a6a8130bc1727a74777511bd03a3f48f3 100644
--- a/include/chameleon/runtime.h
+++ b/include/chameleon/runtime.h
@@ -18,6 +18,7 @@
  * @author Samuel Thibault
  * @author Philippe Swartvagher
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
  * @date 2024-03-16
  *
  */
@@ -717,8 +718,10 @@ void RUNTIME_ddisplay_oneprofile (cham_tasktype_t task);
 void RUNTIME_sdisplay_allprofile ();
 void RUNTIME_sdisplay_oneprofile (cham_tasktype_t task);
 
-void RUNTIME_ipiv_create ( CHAM_ipiv_t *ipiv );
-void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv );
+void RUNTIME_ipiv_create ( CHAM_ipiv_t *ipiv,
+                          const CHAM_desc_t *desc );
+void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv,
+                           const CHAM_desc_t *desc );
 void RUNTIME_ipiv_gather ( const RUNTIME_sequence_t *sequence,
                            CHAM_ipiv_t *desc, int *ipiv, int node );
 
@@ -730,18 +733,18 @@ void RUNTIME_perm_flushk( const RUNTIME_sequence_t *sequence,
                           const CHAM_ipiv_t *ipiv, int m );
 
 void *RUNTIME_ipiv_getaddr   ( const CHAM_ipiv_t *ipiv, int m );
-void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h );
-void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h );
+void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int k, int h );
+void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int k, int h );
 void *RUNTIME_perm_getaddr   ( const CHAM_ipiv_t *ipiv, int m );
 void *RUNTIME_invp_getaddr   ( const CHAM_ipiv_t *ipiv, int m );
 
 static inline void *
-RUNTIME_pivot_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) {
+RUNTIME_pivot_getaddr( CHAM_ipiv_t *ipiv, int rank, int k, int h ) {
     if ( h%2 == 0 ) {
-        return RUNTIME_nextpiv_getaddr( ipiv, m, -1 );
+        return RUNTIME_nextpiv_getaddr( ipiv, rank, k, h );
     }
     else {
-        return RUNTIME_prevpiv_getaddr( ipiv, m, -1 );
+        return RUNTIME_prevpiv_getaddr( ipiv, rank, k, h );
     }
 }
 
diff --git a/include/chameleon/tasks.h b/include/chameleon/tasks.h
index aa21e99d8f85c82b9484da1aa7d599d995b66bb4..99d70dbade30332f9af8ce5397636f8023a10e24 100644
--- a/include/chameleon/tasks.h
+++ b/include/chameleon/tasks.h
@@ -16,6 +16,7 @@
  * @author Cedric Augonnet
  * @author Florent Pruvost
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
  * @date 2024-03-16
  *
  */
@@ -165,7 +166,7 @@ void INSERT_TASK_hgemm( const RUNTIME_option_t *options,
 void INSERT_TASK_ipiv_init   ( const RUNTIME_option_t *options,
                                CHAM_ipiv_t *ipiv );
 void INSERT_TASK_ipiv_reducek( const RUNTIME_option_t *options,
-                               CHAM_ipiv_t *ws, int k, int h );
+                               CHAM_ipiv_t *ws, int k, int h, int rank );
 void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options,
                                int m0, int m, int k,
                                const CHAM_ipiv_t *ipivdesc, int ipivk );
diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h
index 795ebd2d186f9c1e88a44ab6312d40583b1a4d5d..93a5f6e303c8b00076e78fbe7faf58fc59dfe4f7 100644
--- a/include/chameleon/tasks_z.h
+++ b/include/chameleon/tasks_z.h
@@ -562,4 +562,12 @@ void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options,
                                       CHAM_desc_t *U, int Um, int Un,
                                       CHAM_ipiv_t *ws );
 
+void INSERT_TASK_zipiv_allreduce( CHAM_desc_t            *A,
+                                  const RUNTIME_option_t *options,
+                                  CHAM_ipiv_t            *ipiv,
+                                  int                    *proc_involved,
+                                  int                     k,
+                                  int                     h,
+                                  int                     n );
+
 #endif /* _chameleon_tasks_z_h_ */
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index f7203fbe500d517ea64251ea198600944ce9291c..08279345b7f5d95ba633f3143f7c1b39fe2d6352 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -24,6 +24,7 @@
 #  @author Florent Pruvost
 #  @author Philippe Virouleau
 #  @author Matthieu Kuhn
+#  @author Alycia Lisito
 #  @date 2024-03-16
 #
 ###
@@ -73,6 +74,7 @@ set(CODELETS_ZSRC
     codelets/codelet_zhe2ge.c
     codelets/codelet_zherfb.c
     codelets/codelet_zhessq.c
+    codelets/codelet_zipiv_allreduce.c
     codelets/codelet_zlacpy.c
     codelets/codelet_zlange.c
     codelets/codelet_zlanhe.c
diff --git a/runtime/openmp/codelets/codelet_ipiv.c b/runtime/openmp/codelets/codelet_ipiv.c
index d6386bb58d09d584cc066371cb18bd6be3fad3b7..548d688fb10388791d18df0dbd495ee9e969a01a 100644
--- a/runtime/openmp/codelets/codelet_ipiv.c
+++ b/runtime/openmp/codelets/codelet_ipiv.c
@@ -28,13 +28,14 @@ void INSERT_TASK_ipiv_init( const RUNTIME_option_t *options,
 }
 
 void INSERT_TASK_ipiv_reducek( const RUNTIME_option_t *options,
-                               CHAM_ipiv_t *ipiv, int k, int h )
+                               CHAM_ipiv_t *ipiv, int k, int h, int rank )
 {
     assert( 0 );
     (void)options;
     (void)ipiv;
     (void)k;
     (void)h;
+    (void)rank;
 }
 
 void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options,
diff --git a/runtime/openmp/codelets/codelet_zipiv_allreduce.c b/runtime/openmp/codelets/codelet_zipiv_allreduce.c
new file mode 100644
index 0000000000000000000000000000000000000000..b088283254cd64e1bada1628939436327b8a2789
--- /dev/null
+++ b/runtime/openmp/codelets/codelet_zipiv_allreduce.c
@@ -0,0 +1,35 @@
+/**
+ *
+ * @file openmp/codelet_zipiv_allreduce.c
+ *
+ * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon openmp codelet stub for the ipiv all-reduce (no task is inserted)
+ *
+ * @version 1.3.0
+ * @author Alycia Lisito
+ * @date 2024-06-11
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_openmp.h"
+
+void INSERT_TASK_zipiv_allreduce( CHAM_desc_t            *A,
+                                  const RUNTIME_option_t *options,
+                                  CHAM_ipiv_t            *ipiv,
+                                  int                    *proc_involved,
+                                  int                     k,
+                                  int                     h,
+                                  int                     n )
+{
+    (void)A; /* Stub: no task is inserted, every argument is only cast to void */
+    (void)options;
+    (void)ipiv;
+    (void)proc_involved;
+    (void)k;
+    (void)h;
+    (void)n; /* NOTE(review): the openmp INSERT_TASK_ipiv_reducek stub asserts; confirm a silent no-op is intended here */
+}
diff --git a/runtime/openmp/control/runtime_descriptor_ipiv.c b/runtime/openmp/control/runtime_descriptor_ipiv.c
index 9514b6fd067af22b09d4c224e01eb0c87f4e4de1..3a727f01c3ae30eef51ee4273096b704d9e8e777 100644
--- a/runtime/openmp/control/runtime_descriptor_ipiv.c
+++ b/runtime/openmp/control/runtime_descriptor_ipiv.c
@@ -12,21 +12,27 @@
  * @version 1.3.0
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
+ * @author Florent Pruvost
  * @date 2024-03-16
  *
  */
 #include "chameleon_openmp.h"
 
-void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv )
+void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv,
+                          const CHAM_desc_t *desc )
 {
     assert( 0 );
     (void)ipiv;
+    (void)desc;
 }
 
-void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv )
+void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv,
+                           const CHAM_desc_t *desc )
 {
     assert( 0 );
     (void)ipiv;
+    (void)desc;
 }
 
 void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m )
@@ -37,19 +43,21 @@ void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m )
     return NULL;
 }
 
-void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h )
+void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int m, int h )
 {
     assert( 0 );
     (void)ipiv;
+    (void)rank;
     (void)m;
     (void)h;
     return NULL;
 }
 
-void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h )
+void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int m, int h )
 {
     assert( 0 );
     (void)ipiv;
+    (void)rank;
     (void)m;
     (void)h;
     return NULL;
diff --git a/runtime/parsec/codelets/codelet_ipiv.c b/runtime/parsec/codelets/codelet_ipiv.c
index b9ac7e05468ba805c97dac09f75ef1c37c63f928..46fee3ee85ac11a6a6cac20febfdd2f6ddde9712 100644
--- a/runtime/parsec/codelets/codelet_ipiv.c
+++ b/runtime/parsec/codelets/codelet_ipiv.c
@@ -28,13 +28,14 @@ void INSERT_TASK_ipiv_init( const RUNTIME_option_t *options,
 }
 
 void INSERT_TASK_ipiv_reducek( const RUNTIME_option_t *options,
-                               CHAM_ipiv_t *ipiv, int k, int h )
+                               CHAM_ipiv_t *ipiv, int k, int h, int rank )
 {
     assert( 0 );
     (void)options;
     (void)ipiv;
     (void)k;
     (void)h;
+    (void)rank;
 }
 
 static inline int
diff --git a/runtime/parsec/codelets/codelet_zipiv_allreduce.c b/runtime/parsec/codelets/codelet_zipiv_allreduce.c
new file mode 100644
index 0000000000000000000000000000000000000000..75e0611647a464cad9c37e59a5619ebefaae19ed
--- /dev/null
+++ b/runtime/parsec/codelets/codelet_zipiv_allreduce.c
@@ -0,0 +1,35 @@
+/**
+ *
+ * @file parsec/codelet_zipiv_allreduce.c
+ *
+ * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon parsec codelet stub for the ipiv all-reduce (no task is inserted)
+ *
+ * @version 1.3.0
+ * @author Alycia Lisito
+ * @date 2024-06-11
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_parsec.h"
+
+void INSERT_TASK_zipiv_allreduce( CHAM_desc_t            *A,
+                                  const RUNTIME_option_t *options,
+                                  CHAM_ipiv_t            *ipiv,
+                                  int                    *proc_involved,
+                                  int                     k,
+                                  int                     h,
+                                  int                     n )
+{
+    (void)A; /* Stub: no task is inserted, every argument is only cast to void */
+    (void)options;
+    (void)ipiv;
+    (void)proc_involved;
+    (void)k;
+    (void)h;
+    (void)n; /* NOTE(review): the parsec INSERT_TASK_ipiv_reducek stub asserts; confirm a silent no-op is intended here */
+}
diff --git a/runtime/parsec/control/runtime_descriptor_ipiv.c b/runtime/parsec/control/runtime_descriptor_ipiv.c
index 53621950fff15975835e686fabc48c6fd3e7d9e4..6108199eb0fb89ffdb359afa48b725e79de86fdf 100644
--- a/runtime/parsec/control/runtime_descriptor_ipiv.c
+++ b/runtime/parsec/control/runtime_descriptor_ipiv.c
@@ -12,21 +12,27 @@
  * @version 1.3.0
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
+ * @author Florent Pruvost
  * @date 2024-03-16
  *
  */
 #include "chameleon_parsec.h"
 
-void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv )
+void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv,
+                          const CHAM_desc_t *desc )
 {
     assert( 0 );
     (void)ipiv;
+    (void)desc;
 }
 
-void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv )
+void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv,
+                           const CHAM_desc_t *desc )
 {
     assert( 0 );
     (void)ipiv;
+    (void)desc;
 }
 
 void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m )
@@ -37,19 +43,21 @@ void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m )
     return NULL;
 }
 
-void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h )
+void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int m, int h )
 {
     assert( 0 );
     (void)ipiv;
+    (void)rank;
     (void)m;
     (void)h;
     return NULL;
 }
 
-void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h )
+void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int m, int h )
 {
     assert( 0 );
     (void)ipiv;
+    (void)rank;
     (void)m;
     (void)h;
     return NULL;
diff --git a/runtime/quark/codelets/codelet_ipiv.c b/runtime/quark/codelets/codelet_ipiv.c
index ab982faf04523af148f8e8444a6469681a4a0ea6..5fc849b890a75f3447c425f1a458d11fd0c3df1c 100644
--- a/runtime/quark/codelets/codelet_ipiv.c
+++ b/runtime/quark/codelets/codelet_ipiv.c
@@ -28,13 +28,14 @@ void INSERT_TASK_ipiv_init( const RUNTIME_option_t *options,
 }
 
 void INSERT_TASK_ipiv_reducek( const RUNTIME_option_t *options,
-                               CHAM_ipiv_t *ipiv, int k, int h )
+                               CHAM_ipiv_t *ipiv, int k, int h, int rank )
 {
     assert( 0 );
     (void)options;
     (void)ipiv;
     (void)k;
     (void)h;
+    (void)rank;
 }
 
 static inline void
diff --git a/runtime/quark/codelets/codelet_zipiv_allreduce.c b/runtime/quark/codelets/codelet_zipiv_allreduce.c
new file mode 100644
index 0000000000000000000000000000000000000000..e88269e931f3f210282a1382d44a6ff9516c7453
--- /dev/null
+++ b/runtime/quark/codelets/codelet_zipiv_allreduce.c
@@ -0,0 +1,35 @@
+/**
+ *
+ * @file quark/codelet_zipiv_allreduce.c
+ *
+ * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon quark codelet stub for the ipiv all-reduce (no task is inserted)
+ *
+ * @version 1.3.0
+ * @author Alycia Lisito
+ * @date 2024-06-11
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_quark.h"
+
+void INSERT_TASK_zipiv_allreduce( CHAM_desc_t            *A,
+                                  const RUNTIME_option_t *options,
+                                  CHAM_ipiv_t            *ipiv,
+                                  int                    *proc_involved,
+                                  int                     k,
+                                  int                     h,
+                                  int                     n )
+{
+    (void)A; /* Stub: no task is inserted, every argument is only cast to void */
+    (void)options;
+    (void)ipiv;
+    (void)proc_involved;
+    (void)k;
+    (void)h;
+    (void)n; /* NOTE(review): the quark INSERT_TASK_ipiv_reducek stub asserts; confirm a silent no-op is intended here */
+}
diff --git a/runtime/quark/control/runtime_descriptor_ipiv.c b/runtime/quark/control/runtime_descriptor_ipiv.c
index 9edd6f041c266cbd6c51cea48c359d283f3f619a..f5fa28fe5a6246a3e91476d574f9e771d162d40f 100644
--- a/runtime/quark/control/runtime_descriptor_ipiv.c
+++ b/runtime/quark/control/runtime_descriptor_ipiv.c
@@ -12,21 +12,27 @@
  * @version 1.3.0
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
+ * @author Florent Pruvost
  * @date 2024-03-16
  *
  */
 #include "chameleon_quark.h"
 
-void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv )
+void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv,
+                          const CHAM_desc_t *desc )
 {
     assert( 0 );
     (void)ipiv;
+    (void)desc;
 }
 
-void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv )
+void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv,
+                           const CHAM_desc_t *desc )
 {
     assert( 0 );
     (void)ipiv;
+    (void)desc;
 }
 
 void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m )
@@ -37,19 +43,21 @@ void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m )
     return NULL;
 }
 
-void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h )
+void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int m, int h )
 {
     assert( 0 );
     (void)ipiv;
+    (void)rank;
     (void)m;
     (void)h;
     return NULL;
 }
 
-void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h )
+void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int m, int h )
 {
     assert( 0 );
     (void)ipiv;
+    (void)rank;
     (void)m;
     (void)h;
     return NULL;
diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt
index 7f2eab94cd34acadf580ed42f5c858562d927ae4..f01a36b14433bdac07cd795fe24cb865ee0cd228 100644
--- a/runtime/starpu/CMakeLists.txt
+++ b/runtime/starpu/CMakeLists.txt
@@ -33,6 +33,7 @@ cmake_minimum_required(VERSION 3.5)
 
 include(CheckSymbolExists)
 include(CheckStructHasMember)
+include(CheckCSourceRuns)
 
 set(CHAMELEON_STARPU_VERSION "1.3" CACHE STRING "necessary STARPU API version")
 
@@ -90,6 +91,25 @@ if ( STARPU_FOUND )
     message("-- ${Blue}Add definition HAVE_STARPU_REUSE_DATA_ON_NODE${ColourReset}")
   endif()
 
+  # Check, by running a small test program, that STARPU_NONE is not equal to 0 (defines HAVE_STARPU_NONE_NONZERO)
+  set(C_STARPU_NONE_NONZERO "
+#include <stdio.h>
+#include <stdint.h>
+#include <starpu.h>
+int main() {
+  if (STARPU_NONE == 0)
+    return 1;
+  else
+    return 0;
+}
+")
+
+  unset(HAVE_STARPU_NONE_NONZERO CACHE)
+  check_c_source_runs("${C_STARPU_NONE_NONZERO}" HAVE_STARPU_NONE_NONZERO)
+  if ( HAVE_STARPU_NONE_NONZERO )
+    message("-- ${Blue}Add definition HAVE_STARPU_NONE_NONZERO${ColourReset}")
+  endif()
+
   if (CHAMELEON_USE_MPI)
     # Add MPI in case StarPU don't have a public dependency on it
     check_function_exists(starpu_mpi_init_conf HAVE_STARPU_MPI_INIT_CONF)
diff --git a/runtime/starpu/codelets/codelet_ipiv.c b/runtime/starpu/codelets/codelet_ipiv.c
index 64e6031391793de8dc829e2ac47eddabfdba7be5..e5dba252a6312d625a825485cc84d0657973f435 100644
--- a/runtime/starpu/codelets/codelet_ipiv.c
+++ b/runtime/starpu/codelets/codelet_ipiv.c
@@ -12,6 +12,7 @@
  * @version 1.3.0
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
  * @date 2024-03-16
  *
  */
@@ -62,13 +63,13 @@ void INSERT_TASK_ipiv_init( const RUNTIME_option_t *options,
 }
 
 void INSERT_TASK_ipiv_reducek( const RUNTIME_option_t *options,
-                               CHAM_ipiv_t *ipiv, int k, int h )
+                               CHAM_ipiv_t *ipiv, int k, int h, int rank )
 {
-    starpu_data_handle_t prevpiv = RUNTIME_pivot_getaddr( ipiv, k, h-1 );
+    starpu_data_handle_t prevpiv = RUNTIME_pivot_getaddr( ipiv, rank, k, h-1 );
 
 #if defined(HAVE_STARPU_MPI_REDUX) && defined(CHAMELEON_USE_MPI)
 #if !defined(HAVE_STARPU_MPI_REDUX_WRAPUP)
-    starpu_data_handle_t nextpiv = RUNTIME_pivot_getaddr( ipiv, k, h   );
+    starpu_data_handle_t nextpiv = RUNTIME_pivot_getaddr( ipiv, rank, k, h   );
     if ( h < ipiv->n ) {
         starpu_mpi_redux_data_prio_tree( options->sequence->comm, nextpiv,
                                          options->priority, 2 /* Binary tree */ );
diff --git a/runtime/starpu/codelets/codelet_zgetrf_batched.c b/runtime/starpu/codelets/codelet_zgetrf_batched.c
index 1d4cb37da9bc6099305ddcf9eb4516fb17feaf52..d9c55d76cd3fa290ab004ebc854e3d5f4638cf93 100644
--- a/runtime/starpu/codelets/codelet_zgetrf_batched.c
+++ b/runtime/starpu/codelets/codelet_zgetrf_batched.c
@@ -43,15 +43,16 @@ cl_zgetrf_panel_offdiag_batched_cpu_func( void *descr[],
                                           void *cl_arg )
 {
     struct cl_getrf_batched_args_t *clargs  = (struct cl_getrf_batched_args_t *) cl_arg;
-    cppi_interface_t               *nextpiv = (cppi_interface_t*) descr[0];
-    cppi_interface_t               *prevpiv = (cppi_interface_t*) descr[1];
+    cppi_interface_t               *nextpiv = (cppi_interface_t*) descr[ clargs->tasks_nbr ];
+    cppi_interface_t               *prevpiv = (cppi_interface_t*) descr[ clargs->tasks_nbr + 1 ];
     int                             i, m, n, h, m0, lda;
     CHAM_tile_t                    *tileA;
 
     nextpiv->h = clargs->h;
+    nextpiv->has_diag = chameleon_max( -1, nextpiv->has_diag );
 
     for ( i = 0; i < clargs->tasks_nbr; i++ ) {
-        tileA = cti_interface_get( descr[ i + 2 ] );
+        tileA = cti_interface_get( descr[ i ] );
         lda   = tileA->ld;
         m     = clargs->m[ i ];
         n     = clargs->n[ i ];
@@ -77,6 +78,15 @@ INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options,
     int          batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size;
     void (*callback)(void*) = NULL;
     struct cl_getrf_batched_args_t *clargs = *clargs_ptr;
+    int rankA = A->get_rankof( A, Am, An );
+    if ( rankA != A->myrank ) {
+        return;
+    }
+#if !defined(HAVE_STARPU_NONE_NONZERO)
+    /* STARPU_NONE is used as an access mode by this task, so it can't be equal to 0 */
+    fprintf( stderr, "INSERT_TASK_zgetrf_panel_offdiag_batched: STARPU_NONE can not be equal to 0\n" );
+    assert( 0 );
+#endif
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
@@ -85,6 +95,7 @@ INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options,
 
     if ( clargs == NULL ) {
         clargs = malloc( sizeof( struct cl_getrf_batched_args_t ) ) ;
+        memset( clargs, 0, sizeof( struct cl_getrf_batched_args_t ) );
         clargs->tasks_nbr   = 0;
         clargs->h           = h;
         clargs->cl_name     = "zgetrf_panel_offdiag_batched";
@@ -104,13 +115,15 @@ INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options,
                                               A->get_blktile( A, Am, An ) );
 
     if ( clargs->tasks_nbr == batch_size ) {
+        int access_npiv = ( h == ipiv->n ) ? STARPU_R    : STARPU_REDUX;
+        int access_ppiv = ( h == 0 )       ? STARPU_NONE : STARPU_R;
         rt_starpu_insert_task(
             &cl_zgetrf_panel_offdiag_batched,
             /* Task codelet arguments */
             STARPU_CL_ARGS,           clargs, sizeof(struct cl_getrf_batched_args_t),
-            STARPU_REDUX,             RUNTIME_pivot_getaddr( ipiv, An, h   ),
-            STARPU_R,                 RUNTIME_pivot_getaddr( ipiv, An, h-1 ),
             STARPU_DATA_MODE_ARRAY,   clargs->handle_mode, clargs->tasks_nbr,
+            access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h   ),
+            access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ),
             STARPU_PRIORITY,          options->priority,
             STARPU_CALLBACK,          callback,
             STARPU_EXECUTE_ON_WORKER, options->workerid,
@@ -132,18 +145,26 @@ INSERT_TASK_zgetrf_panel_offdiag_batched_flush( const RUNTIME_option_t *options,
 {
     void (*callback)(void*) = NULL;
     struct cl_getrf_batched_args_t *clargs = *clargs_ptr;
+    int rankA = A->myrank;
+#if !defined(HAVE_STARPU_NONE_NONZERO)
+    /* STARPU_NONE is used as an access mode by this task, so it can't be equal to 0 */
+    fprintf( stderr, "INSERT_TASK_zgetrf_panel_offdiag_batched_flush: STARPU_NONE can not be equal to 0\n" );
+    assert( 0 );
+#endif
 
     if ( clargs == NULL ) {
         return;
     }
+    int access_npiv = ( clargs->h == ipiv->n ) ? STARPU_R    : STARPU_REDUX;
+    int access_ppiv = ( clargs->h == 0 )       ? STARPU_NONE : STARPU_R;
 
     rt_starpu_insert_task(
         &cl_zgetrf_panel_offdiag_batched,
         /* Task codelet arguments */
         STARPU_CL_ARGS,           clargs, sizeof(struct cl_getrf_batched_args_t),
-        STARPU_REDUX,             RUNTIME_pivot_getaddr( ipiv, An, clargs->h   ),
-        STARPU_R,                 RUNTIME_pivot_getaddr( ipiv, An, clargs->h-1 ),
         STARPU_DATA_MODE_ARRAY,   clargs->handle_mode, clargs->tasks_nbr,
+        access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h   ),
+        access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h-1 ),
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
@@ -162,20 +183,27 @@ cl_zgetrf_panel_blocked_batched_cpu_func( void *descr[],
                                           void *cl_arg )
 {
     struct cl_getrf_batched_args_t *clargs  = ( struct cl_getrf_batched_args_t * ) cl_arg;
-    int                            *ipiv    = (int *)STARPU_VECTOR_GET_PTR(descr[clargs->tasks_nbr]);
-    cppi_interface_t               *nextpiv = (cppi_interface_t*) descr[clargs->tasks_nbr + 1];
-    cppi_interface_t               *prevpiv = (cppi_interface_t*) descr[clargs->tasks_nbr + 2];
+    int                            *ipiv;
+    cppi_interface_t               *nextpiv = (cppi_interface_t*) descr[clargs->tasks_nbr ];
+    cppi_interface_t               *prevpiv = (cppi_interface_t*) descr[clargs->tasks_nbr + 1];
     int                             i, h, ib;
     CHAM_tile_t                    *tileA, *tileU;
     CHAMELEON_Complex64_t          *U   = NULL;
     int                             ldu = -1;
 
     nextpiv->h = clargs->h;
+    nextpiv->has_diag = chameleon_max( -1, nextpiv->has_diag);
 
     h  = clargs->h;
     ib = clargs->ib;
     i  = 0;
     if ( clargs->diag ) {
+        if ( h == 0 ) {
+            ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[clargs->tasks_nbr + 1]);
+        }
+        else {
+            ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[clargs->tasks_nbr + 2]);
+        }
         if ( h != 0 ) {
             tileU = cti_interface_get( descr[ clargs->tasks_nbr + 3 ] );
             U     = CHAM_tile_get_ptr( tileU );
@@ -190,7 +218,7 @@ cl_zgetrf_panel_blocked_batched_cpu_func( void *descr[],
         i++;
     }
     if ( ( h%ib == 0 ) && ( h > 0 ) ) {
-        tileU = cti_interface_get( descr[ clargs->tasks_nbr + 3 ] );
+        tileU = cti_interface_get( descr[ clargs->tasks_nbr + 2 + clargs->diag ] );
         U     = CHAM_tile_get_ptr( tileU );
         ldu   = tileU->ld;
     }
@@ -225,6 +253,28 @@ INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options,
     void (*callback)(void*) = NULL;
     int accessU, access_npiv, access_ipiv, access_ppiv;
     struct cl_getrf_batched_args_t *clargs = *clargs_ptr;
+    int rankA = A->get_rankof(A, Am, An);
+#if !defined(HAVE_STARPU_NONE_NONZERO)
+    /* STARPU_NONE is used as an access mode by this task, so it can't be equal to 0 */
+    fprintf( stderr, "INSERT_TASK_zgetrf_panel_blocked_batched: STARPU_NONE can not be equal to 0\n" );
+    assert( 0 );
+#endif
+
+#if defined ( CHAMELEON_USE_MPI )
+    if ( ( Am == An ) && ( h % ib == 0 ) && ( h > 0 ) ) {
+        starpu_mpi_cache_flush( options->sequence->comm,
+                                RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un) );
+    }
+
+    if ( rankA != A->myrank ) {
+        if ( ( h % ib == 0 ) && ( h > 0 ) && ( A->myrank == A->get_rankof( A, An, An ) ) ) {
+            starpu_mpi_get_data_on_node_detached( options->sequence->comm,
+                                                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
+                                                  rankA, NULL, NULL );
+        }
+        return;
+    }
+#endif
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
@@ -232,7 +282,8 @@ INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options,
     CHAMELEON_END_ACCESS_DECLARATION;
 
     if ( clargs == NULL ) {
-        clargs = malloc( sizeof( struct cl_getrf_batched_args_t ) ) ;
+        clargs = malloc( sizeof( struct cl_getrf_batched_args_t ) );
+        memset( clargs, 0, sizeof( struct cl_getrf_batched_args_t ) );
         clargs->tasks_nbr         = 0;
         clargs->diag              = ( Am == An );
         clargs->ib                = ib;
@@ -271,24 +322,25 @@ INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options,
         }
         /* If there isn't a diag task then use offdiag access */
         if ( clargs->diag == 0 ) {
-            accessU = ((h%ib == 0) && (h > 0)) ? STARPU_R : STARPU_NONE;
+            accessU     = ((h%ib == 0) && (h > 0)) ? STARPU_R : STARPU_NONE;
+            access_ipiv = STARPU_NONE;
         }
 
         rt_starpu_insert_task(
             &cl_zgetrf_panel_blocked_batched,
             /* Task codelet arguments */
             STARPU_CL_ARGS,           clargs, sizeof(struct cl_getrf_batched_args_t),
+            STARPU_DATA_MODE_ARRAY,   clargs->handle_mode, clargs->tasks_nbr,
+            access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h ),
+            access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ),
+            access_ipiv,              RUNTIME_ipiv_getaddr( ipiv, An ),
+            accessU,                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un ),
             STARPU_PRIORITY,          options->priority,
             STARPU_CALLBACK,          callback,
             STARPU_EXECUTE_ON_WORKER, options->workerid,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
             STARPU_NAME,              clargs->cl_name,
 #endif
-            STARPU_DATA_MODE_ARRAY,   clargs->handle_mode, clargs->tasks_nbr,
-            access_ipiv,              RUNTIME_ipiv_getaddr( ipiv, An ),
-            access_npiv,              RUNTIME_pivot_getaddr( ipiv, An, h ),
-            access_ppiv,              RUNTIME_pivot_getaddr( ipiv, An, h-1 ),
-            accessU,                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un ),
             0);
 
         /* clargs is freed by starpu. */
@@ -306,6 +358,12 @@ INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options,
     int accessU, access_npiv, access_ipiv, access_ppiv;
     void (*callback)(void*) = NULL;
     struct cl_getrf_batched_args_t *clargs = *clargs_ptr;
+    int rankA = A->myrank;
+#if !defined(HAVE_STARPU_NONE_NONZERO)
+    /* STARPU_NONE is used as an access mode by this task, so it can't be equal to 0 */
+    fprintf( stderr, "INSERT_TASK_zgetrf_panel_blocked_batched_flush: STARPU_NONE can not be equal to 0\n" );
+    assert( 0 );
+#endif
 
     if ( clargs == NULL ) {
         return;
@@ -328,24 +386,25 @@ INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options,
     }
     /* If there isn't a diag task then use offdiag access */
     if ( clargs->diag == 0 ) {
-        accessU = ((clargs->h%clargs->ib == 0) && (clargs->h > 0)) ? STARPU_R : STARPU_NONE;
+        accessU     = ((clargs->h%clargs->ib == 0) && (clargs->h > 0)) ? STARPU_R : STARPU_NONE;
+        access_ipiv = STARPU_NONE;
     }
 
     rt_starpu_insert_task(
         &cl_zgetrf_panel_blocked_batched,
         /* Task codelet arguments */
         STARPU_CL_ARGS,           clargs, sizeof(struct cl_getrf_batched_args_t),
+        STARPU_DATA_MODE_ARRAY,   clargs->handle_mode, clargs->tasks_nbr,
+        access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h ),
+        access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h - 1 ),
+        access_ipiv,              RUNTIME_ipiv_getaddr( ipiv, An ),
+        accessU,                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un ),
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME,              clargs->cl_name,
 #endif
-        STARPU_DATA_MODE_ARRAY,   clargs->handle_mode, clargs->tasks_nbr,
-        access_ipiv,              RUNTIME_ipiv_getaddr( ipiv, An ),
-        access_npiv,              RUNTIME_pivot_getaddr( ipiv, An, clargs->h ),
-        access_ppiv,              RUNTIME_pivot_getaddr( ipiv, An, clargs->h - 1 ),
-        accessU,                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un ),
         0);
 
     /* clargs is freed by starpu. */
diff --git a/runtime/starpu/codelets/codelet_zgetrf_blocked.c b/runtime/starpu/codelets/codelet_zgetrf_blocked.c
index 2c6daa18d9bda1f7ff433305aa98ad77f648b4b5..8739f27deb22f8ba019fa85338c4fdcbc0a0d789 100644
--- a/runtime/starpu/codelets/codelet_zgetrf_blocked.c
+++ b/runtime/starpu/codelets/codelet_zgetrf_blocked.c
@@ -14,6 +14,7 @@
  *
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
  * @date 2024-03-11
  * @precisions normal z -> c d s
  *
@@ -67,6 +68,7 @@ static void cl_zgetrf_blocked_diag_cpu_func(void *descr[], void *cl_arg)
     nextpiv->h        = h;
     nextpiv->has_diag = 1;
 
+    coreblas_kernel_trace( tileA );
     CORE_zgetrf_panel_diag( m, n, h, m0, ib,
                             CHAM_tile_get_ptr( tileA ), tileA->ld,
                             U, ldu,
@@ -95,6 +97,22 @@ void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options,
     struct starpu_codelet *codelet = &cl_zgetrf_blocked_diag;
     void (*callback)(void*) = options->profiling ? cl_zgetrf_blocked_diag_callback : NULL;
     const char *cl_name = "zgetrf_blocked_diag";
+    int rankA           = A->get_rankof(A, Am, An);
+#if !defined(HAVE_STARPU_NONE_NONZERO)
+    /* STARPU_NONE can't be equal to 0 */
+    fprintf( stderr, "INSERT_TASK_zgetrf_blocked_diag: STARPU_NONE can not be equal to 0\n" );
+    assert( 0 );
+#endif
+
+#if defined ( CHAMELEON_USE_MPI )
+    if ( ( h % ib == 0 ) && ( h > 0 ) ) {
+        starpu_mpi_cache_flush( options->sequence->comm, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un) );
+    }
+
+    if ( rankA != A->myrank ) {
+        return;
+    }
+#endif
 
     int access_ipiv = ( h == 0 )       ? STARPU_W    : STARPU_RW;
     int access_npiv = ( h == ipiv->n ) ? STARPU_R    : STARPU_REDUX;
@@ -108,7 +126,7 @@ void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options,
     else if ( h%ib == 0 ) {
         accessU = STARPU_R;
     }
-    else if ( h%ib == 1 ) {
+    else if ( ( h%ib == 1 ) || ( ib == 1 ) ) {
         accessU = STARPU_W;
     }
 
@@ -130,25 +148,24 @@ void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options,
         STARPU_VALUE,             &ib,                  sizeof(int),
         STARPU_VALUE,             &(options->sequence), sizeof(RUNTIME_sequence_t*),
         STARPU_VALUE,             &(options->request),  sizeof(RUNTIME_request_t*),
+        STARPU_RW,                RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
+        access_ipiv,              RUNTIME_ipiv_getaddr( ipiv, An ),
+        access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h ),
+        access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ),
+        accessU,                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME,              cl_name,
 #endif
-        /* STARPU_NONE must be the last argument for older version of StarPU where STARPU_NONE = 0 */
-        STARPU_RW,                RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
-        access_ipiv,              RUNTIME_ipiv_getaddr( ipiv, An ),
-        access_npiv,              RUNTIME_pivot_getaddr( ipiv, An, h   ),
-        access_ppiv,              RUNTIME_pivot_getaddr( ipiv, An, h-1 ),
-        accessU,                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
         0);
 }
 
 #if !defined(CHAMELEON_SIMULATION)
 static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg)
 {
-    int                    m, n, h, m0, ib;
+    int                    m, n, h, k, m0, ib;
     RUNTIME_sequence_t    *sequence;
     RUNTIME_request_t     *request;
     CHAM_tile_t           *tileA;
@@ -156,9 +173,9 @@ static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg)
     cppi_interface_t      *nextpiv;
     cppi_interface_t      *prevpiv;
     CHAMELEON_Complex64_t *U   = NULL;
-    int                    ldu = -1;;
+    int                    ldu = -1;
 
-    starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &m0, &ib, &sequence, &request );
+    starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &k, &m0, &ib, &sequence, &request );
 
     tileA   = cti_interface_get(descr[0]);
     nextpiv = (cppi_interface_t*) descr[1];
@@ -169,12 +186,28 @@ static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg)
         ldu   = tileU->ld;
     }
 
+    if ( h > 0 ) {
+        cppi_display_dbg( prevpiv, stderr, "Prevpiv offdiag before call: " );
+    }
+    if ( h < tileA->n ) {
+        cppi_display_dbg( nextpiv, stderr, "Nextpiv offdiag before call: " );
+    }
+
     nextpiv->h = h; /* Initialize in case it uses a copy */
+    nextpiv->has_diag = chameleon_max( -1, nextpiv->has_diag);
 
+    coreblas_kernel_trace( tileA );
     CORE_zgetrf_panel_offdiag( m, n, h, m0, ib,
                                CHAM_tile_get_ptr(tileA), tileA->ld,
                                U, ldu,
                                &(nextpiv->pivot), &(prevpiv->pivot) );
+
+    if ( h > 0 ) {
+        cppi_display_dbg( prevpiv, stderr, "Prevpiv offdiag after call: " );
+    }
+    if ( h < tileA->n ) {
+        cppi_display_dbg( nextpiv, stderr, "Nextpiv offdiag after call: " );
+    }
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
@@ -190,9 +223,29 @@ void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options,
                                          CHAM_ipiv_t *ipiv )
 {
     struct starpu_codelet *codelet = &cl_zgetrf_blocked_offdiag;
+
     int access_npiv = ( h == ipiv->n ) ? STARPU_R    : STARPU_REDUX;
     int access_ppiv = ( h == 0 )       ? STARPU_NONE : STARPU_R;
     int accessU     = ((h%ib == 0) && (h > 0)) ? STARPU_R : STARPU_NONE;
+    int rankA       = A->get_rankof(A, Am, An);
+#if !defined(HAVE_STARPU_NONE_NONZERO)
+    /* STARPU_NONE is used as an access mode by this task, so it can't be equal to 0 */
+    fprintf( stderr, "INSERT_TASK_zgetrf_blocked_offdiag: STARPU_NONE can not be equal to 0\n" );
+    assert( 0 );
+#endif
+
+#if defined ( CHAMELEON_USE_MPI )
+    if ( rankA != A->myrank ) {
+        if ( ( accessU != STARPU_NONE ) &&
+             ( A->myrank == A->get_rankof( A, An, An ) ) )
+        {
+            starpu_mpi_get_data_on_node_detached( options->sequence->comm,
+                                                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
+                                                  rankA, NULL, NULL );
+        }
+        return;
+    }
+#endif
 
     void (*callback)(void*) = options->profiling ? cl_zgetrf_blocked_offdiag_callback : NULL;
     const char *cl_name = "zgetrf_blocked_offdiag";
@@ -200,6 +253,9 @@ void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options,
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_RW( A, Am, An );
+    if ((h%ib == 0) && (h > 0)) {
+        CHAMELEON_ACCESS_R( U, Um, Un );
+    }
     CHAMELEON_END_ACCESS_DECLARATION;
 
     /* Refine name */
@@ -211,21 +267,21 @@ void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options,
         STARPU_VALUE,             &m,                   sizeof(int),
         STARPU_VALUE,             &n,                   sizeof(int),
         STARPU_VALUE,             &h,                   sizeof(int),
+        STARPU_VALUE,             &An,                  sizeof(int),
         STARPU_VALUE,             &m0,                  sizeof(int),
         STARPU_VALUE,             &ib,                  sizeof(int),
         STARPU_VALUE,             &(options->sequence), sizeof(RUNTIME_sequence_t *),
         STARPU_VALUE,             &(options->request),  sizeof(RUNTIME_request_t *),
+        STARPU_RW,                RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
+        access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h ),
+        access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ),
+        accessU,                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME,              cl_name,
 #endif
-        /* STARPU_NONE must be the last argument for older version of StarPU where STARPU_NONE = 0 */
-        STARPU_RW,                RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
-        access_npiv,              RUNTIME_pivot_getaddr( ipiv, An, h   ),
-        access_ppiv,              RUNTIME_pivot_getaddr( ipiv, An, h-1 ),
-        accessU,                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
         0);
 }
 
@@ -247,6 +303,8 @@ static void cl_zgetrf_blocked_trsm_cpu_func(void *descr[], void *cl_arg)
     U       = CHAM_tile_get_ptr( tileU );
     ldu     = tileU->ld;
 
+    coreblas_kernel_trace( tileU );
+
     /* Copy the final max line of the block and solve */
     cblas_zcopy( n, prevpiv->pivot.pivrow, 1,
                     U + m - 1, ldu );
@@ -276,6 +334,7 @@ void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options,
 
     void (*callback)(void*) = options->profiling ? cl_zgetrf_blocked_trsm_callback : NULL;
     const char *cl_name = "zgetrf_blocked_trsm";
+    int rankU = U->get_rankof(U, Um, Un);
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
@@ -286,6 +345,10 @@ void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options,
     cl_name = chameleon_codelet_name( cl_name, 1,
                                       U->get_blktile( U, Um, Un ) );
 
+    if ( U->myrank != rankU ) { /* Only the owner of U(Um, Un) submits the task */
+        return;
+    }
+
     rt_starpu_insert_task(
         codelet,
         STARPU_VALUE,             &m,                   sizeof(int),
@@ -293,7 +356,7 @@ void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options,
         STARPU_VALUE,             &h,                   sizeof(int),
         STARPU_VALUE,             &ib,                  sizeof(int),
         STARPU_RW,                RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
-        STARPU_R,                 RUNTIME_pivot_getaddr( ipiv, Un, h-1 ),
+        STARPU_R,                 RUNTIME_pivot_getaddr( ipiv, rankU, Un, h-1 ),
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
diff --git a/runtime/starpu/codelets/codelet_zgetrf_percol.c b/runtime/starpu/codelets/codelet_zgetrf_percol.c
index 5d3f83b6ce046a72135c8f513c8cc23822159595..0b556f81605a9cc78faea6fa6e312ffc0e643631 100644
--- a/runtime/starpu/codelets/codelet_zgetrf_percol.c
+++ b/runtime/starpu/codelets/codelet_zgetrf_percol.c
@@ -14,6 +14,7 @@
  *
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
  * @date 2024-03-11
  * @precisions normal z -> c d s
  *
@@ -84,6 +85,17 @@ void INSERT_TASK_zgetrf_percol_diag( const RUNTIME_option_t *options,
     struct starpu_codelet *codelet = &cl_zgetrf_percol_diag;
     void (*callback)(void*) = options->profiling ? cl_zgetrf_percol_diag_callback : NULL;
     const char *cl_name = "zgetrf_percol_diag";
+    int rankA           = A->get_rankof(A, Am, An);
+
+#if !defined(HAVE_STARPU_NONE_NONZERO)
+    /* STARPU_NONE can't be equal to 0 */
+    fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" );
+    assert( 0 );
+#endif
+
+    if ( rankA != A->myrank ) {
+        return;
+    }
 
     int access_ipiv = ( h == 0 )       ? STARPU_W    : STARPU_RW;
     int access_npiv = ( h == ipiv->n ) ? STARPU_R    : STARPU_REDUX;
@@ -95,8 +107,7 @@ void INSERT_TASK_zgetrf_percol_diag( const RUNTIME_option_t *options,
     CHAMELEON_END_ACCESS_DECLARATION;
 
     /* Refine name */
-    cl_name = chameleon_codelet_name( cl_name, 1,
-                                      A->get_blktile( A, Am, An ) );
+    cl_name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) );
 
     rt_starpu_insert_task(
         codelet,
@@ -106,17 +117,16 @@ void INSERT_TASK_zgetrf_percol_diag( const RUNTIME_option_t *options,
         STARPU_VALUE,             &m0,                  sizeof(int),
         STARPU_VALUE,             &(options->sequence), sizeof(RUNTIME_sequence_t*),
         STARPU_VALUE,             &(options->request),  sizeof(RUNTIME_request_t*),
+        STARPU_RW,                RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
+        access_ipiv,              RUNTIME_ipiv_getaddr( ipiv, An ),
+        access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h   ),
+        access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ),
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME,              cl_name,
 #endif
-        /* STARPU_NONE must be the last argument for older version of StarPU where STARPU_NONE = 0 */
-        STARPU_RW,                RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
-        access_ipiv,              RUNTIME_ipiv_getaddr( ipiv, An ),
-        access_npiv,              RUNTIME_pivot_getaddr( ipiv, An, h   ),
-        access_ppiv,              RUNTIME_pivot_getaddr( ipiv, An, h-1 ),
         0);
 }
 
@@ -137,6 +147,7 @@ static void cl_zgetrf_percol_offdiag_cpu_func(void *descr[], void *cl_arg)
     prevpiv = (cppi_interface_t*) descr[2];
 
     nextpiv->h = h; /* Initialize in case it uses a copy */
+    nextpiv->has_diag = chameleon_max( -1, nextpiv->has_diag);
 
     CORE_zgetrf_panel_offdiag( m, n, h, m0, tileA->n,
                                CHAM_tile_get_ptr(tileA), tileA->ld,
@@ -159,6 +170,18 @@ void INSERT_TASK_zgetrf_percol_offdiag( const RUNTIME_option_t *options,
 
     void (*callback)(void*) = options->profiling ? cl_zgetrf_percol_offdiag_callback : NULL;
     const char *cl_name = "zgetrf_percol_offdiag";
+    int access_npiv = ( h == ipiv->n ) ? STARPU_R    : STARPU_REDUX;
+    int access_ppiv = ( h == 0 )       ? STARPU_NONE : STARPU_R;
+    int rankA       = A->get_rankof(A, Am, An);
+#if !defined(HAVE_STARPU_NONE_NONZERO)
+    /* STARPU_NONE is used as an access mode by this task, so it can't be equal to 0 */
+    fprintf( stderr, "INSERT_TASK_zgetrf_percol_offdiag: STARPU_NONE can not be equal to 0\n" );
+    assert( 0 );
+#endif
+
+    if ( rankA != A->myrank ) {
+        return;
+    }
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
@@ -166,8 +189,7 @@ void INSERT_TASK_zgetrf_percol_offdiag( const RUNTIME_option_t *options,
     CHAMELEON_END_ACCESS_DECLARATION;
 
     /* Refine name */
-    cl_name = chameleon_codelet_name( cl_name, 1,
-                                      A->get_blktile( A, Am, An ) );
+    cl_name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) );
 
     rt_starpu_insert_task(
         codelet,
@@ -178,8 +200,8 @@ void INSERT_TASK_zgetrf_percol_offdiag( const RUNTIME_option_t *options,
         STARPU_VALUE,             &(options->sequence), sizeof(RUNTIME_sequence_t *),
         STARPU_VALUE,             &(options->request),  sizeof(RUNTIME_request_t *),
         STARPU_RW,                RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
-        STARPU_REDUX,             RUNTIME_pivot_getaddr( ipiv, An, h   ),
-        STARPU_R,                 RUNTIME_pivot_getaddr( ipiv, An, h-1 ),
+        access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h   ),
+        access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ),
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
diff --git a/runtime/starpu/codelets/codelet_zipiv_allreduce.c b/runtime/starpu/codelets/codelet_zipiv_allreduce.c
new file mode 100644
index 0000000000000000000000000000000000000000..13a41ceb04be76b2f89419a20bd6209d3aebd6e3
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_zipiv_allreduce.c
@@ -0,0 +1,169 @@
+/**
+ *
+ * @file starpu/codelet_zipiv_allreduce.c
+ *
+ * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon StarPU codelets to all-reduce the pivot between MPI processes
+ *
+ * @version 1.3.0
+ * @author Alycia Lisito
+ * @date 2024-06-11
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_starpu.h"
+#include "runtime_codelet_z.h"
+#include <coreblas/cblas_wrapper.h>
+
+#if defined(CHAMELEON_USE_MPI)
+struct cl_redux_args_t {
+    int h; /* index of the pivot-row entry whose modulus is compared by the kernel */
+    int n; /* number of elements copied from the pivot / diagonal rows */
+    int k; /* k value of the inserting call; not read by the CPU kernel */
+};
+
+/* CPU kernel of one all-reduce step: merge the pivot candidate read from
+ * descr[1] into the one held in descr[0] (both are cppi interfaces). */
+static void cl_zipiv_allreduce_cpu_func( void *descr[], void *cl_arg )
+{
+    struct cl_redux_args_t *clargs      = (struct cl_redux_args_t *) cl_arg;
+    cppi_interface_t       *cppi_me     = ((cppi_interface_t *) descr[0]);
+    cppi_interface_t       *cppi_src    = ((cppi_interface_t *) descr[1]);
+    CHAM_pivot_t           *nextpiv_me  = &(cppi_me->pivot);
+    CHAM_pivot_t           *nextpiv_src = &(cppi_src->pivot);
+    CHAMELEON_Complex64_t  *pivrow_me   = (CHAMELEON_Complex64_t *)(nextpiv_me->pivrow);
+    CHAMELEON_Complex64_t  *pivrow_src  = (CHAMELEON_Complex64_t *)(nextpiv_src->pivrow);
+
+    cppi_display_dbg( cppi_me,  stderr, "Global redux Inout: ");
+    cppi_display_dbg( cppi_src, stderr, "Global redux Input: ");
+    assert( cppi_me->n         == cppi_src->n         );
+    assert( cppi_me->h         == cppi_src->h         );
+    assert( cppi_me->flttype   == cppi_src->flttype   );
+    assert( cppi_me->arraysize == cppi_src->arraysize );
+
+    /* Keep the candidate with the largest modulus at entry h (local one wins ties) */
+    if ( cabs( pivrow_src[ clargs->h ] ) > cabs( pivrow_me[ clargs->h ] ) ) {
+        nextpiv_me->blkm0  = nextpiv_src->blkm0;
+        nextpiv_me->blkidx = nextpiv_src->blkidx;
+        cblas_zcopy( clargs->n, pivrow_src, 1, pivrow_me, 1 );
+    }
+
+    /* Copy the diagonal row if the source has it (1) and we are still at -1; size checked first */
+    if ( ( cppi_src->has_diag == 1 ) && ( cppi_me->has_diag == -1 ) ) {
+        assert( cppi_src->arraysize == clargs->n * sizeof(CHAMELEON_Complex64_t) );
+        cblas_zcopy( clargs->n, nextpiv_src->diagrow, 1, nextpiv_me->diagrow, 1 );
+        cppi_me->has_diag = 1;
+    }
+
+    cppi_display_dbg( cppi_me,  stderr, "Global redux Inout(After): ");
+}
+
+CODELETS_CPU( zipiv_allreduce, cl_zipiv_allreduce_cpu_func ) /* presumably defines cl_zipiv_allreduce used below; macro not visible here */
+
+/* Submit a codelet-less task (NULL) that reads the pivot handle of rank `me`
+ * for (k, h) with STARPU_EXECUTE_ON_NODE set to `dst`. */
+void
+INSERT_TASK_zipiv_allreduce_send( CHAM_ipiv_t *ipiv, int me, int dst, int k, int h,
+                                  const RUNTIME_option_t *options )
+{
+    void *local_pivot = RUNTIME_pivot_getaddr( ipiv, me, k, h );
+
+    rt_starpu_insert_task(
+        NULL,
+        STARPU_EXECUTE_ON_NODE, dst,
+        STARPU_R,               local_pivot,
+        STARPU_PRIORITY,        options->priority,
+        0 );
+}
+
+/* Submit, on rank `me`, the task merging the pivot of rank `src` into the local
+ * pivot for (k, h), then flush the StarPU-MPI cache for the source handle. */
+void
+INSERT_TASK_zipiv_allreduce_recv( CHAM_ipiv_t *ipiv, int me, int src, int k, int h, int n,
+                                  const RUNTIME_option_t *options )
+{
+    void                   *pivot_me  = RUNTIME_pivot_getaddr( ipiv, me,  k, h );
+    void                   *pivot_src = RUNTIME_pivot_getaddr( ipiv, src, k, h );
+    struct cl_redux_args_t *clargs    = malloc( sizeof( struct cl_redux_args_t ) );
+
+    /* Arguments read by the kernel; the buffer is handed over via STARPU_CL_ARGS */
+    clargs->h = h;
+    clargs->n = n;
+    clargs->k = k;
+
+    rt_starpu_insert_task(
+        &cl_zipiv_allreduce,
+        STARPU_CL_ARGS,           clargs, sizeof(struct cl_redux_args_t),
+        STARPU_RW,                pivot_me,
+        STARPU_R,                 pivot_src,
+        STARPU_EXECUTE_ON_NODE,   me,
+        STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_PRIORITY,          options->priority,
+        0 );
+    starpu_mpi_cache_flush( options->sequence->comm, pivot_src );
+}
+
+/* Submit the send/recv task pairs that all-reduce the pivot for (k, h)
+ * between the np_involved processes listed in proc_involved. The exchange
+ * distance (shift) doubles at every round. */
+void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, const RUNTIME_option_t *options,
+                                  CHAM_ipiv_t *ipiv, int *proc_involved,
+                                  int k, int h, int n )
+{
+    int np_involved = chameleon_min( A->p, A->mt - k);
+    int np_iter, shift, p_recv, p_send, me;
+
+    /* Invalidate the local pivot handle of the previous step (h-1) */
+    if ( h > 0 ) {
+        starpu_data_invalidate_submit( RUNTIME_pivot_getaddr( ipiv, A->myrank, k, h-1 ) );
+    }
+    if ( h >= ipiv->n ) {
+        return;
+    }
+
+    /* Only one process involved: nothing to exchange */
+    if ( np_involved == 1 ) {
+        assert( proc_involved[0] == A->myrank );
+        return;
+    }
+
+    /* Find the position of this rank in proc_involved */
+    me = 0;
+    while ( ( me < np_involved ) && ( proc_involved[me] != A->myrank ) ) {
+        me++;
+    }
+    assert( me < np_involved );
+
+    /* Each round sends `shift` positions ahead and merges from `shift` positions behind */
+    for ( np_iter = np_involved, shift = 1; np_iter > 1;
+          np_iter = chameleon_ceil( np_iter, 2 ), shift <<= 1 )
+    {
+        p_send = proc_involved[ ( me + shift               ) % np_involved ];
+        p_recv = proc_involved[ ( me - shift + np_involved ) % np_involved ];
+
+        INSERT_TASK_zipiv_allreduce_send( ipiv, A->myrank, p_send, k, h,    options );
+        INSERT_TASK_zipiv_allreduce_recv( ipiv, A->myrank, p_recv, k, h, n, options );
+    }
+}
+#else
+/* Build without CHAMELEON_USE_MPI: no exchange between processes, only
+ * invalidate the pivot handle of the previous step (h-1) when there is one. */
+void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, const RUNTIME_option_t *options,
+                                  CHAM_ipiv_t *ipiv, int *proc_involved,
+                                  int k, int h, int n )
+{
+    /* Parameters not used by this variant */
+    (void)options;
+    (void)proc_involved;
+    (void)n;
+
+    if ( h <= 0 ) {
+        return;
+    }
+    starpu_data_invalidate_submit( RUNTIME_pivot_getaddr( ipiv, A->myrank, k, h-1 ) );
+}
+#endif
diff --git a/runtime/starpu/control/runtime_descriptor_ipiv.c b/runtime/starpu/control/runtime_descriptor_ipiv.c
index 48be66e17652b487c246c2eec0dd8211d7890b36..1ad0f7a142fd9272a3ffb445dd797db774959d60 100644
--- a/runtime/starpu/control/runtime_descriptor_ipiv.c
+++ b/runtime/starpu/control/runtime_descriptor_ipiv.c
@@ -12,6 +12,8 @@
  * @version 1.3.0
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
+ * @author Florent Pruvost
  * @date 2024-03-16
  *
  */
@@ -20,16 +22,18 @@
 /**
  *  Create ws_pivot runtime structures
  */
-void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv )
+void RUNTIME_ipiv_create( CHAM_ipiv_t       *ipiv,
+                          const CHAM_desc_t *desc )
 {
     assert( ipiv );
-    starpu_data_handle_t *handles = calloc( 5 * ipiv->mt, sizeof(starpu_data_handle_t) );
+    size_t                nbhandles = 3 * ipiv->mt + 2 * desc->p;
+    starpu_data_handle_t *handles   = calloc( nbhandles, sizeof(starpu_data_handle_t) );
     ipiv->ipiv    = handles;
     handles += ipiv->mt;
     ipiv->nextpiv = handles;
-    handles += ipiv->mt;
+    handles += desc->p;
     ipiv->prevpiv = handles;
-    handles += ipiv->mt;
+    handles += desc->p;
     ipiv->perm    = handles;
     handles += ipiv->mt;
     ipiv->invp    = handles;
@@ -40,14 +44,14 @@ void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv )
      */
     {
         chameleon_starpu_tag_init();
-        ipiv->mpitag_ipiv = chameleon_starpu_tag_book( (int64_t)(ipiv->mt) * 5 );
+        ipiv->mpitag_ipiv = chameleon_starpu_tag_book( nbhandles );
         if ( ipiv->mpitag_ipiv == -1 ) {
             chameleon_fatal_error("RUNTIME_ipiv_create", "Can't pursue computation since no more tags are available for ipiv structure");
             return;
         }
         ipiv->mpitag_nextpiv = ipiv->mpitag_ipiv    + ipiv->mt;
-        ipiv->mpitag_prevpiv = ipiv->mpitag_nextpiv + ipiv->mt;
-        ipiv->mpitag_perm    = ipiv->mpitag_prevpiv + ipiv->mt;
+        ipiv->mpitag_prevpiv = ipiv->mpitag_nextpiv + desc->p;
+        ipiv->mpitag_perm    = ipiv->mpitag_prevpiv + desc->p;
         ipiv->mpitag_invp    = ipiv->mpitag_perm    + ipiv->mt;
     }
 #endif
@@ -56,12 +60,14 @@ void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv )
 /**
  *  Destroy ws_pivot runtime structures
  */
-void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv )
+void RUNTIME_ipiv_destroy( CHAM_ipiv_t       *ipiv,
+                           const CHAM_desc_t *desc )
 {
     int                   i;
     starpu_data_handle_t *handle = (starpu_data_handle_t*)(ipiv->ipiv);
+    size_t                nbhandles = 3 * ipiv->mt + 2 * desc->p;
 
-    for(i=0; i<(5 * ipiv->mt); i++) {
+    for(i=0; i<nbhandles; i++) {
         if ( *handle != NULL ) {
             starpu_data_unregister( *handle );
             *handle = NULL;
@@ -107,49 +113,51 @@ void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m )
     return (void*)(*handle);
 }
 
-void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h )
+void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int k, int h )
 {
     starpu_data_handle_t *nextpiv = (starpu_data_handle_t*)(ipiv->nextpiv);
-    int64_t mm = m + (ipiv->i / ipiv->mb);
+    const CHAM_desc_t *A = ipiv->desc;
 
-    nextpiv += mm;
+    nextpiv += rank/A->q;
     assert( nextpiv );
 
     if ( *nextpiv != NULL ) {
         return (void*)(*nextpiv);
     }
 
-    const CHAM_desc_t *A = ipiv->desc;
-    int     owner = A->get_rankof( A, m, m );
-    int     ncols = (mm == (A->nt-1)) ? A->n - mm * A->nb : A->nb;
-    int64_t tag   = ipiv->mpitag_nextpiv + mm;
+    int64_t kk    = k + (ipiv->i / ipiv->mb);
+    int     owner = rank;
+    int     ncols = (kk == (A->nt-1)) ? A->n - kk * A->nb : A->nb;
+    int64_t tag   = ipiv->mpitag_nextpiv + owner/A->q;
 
     cppi_register( nextpiv, A->dtyp, ncols, tag, owner );
 
     assert( *nextpiv );
+    (void)h;
     return (void*)(*nextpiv);
 }
 
-void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h )
+void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int k, int h )
 {
     starpu_data_handle_t *prevpiv = (starpu_data_handle_t*)(ipiv->prevpiv);
-    int64_t mm = m + (ipiv->i / ipiv->mb);
+    const CHAM_desc_t *A = ipiv->desc;
 
-    prevpiv += mm;
+    prevpiv += rank/A->q;
     assert( prevpiv );
 
     if ( *prevpiv != NULL ) {
         return (void*)(*prevpiv);
     }
 
-    const CHAM_desc_t *A = ipiv->desc;
-    int     owner = A->get_rankof( A, m, m );
-    int     ncols = (mm == (A->nt-1)) ? A->n - mm * A->nb : A->nb;
-    int64_t tag   = ipiv->mpitag_prevpiv + mm;
+    int64_t kk    = k + (ipiv->i / ipiv->mb);
+    int     owner = rank;
+    int     ncols = (kk == (A->nt-1)) ? A->n - kk * A->nb : A->nb;
+    int64_t tag   = ipiv->mpitag_prevpiv + owner/A->q;
 
     cppi_register( prevpiv, A->dtyp, ncols, tag, owner );
 
     assert( *prevpiv );
+    (void)h;
     return (void*)(*prevpiv);
 }
 
@@ -212,19 +220,18 @@ void *RUNTIME_invp_getaddr( const CHAM_ipiv_t *ipiv, int m )
 }
 
 void RUNTIME_ipiv_flushk( const RUNTIME_sequence_t *sequence,
-                          const CHAM_ipiv_t *ipiv, int m )
+                          const CHAM_ipiv_t *ipiv, int rank )
 {
     starpu_data_handle_t *handle;
     const CHAM_desc_t *A = ipiv->desc;
-    int64_t mm = m + ( ipiv->i / ipiv->mb );
 
     handle = (starpu_data_handle_t*)(ipiv->nextpiv);
-    handle += mm;
+    handle += rank/A->q;
 
     if ( *handle != NULL ) {
 #if defined(CHAMELEON_USE_MPI)
         starpu_mpi_cache_flush( sequence->comm, *handle );
-        if ( starpu_mpi_data_get_rank( *handle ) == A->myrank )
+        if ( starpu_mpi_data_get_rank( *handle ) == rank )
 #endif
         {
             chameleon_starpu_data_wont_use( *handle );
@@ -232,12 +239,12 @@ void RUNTIME_ipiv_flushk( const RUNTIME_sequence_t *sequence,
     }
 
     handle = (starpu_data_handle_t*)(ipiv->prevpiv);
-    handle += mm;
+    handle += rank/A->q;
 
     if ( *handle != NULL ) {
 #if defined(CHAMELEON_USE_MPI)
         starpu_mpi_cache_flush( sequence->comm, *handle );
-        if ( starpu_mpi_data_get_rank( *handle ) == A->myrank )
+        if ( starpu_mpi_data_get_rank( *handle ) == rank )
 #endif
         {
             chameleon_starpu_data_wont_use( *handle );
@@ -246,7 +253,7 @@ void RUNTIME_ipiv_flushk( const RUNTIME_sequence_t *sequence,
 
     (void)sequence;
     (void)ipiv;
-    (void)m;
+    (void)rank;
 }
 
 void RUNTIME_ipiv_flush( const RUNTIME_sequence_t *sequence,
diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in
index 41949dfbb7c345050a5260b47646276c7af57002..6c4632da84520449a2e2c9f96fedef2209d196e9 100644
--- a/runtime/starpu/include/chameleon_starpu.h.in
+++ b/runtime/starpu/include/chameleon_starpu.h.in
@@ -40,6 +40,7 @@
 #cmakedefine HAVE_STARPU_SET_LIMIT_SUBMITTED_TASKS
 #cmakedefine HAVE_STARPU_REUSE_DATA_ON_NODE
 #cmakedefine HAVE_STARPU_PARALLEL_WORKER
+#cmakedefine HAVE_STARPU_NONE_NONZERO
 
 #cmakedefine HAVE_STARPU_MPI_DATA_MIGRATE
 #cmakedefine HAVE_STARPU_MPI_DATA_REGISTER
diff --git a/runtime/starpu/include/cppi_interface.h b/runtime/starpu/include/cppi_interface.h
index 7dbd10118c6bee637c9c49b6bc5bdf9d3fc008e2..8113c453fb2d344bda5972430e9e8337a3b0f880 100644
--- a/runtime/starpu/include/cppi_interface.h
+++ b/runtime/starpu/include/cppi_interface.h
@@ -12,6 +12,7 @@
  * @version 1.3.0
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
  * @date 2023-08-22
  *
  */
@@ -44,6 +45,7 @@ struct cppi_interface_s
 void cppi_interface_init();
 void cppi_interface_fini();
 
+CHAM_pivot_t *cppi_handle_get( starpu_data_handle_t handle );
 void cppi_register( starpu_data_handle_t *handleptr,
                     cham_flttype_t        flttype,
                     int                   n,
@@ -61,13 +63,14 @@ cppi_display_dbg( cppi_interface_t *cppi_interface, FILE *f, const char *title )
     diagrow = cppi_interface->pivot.diagrow;
     pivrow  = cppi_interface->pivot.pivrow;
 
-    fprintf( f, "%sn=%2d, h=%2d, has_diag=%2d, m0=%2d, idx=%2d\n",
+    fprintf( f, "%sn=%2d, h=%2d, has_diag=%2d, m0=%2d, idx=%2d, interf = %p\n",
              title,
              cppi_interface->n,
              cppi_interface->h,
              cppi_interface->has_diag,
              cppi_interface->pivot.blkm0,
-             cppi_interface->pivot.blkidx );
+             cppi_interface->pivot.blkidx,
+             cppi_interface );
 
     fprintf(stderr, "Diagonal row: " );
     for( i=0; i<cppi_interface->n; i++) {
diff --git a/runtime/starpu/interface/cppi_interface.c b/runtime/starpu/interface/cppi_interface.c
index 2d1754ec1cee030c040ee1c941cf283a6f58b284..6b1f8063180e78dbebf1ea443ee28f98920a7723 100644
--- a/runtime/starpu/interface/cppi_interface.c
+++ b/runtime/starpu/interface/cppi_interface.c
@@ -12,13 +12,14 @@
  * @version 1.3.0
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
  * @date 2023-08-22
  *
  */
 #include "chameleon_starpu.h"
 #undef HAVE_STARPU_REUSE_DATA_ON_NODE
 
-static inline CHAM_pivot_t *
+CHAM_pivot_t *
 cppi_handle_get( starpu_data_handle_t handle )
 {
     cppi_interface_t *cppi_interface = (cppi_interface_t *)
@@ -38,7 +39,7 @@ cppi_init( void *data_interface )
     cppi_interface_t *cppi_interface = (cppi_interface_t *)data_interface;
     cppi_interface->id = CPPI_INTERFACE_ID;
     cppi_interface->h  = -1;
-    cppi_interface->has_diag = 0;
+    cppi_interface->has_diag = -1;
 }
 
 static void
@@ -83,7 +84,7 @@ cppi_allocate_data_on_node( void *data_interface, unsigned node )
 
     /* update the data properly in consequence */
     cppi_interface->h = -1;
-    cppi_interface->has_diag = 0;
+    cppi_interface->has_diag = -1;
     cppi_interface->pivot.pivrow  = dataptr;
     cppi_interface->pivot.diagrow = ((char*)dataptr) + cppi_interface->arraysize;
 
@@ -279,8 +280,10 @@ cppi_describe( void *data_interface, char *buf, size_t size )
 {
     cppi_interface_t *cppi_interface = (cppi_interface_t *) data_interface;
 
-    return snprintf( buf, size, "Pivot structure, n %d, blkm0 %d, blkidx %d",
+    return snprintf( buf, size, "Pivot structure, n %d, h %d, has_diag = %d, blkm0 %d, blkidx %d",
                      cppi_interface->n,
+                     cppi_interface->h,
+                     cppi_interface->has_diag,
                      cppi_interface->pivot.blkm0,
                      cppi_interface->pivot.blkidx );
 }
@@ -298,6 +301,7 @@ cppi_copy_any_to_any( void *src_interface, unsigned src_node,
     STARPU_ASSERT( cppi_interface_src->flttype == cppi_interface_dst->flttype );
 
     cppi_interface_dst->h            = cppi_interface_src->h;
+    cppi_interface_dst->has_diag     = cppi_interface_src->has_diag;
     cppi_interface_dst->pivot.blkm0  = cppi_interface_src->pivot.blkm0;
     cppi_interface_dst->pivot.blkidx = cppi_interface_src->pivot.blkidx;
 
@@ -402,8 +406,8 @@ cl_cppi_redux_cpu_func(void *descr[], void *cl_arg)
     assert( cppi_redux->h == cppi_input->h );
 
     /* Let's copy the diagonal row if needed */
-    if ( cppi_input->has_diag ) {
-        assert( cppi_redux->has_diag == 0 );
+    if ( cppi_input->has_diag == 1 ) {
+        assert( cppi_redux->has_diag == -1 );
 
         memcpy( cppi_redux->pivot.diagrow,
                 cppi_input->pivot.diagrow,
@@ -449,7 +453,7 @@ cl_cppi_init_redux_cpu_func( void *descr[], void *cl_arg )
     cppi_interface_t *cppi_redux = ((cppi_interface_t *) descr[0]);
 
     /* Redux pivot never has diagonal at initialization */
-    cppi_redux->has_diag = 0;
+    cppi_redux->has_diag = -1;
     cppi_redux->h        = -1;
 
     size_t size = cppi_redux->arraysize;
@@ -497,7 +501,7 @@ cppi_register( starpu_data_handle_t *handleptr,
             .id = CPPI_INTERFACE_ID,
             .arraysize = n * CHAMELEON_Element_Size( flttype ),
             .flttype = flttype,
-            .has_diag = 0,
+            .has_diag = -1,
             .h  = -1,
             .n  = n,
         };
diff --git a/testing/CTestLists.cmake b/testing/CTestLists.cmake
index a1b637f681ed0bb82a981e65cd26310b03b514b7..c185e50b525c719a7b62422f89d5d5b9a259c435 100644
--- a/testing/CTestLists.cmake
+++ b/testing/CTestLists.cmake
@@ -88,28 +88,25 @@ if (NOT CHAMELEON_SIMULATION)
         if ( CHAMELEON_SCHED_STARPU )
             add_test( test_${cat}_${prec}getrf_nopivpercol ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 --diag=ChamUnit -f input/getrf_nopiv.in )
             set_tests_properties( test_${cat}_${prec}getrf_nopivpercol
-                                PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=nopivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=1" )
+                                  PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=nopivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=0" )
 
-            add_test( test_${cat}_${prec}getrf_ppivpercol ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf_nopiv.in )
-            set_tests_properties( test_${cat}_${prec}getrf_ppivpercol
-                                PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=1" )
+            if ( HAVE_STARPU_NONE_NONZERO )
+                add_test( test_${cat}_${prec}getrf_ppivpercol ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf_nopiv.in )
+                set_tests_properties( test_${cat}_${prec}getrf_ppivpercol
+                                    PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=0" )
 
-            if ( ${cat} STREQUAL "shm" )
                 add_test( test_${cat}_${prec}getrf_ppivpercol_batch ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf_nopiv.in )
                 set_tests_properties( test_${cat}_${prec}getrf_ppivpercol_batch
                                     PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=6" )
-            endif()
 
-            add_test( test_${cat}_${prec}getrf_ppiv ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf.in )
-            set_tests_properties( test_${cat}_${prec}getrf_ppiv
-                                PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=1" )
+                add_test( test_${cat}_${prec}getrf_ppiv ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf.in )
+                set_tests_properties( test_${cat}_${prec}getrf_ppiv
+                                    PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=0" )
 
-            if ( ${cat} STREQUAL "shm" )
                 add_test( test_${cat}_${prec}getrf_ppiv_batch ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf.in )
                 set_tests_properties( test_${cat}_${prec}getrf_ppiv_batch
-                                      PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=6" )
+                                    PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=6" )
             endif()
-
         endif()
 
         list( REMOVE_ITEM TESTSTMP print gepdf_qr )
diff --git a/testing/testing_zgetrf.c b/testing/testing_zgetrf.c
index dc978bc6f13b224ebe19fdaf3a653ad4e09cd56f..4645631a7a86a72e5c8fe2fa5f8b40e61991e66c 100644
--- a/testing/testing_zgetrf.c
+++ b/testing/testing_zgetrf.c
@@ -151,8 +151,8 @@ testing_zgetrf_desc( run_arg_list_t *args, int check )
         CHAMELEON_zgetrf_WS_Free( ws );
     }
 
+    CHAMELEON_Ipiv_Destroy( &descIPIV, descA );
     parameters_desc_destroy( &descA );
-    CHAMELEON_Ipiv_Destroy( &descIPIV );
 
     return hres;
 }