From 26cce899fb40152b42791d36089335d8517c0436 Mon Sep 17 00:00:00 2001
From: Alycia Lisito <alycia.lisito@inria.fr>
Date: Wed, 24 Jul 2024 16:33:19 +0200
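Subject: [PATCH] zgetrf: make a prevpiv and a nextpiv per MPI process involved
 in the panel

The pivot accessors RUNTIME_nextpiv_getaddr(), RUNTIME_prevpiv_getaddr()
and RUNTIME_pivot_getaddr() now take a rank argument in addition to the
panel index, and INSERT_TASK_ipiv_reducek() takes the rank as a new last
argument.

RUNTIME_ipiv_create(), RUNTIME_ipiv_destroy(), chameleon_ipiv_destroy()
and the public CHAMELEON_Ipiv_Destroy() now also take the matrix
descriptor.

Note: the default value of CHAMELEON_GETRF_BATCH_SIZE changes from 1 to 0.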
---
 compute/pzgetrf.c                             | 16 ++---
 compute/zgetrf.c                              |  6 +-
 control/descriptor.h                          |  3 +-
 control/descriptor_ipiv.c                     | 14 ++--
 include/chameleon.h                           | 13 +++-
 include/chameleon/runtime.h                   | 17 +++--
 include/chameleon/tasks.h                     |  3 +-
 runtime/starpu/codelets/codelet_ipiv.c        |  7 +-
 .../starpu/codelets/codelet_zgetrf_batched.c  | 68 ++++++++++++-------
 .../starpu/codelets/codelet_zgetrf_blocked.c  | 56 +++++++++++----
 .../starpu/codelets/codelet_zgetrf_percol.c   | 24 ++++---
 .../starpu/control/runtime_descriptor_ipiv.c  | 67 ++++++++++--------
 testing/testing_zgetrf.c                      |  2 +-
 13 files changed, 185 insertions(+), 111 deletions(-)

diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c
index 584a96596..cdc98668c 100644
--- a/compute/pzgetrf.c
+++ b/compute/pzgetrf.c
@@ -149,13 +149,13 @@ chameleon_pzgetrf_panel_facto_percol( struct chameleon_pzgetrf_s *ws,
 
         if ( h < minmn ) {
             /* Reduce globally (between MPI processes) */
-            INSERT_TASK_ipiv_reducek( options, ipiv, k, h );
+            INSERT_TASK_ipiv_reducek( options, ipiv, k, h, A->myrank );
         }
     }
 
     /* Flush temporary data used for the pivoting */
     INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k );
-    RUNTIME_ipiv_flushk( options->sequence, ipiv, k );
+    RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank );
 }
 
 /*
@@ -198,7 +198,7 @@ chameleon_pzgetrf_panel_facto_percol_batched( struct chameleon_pzgetrf_s *ws,
 
         if ( h < minmn ) {
             /* Reduce globally (between MPI processes) */
-            INSERT_TASK_ipiv_reducek( options, ipiv, k, h );
+            INSERT_TASK_ipiv_reducek( options, ipiv, k, h, A->myrank );
         }
     }
 
@@ -206,7 +206,7 @@ chameleon_pzgetrf_panel_facto_percol_batched( struct chameleon_pzgetrf_s *ws,
 
     /* Flush temporary data used for the pivoting */
     INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k );
-    RUNTIME_ipiv_flushk( options->sequence, ipiv, k );
+    RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank );
 }
 
 static inline void
@@ -266,7 +266,7 @@ chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws,
             assert( j<= minmn );
             if ( j < minmn ) {
                 /* Reduce globally (between MPI processes) */
-                INSERT_TASK_ipiv_reducek( options, ipiv, k, j );
+                INSERT_TASK_ipiv_reducek( options, ipiv, k, j, A->myrank );
             }
         }
     }
@@ -274,7 +274,7 @@ chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws,
 
     /* Flush temporary data used for the pivoting */
     INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k );
-    RUNTIME_ipiv_flushk( options->sequence, ipiv, k );
+    RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank );
 }
 
 /*
@@ -330,7 +330,7 @@ chameleon_pzgetrf_panel_facto_blocked_batched( struct chameleon_pzgetrf_s *ws,
             assert( j <= minmn );
             if ( j < minmn ) {
                 /* Reduce globally (between MPI processes) */
-                INSERT_TASK_ipiv_reducek( options, ipiv, k, j );
+                INSERT_TASK_ipiv_reducek( options, ipiv, k, j, A->myrank );
             }
         }
     }
@@ -339,7 +339,7 @@ chameleon_pzgetrf_panel_facto_blocked_batched( struct chameleon_pzgetrf_s *ws,
 
     /* Flush temporary data used for the pivoting */
     INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k );
-    RUNTIME_ipiv_flushk( options->sequence, ipiv, k );
+    RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank );
 }
 
 static inline void
diff --git a/compute/zgetrf.c b/compute/zgetrf.c
index a94a05551..508a78125 100644
--- a/compute/zgetrf.c
+++ b/compute/zgetrf.c
@@ -19,6 +19,8 @@
  * @author Florent Pruvost
  * @author Matthieu Kuhn
  * @author Lionel Eyraud-Dubois
+ * @author Alycia Lisito
+ * @author Xavier Lacoste
  * @date 2024-03-16
  *
  * @precisions normal z -> s d c
@@ -88,7 +90,7 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A )
         chameleon_cleanenv( algostr );
     }
 
-    ws->batch_size = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE", 1 );
+    ws->batch_size = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE", 0 );
     if ( ws->batch_size > CHAMELEON_BATCH_SIZE ) {
         chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE value\n" );
         ws->batch_size = CHAMELEON_BATCH_SIZE;
@@ -300,7 +302,7 @@ CHAMELEON_zgetrf( int M, int N, CHAMELEON_Complex64_t *A, int LDA, int *IPIV )
     if ( ( ws->alg == ChamGetrfPPivPerColumn ) ||
          ( ws->alg == ChamGetrfPPiv ) )
     {
-        chameleon_ipiv_destroy( &descIPIV );
+        chameleon_ipiv_destroy( &descIPIV, &descAt );
     }
     CHAMELEON_zgetrf_WS_Free( ws );
     chameleon_ztile2lap_cleanup( chamctxt, &descAl, &descAt );
diff --git a/control/descriptor.h b/control/descriptor.h
index 306abe6c5..1e0315fae 100644
--- a/control/descriptor.h
+++ b/control/descriptor.h
@@ -20,6 +20,7 @@
  * @author Raphael Boucherie
  * @author Samuel Thibault
  * @author Lionel Eyraud-Dubois
+ * @author Alycia Lisito
  * @date 2023-08-22
  *
  */
@@ -77,7 +78,7 @@ void         chameleon_desc_destroy  ( CHAM_desc_t *desc );
 int          chameleon_desc_check    ( const CHAM_desc_t *desc );
 
 int chameleon_ipiv_init( CHAM_ipiv_t *ipiv, const CHAM_desc_t *desc, void *data );
-void chameleon_ipiv_destroy( CHAM_ipiv_t *ipiv );
+void chameleon_ipiv_destroy( CHAM_ipiv_t *ipiv, const CHAM_desc_t *desc );
 
 /**
  *  Internal function to return address of block (m,n) with m,n = block indices
diff --git a/control/descriptor_ipiv.c b/control/descriptor_ipiv.c
index e9631909b..c3369b7a4 100644
--- a/control/descriptor_ipiv.c
+++ b/control/descriptor_ipiv.c
@@ -12,6 +12,8 @@
  * @version 1.3.0
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
+ * @author Florent Pruvost
  * @date 2024-03-16
  *
  ***
@@ -73,7 +75,7 @@ int chameleon_ipiv_init( CHAM_ipiv_t *ipiv, const CHAM_desc_t *desc, void *data
     ipiv->mt   = chameleon_ceil( ipiv->m, ipiv->mb );
 
     /* Create runtime specific structure like registering data */
-    RUNTIME_ipiv_create( ipiv );
+    RUNTIME_ipiv_create( ipiv, desc );
 
     return rc;
 }
@@ -91,9 +93,10 @@ int chameleon_ipiv_init( CHAM_ipiv_t *ipiv, const CHAM_desc_t *desc, void *data
  *          The pointer to the ipiv descriptor to destroy.
  *
  */
-void chameleon_ipiv_destroy( CHAM_ipiv_t *ipiv )
+void chameleon_ipiv_destroy( CHAM_ipiv_t       *ipiv,
+                             const CHAM_desc_t *desc )
 {
-    RUNTIME_ipiv_destroy( ipiv );
+    RUNTIME_ipiv_destroy( ipiv, desc );
 }
 
 /**
@@ -162,7 +165,8 @@ int CHAMELEON_Ipiv_Create( CHAM_ipiv_t **ipivptr, const CHAM_desc_t *desc, void
  * @retval CHAMELEON_SUCCESS successful exit
  *
  */
-int CHAMELEON_Ipiv_Destroy(CHAM_ipiv_t **ipivptr)
+int CHAMELEON_Ipiv_Destroy( CHAM_ipiv_t **ipivptr,
+                            const CHAM_desc_t *desc )
 {
     CHAM_context_t *chamctxt;
     CHAM_ipiv_t *ipiv;
@@ -179,7 +183,7 @@ int CHAMELEON_Ipiv_Destroy(CHAM_ipiv_t **ipivptr)
     }
 
     ipiv = *ipivptr;
-    chameleon_ipiv_destroy( ipiv );
+    chameleon_ipiv_destroy( ipiv, desc );
     free(ipiv);
     *ipivptr = NULL;
     return CHAMELEON_SUCCESS;
diff --git a/include/chameleon.h b/include/chameleon.h
index f1d335495..12c295a77 100644
--- a/include/chameleon.h
+++ b/include/chameleon.h
@@ -18,6 +18,8 @@
  * @author Florent Pruvost
  * @author Philippe Virouleau
  * @author Lionel Eyraud-Dubois
+ * @author Alycia Lisito
+ * @author Loris Lucido
  * @date 2024-03-16
  *
  */
@@ -214,11 +216,16 @@ int  CHAMELEON_Recursive_Desc_Create( CHAM_desc_t **descptr, void *mat, cham_flt
                                       blkaddr_fct_t get_blkaddr, blkldd_fct_t get_blkldd,
                                       blkrankof_fct_t get_rankof, void* get_rankof_arg );
 
-int CHAMELEON_Ipiv_Create ( CHAM_ipiv_t **ipivptr, const CHAM_desc_t *desc, void *data );
-int CHAMELEON_Ipiv_Destroy( CHAM_ipiv_t **ipivptr );
+int CHAMELEON_Ipiv_Create ( CHAM_ipiv_t       **ipivptr,
+                            const CHAM_desc_t  *desc,
+                            void               *data );
+int CHAMELEON_Ipiv_Destroy( CHAM_ipiv_t       **ipivptr,
+                            const CHAM_desc_t  *desc );
 int CHAMELEON_Ipiv_Flush  ( const CHAM_ipiv_t        *ipiv,
                             const RUNTIME_sequence_t *sequence );
-int CHAMELEON_Ipiv_Gather( CHAM_ipiv_t *ipivdesc, int *ipiv, int root );
+int CHAMELEON_Ipiv_Gather( CHAM_ipiv_t *ipivdesc,
+                           int         *ipiv,
+                           int          root );
 void CHAMELEON_Ipiv_Print ( const CHAM_ipiv_t *ipiv );
 
 /**
diff --git a/include/chameleon/runtime.h b/include/chameleon/runtime.h
index e64390f6c..52993c9a6 100644
--- a/include/chameleon/runtime.h
+++ b/include/chameleon/runtime.h
@@ -18,6 +18,7 @@
  * @author Samuel Thibault
  * @author Philippe Swartvagher
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
  * @date 2024-03-16
  *
  */
@@ -717,8 +718,10 @@ void RUNTIME_ddisplay_oneprofile (cham_tasktype_t task);
 void RUNTIME_sdisplay_allprofile ();
 void RUNTIME_sdisplay_oneprofile (cham_tasktype_t task);
 
-void RUNTIME_ipiv_create ( CHAM_ipiv_t *ipiv );
-void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv );
+void RUNTIME_ipiv_create ( CHAM_ipiv_t *ipiv,
+                          const CHAM_desc_t *desc );
+void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv,
+                           const CHAM_desc_t *desc );
 void RUNTIME_ipiv_gather ( const RUNTIME_sequence_t *sequence,
                            CHAM_ipiv_t *desc, int *ipiv, int node );
 
@@ -730,18 +733,18 @@ void RUNTIME_perm_flushk( const RUNTIME_sequence_t *sequence,
                           const CHAM_ipiv_t *ipiv, int m );
 
 void *RUNTIME_ipiv_getaddr   ( const CHAM_ipiv_t *ipiv, int m );
-void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h );
-void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h );
+void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int k, int h );
+void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int k, int h );
 void *RUNTIME_perm_getaddr   ( const CHAM_ipiv_t *ipiv, int m );
 void *RUNTIME_invp_getaddr   ( const CHAM_ipiv_t *ipiv, int m );
 
 static inline void *
-RUNTIME_pivot_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) {
+RUNTIME_pivot_getaddr( CHAM_ipiv_t *ipiv, int rank, int k, int h ) {
     if ( h%2 == 0 ) {
-        return RUNTIME_nextpiv_getaddr( ipiv, m, -1 );
+        return RUNTIME_nextpiv_getaddr( ipiv, rank, k, h );
     }
     else {
-        return RUNTIME_prevpiv_getaddr( ipiv, m, -1 );
+        return RUNTIME_prevpiv_getaddr( ipiv, rank, k, h );
     }
 }
 
diff --git a/include/chameleon/tasks.h b/include/chameleon/tasks.h
index aa21e99d8..99d70dbad 100644
--- a/include/chameleon/tasks.h
+++ b/include/chameleon/tasks.h
@@ -16,6 +16,7 @@
  * @author Cedric Augonnet
  * @author Florent Pruvost
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
  * @date 2024-03-16
  *
  */
@@ -165,7 +166,7 @@ void INSERT_TASK_hgemm( const RUNTIME_option_t *options,
 void INSERT_TASK_ipiv_init   ( const RUNTIME_option_t *options,
                                CHAM_ipiv_t *ipiv );
 void INSERT_TASK_ipiv_reducek( const RUNTIME_option_t *options,
-                               CHAM_ipiv_t *ws, int k, int h );
+                               CHAM_ipiv_t *ws, int k, int h, int rank );
 void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options,
                                int m0, int m, int k,
                                const CHAM_ipiv_t *ipivdesc, int ipivk );
diff --git a/runtime/starpu/codelets/codelet_ipiv.c b/runtime/starpu/codelets/codelet_ipiv.c
index 64e603139..e5dba252a 100644
--- a/runtime/starpu/codelets/codelet_ipiv.c
+++ b/runtime/starpu/codelets/codelet_ipiv.c
@@ -12,6 +12,7 @@
  * @version 1.3.0
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
  * @date 2024-03-16
  *
  */
@@ -62,13 +63,13 @@ void INSERT_TASK_ipiv_init( const RUNTIME_option_t *options,
 }
 
 void INSERT_TASK_ipiv_reducek( const RUNTIME_option_t *options,
-                               CHAM_ipiv_t *ipiv, int k, int h )
+                               CHAM_ipiv_t *ipiv, int k, int h, int rank )
 {
-    starpu_data_handle_t prevpiv = RUNTIME_pivot_getaddr( ipiv, k, h-1 );
+    starpu_data_handle_t prevpiv = RUNTIME_pivot_getaddr( ipiv, rank, k, h-1 );
 
 #if defined(HAVE_STARPU_MPI_REDUX) && defined(CHAMELEON_USE_MPI)
 #if !defined(HAVE_STARPU_MPI_REDUX_WRAPUP)
-    starpu_data_handle_t nextpiv = RUNTIME_pivot_getaddr( ipiv, k, h   );
+    starpu_data_handle_t nextpiv = RUNTIME_pivot_getaddr( ipiv, rank, k, h   );
     if ( h < ipiv->n ) {
         starpu_mpi_redux_data_prio_tree( options->sequence->comm, nextpiv,
                                          options->priority, 2 /* Binary tree */ );
diff --git a/runtime/starpu/codelets/codelet_zgetrf_batched.c b/runtime/starpu/codelets/codelet_zgetrf_batched.c
index 1d4cb37da..1ead5ec17 100644
--- a/runtime/starpu/codelets/codelet_zgetrf_batched.c
+++ b/runtime/starpu/codelets/codelet_zgetrf_batched.c
@@ -43,15 +43,16 @@ cl_zgetrf_panel_offdiag_batched_cpu_func( void *descr[],
                                           void *cl_arg )
 {
     struct cl_getrf_batched_args_t *clargs  = (struct cl_getrf_batched_args_t *) cl_arg;
-    cppi_interface_t               *nextpiv = (cppi_interface_t*) descr[0];
-    cppi_interface_t               *prevpiv = (cppi_interface_t*) descr[1];
+    cppi_interface_t               *nextpiv = (cppi_interface_t*) descr[ clargs->tasks_nbr ];
+    cppi_interface_t               *prevpiv = (cppi_interface_t*) descr[ clargs->tasks_nbr + 1 ];
     int                             i, m, n, h, m0, lda;
     CHAM_tile_t                    *tileA;
 
     nextpiv->h = clargs->h;
+    nextpiv->has_diag = chameleon_max( -1, nextpiv->has_diag );
 
     for ( i = 0; i < clargs->tasks_nbr; i++ ) {
-        tileA = cti_interface_get( descr[ i + 2 ] );
+        tileA = cti_interface_get( descr[ i ] );
         lda   = tileA->ld;
         m     = clargs->m[ i ];
         n     = clargs->n[ i ];
@@ -77,6 +78,7 @@ INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options,
     int          batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size;
     void (*callback)(void*) = NULL;
     struct cl_getrf_batched_args_t *clargs = *clargs_ptr;
+    int rankA = A->get_rankof( A, Am, An );
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
@@ -85,6 +87,7 @@ INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options,
 
     if ( clargs == NULL ) {
         clargs = malloc( sizeof( struct cl_getrf_batched_args_t ) ) ;
+        memset( clargs, 0, sizeof( struct cl_getrf_batched_args_t ) );
         clargs->tasks_nbr   = 0;
         clargs->h           = h;
         clargs->cl_name     = "zgetrf_panel_offdiag_batched";
@@ -104,13 +107,15 @@ INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options,
                                               A->get_blktile( A, Am, An ) );
 
     if ( clargs->tasks_nbr == batch_size ) {
+        int access_npiv = ( h == ipiv->n ) ? STARPU_R    : STARPU_REDUX;
+        int access_ppiv = ( h == 0 )       ? STARPU_NONE : STARPU_R;
         rt_starpu_insert_task(
             &cl_zgetrf_panel_offdiag_batched,
             /* Task codelet arguments */
             STARPU_CL_ARGS,           clargs, sizeof(struct cl_getrf_batched_args_t),
-            STARPU_REDUX,             RUNTIME_pivot_getaddr( ipiv, An, h   ),
-            STARPU_R,                 RUNTIME_pivot_getaddr( ipiv, An, h-1 ),
             STARPU_DATA_MODE_ARRAY,   clargs->handle_mode, clargs->tasks_nbr,
+            access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h   ),
+            access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ),
             STARPU_PRIORITY,          options->priority,
             STARPU_CALLBACK,          callback,
             STARPU_EXECUTE_ON_WORKER, options->workerid,
@@ -132,18 +137,21 @@ INSERT_TASK_zgetrf_panel_offdiag_batched_flush( const RUNTIME_option_t *options,
 {
     void (*callback)(void*) = NULL;
     struct cl_getrf_batched_args_t *clargs = *clargs_ptr;
+    int rankA = A->myrank;
 
     if ( clargs == NULL ) {
         return;
     }
+    int access_npiv = ( clargs->h == ipiv->n ) ? STARPU_R    : STARPU_REDUX;
+    int access_ppiv = ( clargs->h == 0 )       ? STARPU_NONE : STARPU_R;
 
     rt_starpu_insert_task(
         &cl_zgetrf_panel_offdiag_batched,
         /* Task codelet arguments */
         STARPU_CL_ARGS,           clargs, sizeof(struct cl_getrf_batched_args_t),
-        STARPU_REDUX,             RUNTIME_pivot_getaddr( ipiv, An, clargs->h   ),
-        STARPU_R,                 RUNTIME_pivot_getaddr( ipiv, An, clargs->h-1 ),
         STARPU_DATA_MODE_ARRAY,   clargs->handle_mode, clargs->tasks_nbr,
+        access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h   ),
+        access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h-1 ),
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
@@ -162,20 +170,27 @@ cl_zgetrf_panel_blocked_batched_cpu_func( void *descr[],
                                           void *cl_arg )
 {
     struct cl_getrf_batched_args_t *clargs  = ( struct cl_getrf_batched_args_t * ) cl_arg;
-    int                            *ipiv    = (int *)STARPU_VECTOR_GET_PTR(descr[clargs->tasks_nbr]);
-    cppi_interface_t               *nextpiv = (cppi_interface_t*) descr[clargs->tasks_nbr + 1];
-    cppi_interface_t               *prevpiv = (cppi_interface_t*) descr[clargs->tasks_nbr + 2];
+    int                            *ipiv;
+    cppi_interface_t               *nextpiv = (cppi_interface_t*) descr[clargs->tasks_nbr ];
+    cppi_interface_t               *prevpiv = (cppi_interface_t*) descr[clargs->tasks_nbr + 1];
     int                             i, h, ib;
     CHAM_tile_t                    *tileA, *tileU;
     CHAMELEON_Complex64_t          *U   = NULL;
     int                             ldu = -1;
 
     nextpiv->h = clargs->h;
+    nextpiv->has_diag = chameleon_max( -1, nextpiv->has_diag);
 
     h  = clargs->h;
     ib = clargs->ib;
     i  = 0;
     if ( clargs->diag ) {
+        if ( h == 0 ) {
+            ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[clargs->tasks_nbr + 1]);
+        }
+        else {
+            ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[clargs->tasks_nbr + 2]);
+        }
         if ( h != 0 ) {
             tileU = cti_interface_get( descr[ clargs->tasks_nbr + 3 ] );
             U     = CHAM_tile_get_ptr( tileU );
@@ -190,7 +205,7 @@ cl_zgetrf_panel_blocked_batched_cpu_func( void *descr[],
         i++;
     }
     if ( ( h%ib == 0 ) && ( h > 0 ) ) {
-        tileU = cti_interface_get( descr[ clargs->tasks_nbr + 3 ] );
+        tileU = cti_interface_get( descr[ clargs->tasks_nbr + 2 + clargs->diag ] );
         U     = CHAM_tile_get_ptr( tileU );
         ldu   = tileU->ld;
     }
@@ -225,6 +240,7 @@ INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options,
     void (*callback)(void*) = NULL;
     int accessU, access_npiv, access_ipiv, access_ppiv;
     struct cl_getrf_batched_args_t *clargs = *clargs_ptr;
+    int rankA = A->get_rankof(A, Am, An);
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
@@ -232,7 +248,8 @@ INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options,
     CHAMELEON_END_ACCESS_DECLARATION;
 
     if ( clargs == NULL ) {
-        clargs = malloc( sizeof( struct cl_getrf_batched_args_t ) ) ;
+        clargs = malloc( sizeof( struct cl_getrf_batched_args_t ) );
+        memset( clargs, 0, sizeof( struct cl_getrf_batched_args_t ) );
         clargs->tasks_nbr         = 0;
         clargs->diag              = ( Am == An );
         clargs->ib                = ib;
@@ -271,24 +288,25 @@ INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options,
         }
         /* If there isn't a diag task then use offdiag access */
         if ( clargs->diag == 0 ) {
-            accessU = ((h%ib == 0) && (h > 0)) ? STARPU_R : STARPU_NONE;
+            accessU     = ((h%ib == 0) && (h > 0)) ? STARPU_R : STARPU_NONE;
+            access_ipiv = STARPU_NONE;
         }
 
         rt_starpu_insert_task(
             &cl_zgetrf_panel_blocked_batched,
             /* Task codelet arguments */
             STARPU_CL_ARGS,           clargs, sizeof(struct cl_getrf_batched_args_t),
+            STARPU_DATA_MODE_ARRAY,   clargs->handle_mode, clargs->tasks_nbr,
+            access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h ),
+            access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ),
+            access_ipiv,              RUNTIME_ipiv_getaddr( ipiv, An ),
+            accessU,                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un ),
             STARPU_PRIORITY,          options->priority,
             STARPU_CALLBACK,          callback,
             STARPU_EXECUTE_ON_WORKER, options->workerid,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
             STARPU_NAME,              clargs->cl_name,
 #endif
-            STARPU_DATA_MODE_ARRAY,   clargs->handle_mode, clargs->tasks_nbr,
-            access_ipiv,              RUNTIME_ipiv_getaddr( ipiv, An ),
-            access_npiv,              RUNTIME_pivot_getaddr( ipiv, An, h ),
-            access_ppiv,              RUNTIME_pivot_getaddr( ipiv, An, h-1 ),
-            accessU,                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un ),
             0);
 
         /* clargs is freed by starpu. */
@@ -306,6 +324,7 @@ INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options,
     int accessU, access_npiv, access_ipiv, access_ppiv;
     void (*callback)(void*) = NULL;
     struct cl_getrf_batched_args_t *clargs = *clargs_ptr;
+    int rankA = A->myrank;
 
     if ( clargs == NULL ) {
         return;
@@ -328,24 +347,25 @@ INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options,
     }
     /* If there isn't a diag task then use offdiag access */
     if ( clargs->diag == 0 ) {
-        accessU = ((clargs->h%clargs->ib == 0) && (clargs->h > 0)) ? STARPU_R : STARPU_NONE;
+        accessU     = ((clargs->h%clargs->ib == 0) && (clargs->h > 0)) ? STARPU_R : STARPU_NONE;
+        access_ipiv = STARPU_NONE;
     }
 
     rt_starpu_insert_task(
         &cl_zgetrf_panel_blocked_batched,
         /* Task codelet arguments */
         STARPU_CL_ARGS,           clargs, sizeof(struct cl_getrf_batched_args_t),
+        STARPU_DATA_MODE_ARRAY,   clargs->handle_mode, clargs->tasks_nbr,
+        access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h ),
+        access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h - 1 ),
+        access_ipiv,              RUNTIME_ipiv_getaddr( ipiv, An ),
+        accessU,                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un ),
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME,              clargs->cl_name,
 #endif
-        STARPU_DATA_MODE_ARRAY,   clargs->handle_mode, clargs->tasks_nbr,
-        access_ipiv,              RUNTIME_ipiv_getaddr( ipiv, An ),
-        access_npiv,              RUNTIME_pivot_getaddr( ipiv, An, clargs->h ),
-        access_ppiv,              RUNTIME_pivot_getaddr( ipiv, An, clargs->h - 1 ),
-        accessU,                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un ),
         0);
 
     /* clargs is freed by starpu. */
diff --git a/runtime/starpu/codelets/codelet_zgetrf_blocked.c b/runtime/starpu/codelets/codelet_zgetrf_blocked.c
index 2c6daa18d..d11d27365 100644
--- a/runtime/starpu/codelets/codelet_zgetrf_blocked.c
+++ b/runtime/starpu/codelets/codelet_zgetrf_blocked.c
@@ -14,6 +14,7 @@
  *
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
  * @date 2024-03-11
  * @precisions normal z -> c d s
  *
@@ -67,6 +68,7 @@ static void cl_zgetrf_blocked_diag_cpu_func(void *descr[], void *cl_arg)
     nextpiv->h        = h;
     nextpiv->has_diag = 1;
 
+    coreblas_kernel_trace( tileA );
     CORE_zgetrf_panel_diag( m, n, h, m0, ib,
                             CHAM_tile_get_ptr( tileA ), tileA->ld,
                             U, ldu,
@@ -95,6 +97,7 @@ void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options,
     struct starpu_codelet *codelet = &cl_zgetrf_blocked_diag;
     void (*callback)(void*) = options->profiling ? cl_zgetrf_blocked_diag_callback : NULL;
     const char *cl_name = "zgetrf_blocked_diag";
+    int rankA           = A->get_rankof(A, Am, An);
 
     int access_ipiv = ( h == 0 )       ? STARPU_W    : STARPU_RW;
     int access_npiv = ( h == ipiv->n ) ? STARPU_R    : STARPU_REDUX;
@@ -130,25 +133,24 @@ void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options,
         STARPU_VALUE,             &ib,                  sizeof(int),
         STARPU_VALUE,             &(options->sequence), sizeof(RUNTIME_sequence_t*),
         STARPU_VALUE,             &(options->request),  sizeof(RUNTIME_request_t*),
+        STARPU_RW,                RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
+        access_ipiv,              RUNTIME_ipiv_getaddr( ipiv, An ),
+        access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h ),
+        access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ),
+        accessU,                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME,              cl_name,
 #endif
-        /* STARPU_NONE must be the last argument for older version of StarPU where STARPU_NONE = 0 */
-        STARPU_RW,                RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
-        access_ipiv,              RUNTIME_ipiv_getaddr( ipiv, An ),
-        access_npiv,              RUNTIME_pivot_getaddr( ipiv, An, h   ),
-        access_ppiv,              RUNTIME_pivot_getaddr( ipiv, An, h-1 ),
-        accessU,                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
         0);
 }
 
 #if !defined(CHAMELEON_SIMULATION)
 static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg)
 {
-    int                    m, n, h, m0, ib;
+    int                    m, n, h, k, m0, ib;
     RUNTIME_sequence_t    *sequence;
     RUNTIME_request_t     *request;
     CHAM_tile_t           *tileA;
@@ -156,9 +158,9 @@ static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg)
     cppi_interface_t      *nextpiv;
     cppi_interface_t      *prevpiv;
     CHAMELEON_Complex64_t *U   = NULL;
-    int                    ldu = -1;;
+    int                    ldu = -1;
 
-    starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &m0, &ib, &sequence, &request );
+    starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &k, &m0, &ib, &sequence, &request );
 
     tileA   = cti_interface_get(descr[0]);
     nextpiv = (cppi_interface_t*) descr[1];
@@ -169,12 +171,28 @@ static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg)
         ldu   = tileU->ld;
     }
 
+    if ( h > 0 ) {
+        cppi_display_dbg( prevpiv, stderr, "Prevpiv offdiag before call: " );
+    }
+    if ( h < tileA->n ) {
+        cppi_display_dbg( nextpiv, stderr, "Nextpiv offdiag before call: " );
+    }
+
     nextpiv->h = h; /* Initialize in case it uses a copy */
+    nextpiv->has_diag = chameleon_max( -1, nextpiv->has_diag);
 
+    coreblas_kernel_trace( tileA );
     CORE_zgetrf_panel_offdiag( m, n, h, m0, ib,
                                CHAM_tile_get_ptr(tileA), tileA->ld,
                                U, ldu,
                                &(nextpiv->pivot), &(prevpiv->pivot) );
+
+    if ( h > 0 ) {
+        cppi_display_dbg( prevpiv, stderr, "Prevpiv offdiag after call: " );
+    }
+    if ( h < tileA->n ) {
+        cppi_display_dbg( nextpiv, stderr, "Nextpiv offdiag after call: " );
+    }
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
@@ -190,9 +208,11 @@ void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options,
                                          CHAM_ipiv_t *ipiv )
 {
     struct starpu_codelet *codelet = &cl_zgetrf_blocked_offdiag;
+
     int access_npiv = ( h == ipiv->n ) ? STARPU_R    : STARPU_REDUX;
     int access_ppiv = ( h == 0 )       ? STARPU_NONE : STARPU_R;
     int accessU     = ((h%ib == 0) && (h > 0)) ? STARPU_R : STARPU_NONE;
+    int rankA       = A->get_rankof(A, Am, An);
 
     void (*callback)(void*) = options->profiling ? cl_zgetrf_blocked_offdiag_callback : NULL;
     const char *cl_name = "zgetrf_blocked_offdiag";
@@ -200,6 +220,9 @@ void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options,
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_RW( A, Am, An );
+    if ((h%ib == 0) && (h > 0)) {
+        CHAMELEON_ACCESS_R( U, Um, Un );
+    }
     CHAMELEON_END_ACCESS_DECLARATION;
 
     /* Refine name */
@@ -211,21 +234,21 @@ void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options,
         STARPU_VALUE,             &m,                   sizeof(int),
         STARPU_VALUE,             &n,                   sizeof(int),
         STARPU_VALUE,             &h,                   sizeof(int),
+        STARPU_VALUE,             &An,                  sizeof(int),
         STARPU_VALUE,             &m0,                  sizeof(int),
         STARPU_VALUE,             &ib,                  sizeof(int),
         STARPU_VALUE,             &(options->sequence), sizeof(RUNTIME_sequence_t *),
         STARPU_VALUE,             &(options->request),  sizeof(RUNTIME_request_t *),
+        STARPU_RW,                RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
+        access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h ),
+        access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ),
+        accessU,                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME,              cl_name,
 #endif
-        /* STARPU_NONE must be the last argument for older version of StarPU where STARPU_NONE = 0 */
-        STARPU_RW,                RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
-        access_npiv,              RUNTIME_pivot_getaddr( ipiv, An, h   ),
-        access_ppiv,              RUNTIME_pivot_getaddr( ipiv, An, h-1 ),
-        accessU,                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
         0);
 }
 
@@ -247,6 +270,8 @@ static void cl_zgetrf_blocked_trsm_cpu_func(void *descr[], void *cl_arg)
     U       = CHAM_tile_get_ptr( tileU );
     ldu     = tileU->ld;
 
+    coreblas_kernel_trace( tileU );
+
     /* Copy the final max line of the block and solve */
     cblas_zcopy( n, prevpiv->pivot.pivrow, 1,
                     U + m - 1, ldu );
@@ -276,6 +301,7 @@ void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options,
 
     void (*callback)(void*) = options->profiling ? cl_zgetrf_blocked_trsm_callback : NULL;
     const char *cl_name = "zgetrf_blocked_trsm";
+    int rankU = U->get_rankof(U, Um, Un);
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
@@ -293,7 +319,7 @@ void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options,
         STARPU_VALUE,             &h,                   sizeof(int),
         STARPU_VALUE,             &ib,                  sizeof(int),
         STARPU_RW,                RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
-        STARPU_R,                 RUNTIME_pivot_getaddr( ipiv, Un, h-1 ),
+        STARPU_R,                 RUNTIME_pivot_getaddr( ipiv, rankU, Un, h-1 ),
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
diff --git a/runtime/starpu/codelets/codelet_zgetrf_percol.c b/runtime/starpu/codelets/codelet_zgetrf_percol.c
index 5d3f83b6c..df2301782 100644
--- a/runtime/starpu/codelets/codelet_zgetrf_percol.c
+++ b/runtime/starpu/codelets/codelet_zgetrf_percol.c
@@ -14,6 +14,7 @@
  *
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
  * @date 2024-03-11
  * @precisions normal z -> c d s
  *
@@ -95,8 +96,7 @@ void INSERT_TASK_zgetrf_percol_diag( const RUNTIME_option_t *options,
     CHAMELEON_END_ACCESS_DECLARATION;
 
     /* Refine name */
-    cl_name = chameleon_codelet_name( cl_name, 1,
-                                      A->get_blktile( A, Am, An ) );
+    cl_name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) );
 
     rt_starpu_insert_task(
         codelet,
@@ -106,17 +106,16 @@ void INSERT_TASK_zgetrf_percol_diag( const RUNTIME_option_t *options,
         STARPU_VALUE,             &m0,                  sizeof(int),
         STARPU_VALUE,             &(options->sequence), sizeof(RUNTIME_sequence_t*),
         STARPU_VALUE,             &(options->request),  sizeof(RUNTIME_request_t*),
+        STARPU_RW,                RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
+        access_ipiv,              RUNTIME_ipiv_getaddr( ipiv, An ),
+        access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h   ),
+        access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ),
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME,              cl_name,
 #endif
-        /* STARPU_NONE must be the last argument for older version of StarPU where STARPU_NONE = 0 */
-        STARPU_RW,                RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
-        access_ipiv,              RUNTIME_ipiv_getaddr( ipiv, An ),
-        access_npiv,              RUNTIME_pivot_getaddr( ipiv, An, h   ),
-        access_ppiv,              RUNTIME_pivot_getaddr( ipiv, An, h-1 ),
         0);
 }
 
@@ -137,6 +136,7 @@ static void cl_zgetrf_percol_offdiag_cpu_func(void *descr[], void *cl_arg)
     prevpiv = (cppi_interface_t*) descr[2];
 
     nextpiv->h = h; /* Initialize in case it uses a copy */
+    nextpiv->has_diag = chameleon_max( -1, nextpiv->has_diag);
 
     CORE_zgetrf_panel_offdiag( m, n, h, m0, tileA->n,
                                CHAM_tile_get_ptr(tileA), tileA->ld,
@@ -159,6 +159,9 @@ void INSERT_TASK_zgetrf_percol_offdiag( const RUNTIME_option_t *options,
 
     void (*callback)(void*) = options->profiling ? cl_zgetrf_percol_offdiag_callback : NULL;
     const char *cl_name = "zgetrf_percol_offdiag";
+    int access_npiv = ( h == ipiv->n ) ? STARPU_R    : STARPU_REDUX;
+    int access_ppiv = ( h == 0 )       ? STARPU_NONE : STARPU_R;
+    int rankA       = A->get_rankof(A, Am, An);
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
@@ -166,8 +169,7 @@ void INSERT_TASK_zgetrf_percol_offdiag( const RUNTIME_option_t *options,
     CHAMELEON_END_ACCESS_DECLARATION;
 
     /* Refine name */
-    cl_name = chameleon_codelet_name( cl_name, 1,
-                                      A->get_blktile( A, Am, An ) );
+    cl_name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) );
 
     rt_starpu_insert_task(
         codelet,
@@ -178,8 +180,8 @@ void INSERT_TASK_zgetrf_percol_offdiag( const RUNTIME_option_t *options,
         STARPU_VALUE,             &(options->sequence), sizeof(RUNTIME_sequence_t *),
         STARPU_VALUE,             &(options->request),  sizeof(RUNTIME_request_t *),
         STARPU_RW,                RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
-        STARPU_REDUX,             RUNTIME_pivot_getaddr( ipiv, An, h   ),
-        STARPU_R,                 RUNTIME_pivot_getaddr( ipiv, An, h-1 ),
+        access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h   ),
+        access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ),
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
diff --git a/runtime/starpu/control/runtime_descriptor_ipiv.c b/runtime/starpu/control/runtime_descriptor_ipiv.c
index 48be66e17..1ad0f7a14 100644
--- a/runtime/starpu/control/runtime_descriptor_ipiv.c
+++ b/runtime/starpu/control/runtime_descriptor_ipiv.c
@@ -12,6 +12,8 @@
  * @version 1.3.0
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
+ * @author Florent Pruvost
  * @date 2024-03-16
  *
  */
@@ -20,16 +22,18 @@
 /**
  *  Create ws_pivot runtime structures
  */
-void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv )
+void RUNTIME_ipiv_create( CHAM_ipiv_t       *ipiv,
+                          const CHAM_desc_t *desc )
 {
     assert( ipiv );
-    starpu_data_handle_t *handles = calloc( 5 * ipiv->mt, sizeof(starpu_data_handle_t) );
+    size_t                nbhandles = 3 * ipiv->mt + 2 * desc->p;
+    starpu_data_handle_t *handles   = calloc( nbhandles, sizeof(starpu_data_handle_t) );
     ipiv->ipiv    = handles;
     handles += ipiv->mt;
     ipiv->nextpiv = handles;
-    handles += ipiv->mt;
+    handles += desc->p;
     ipiv->prevpiv = handles;
-    handles += ipiv->mt;
+    handles += desc->p;
     ipiv->perm    = handles;
     handles += ipiv->mt;
     ipiv->invp    = handles;
@@ -40,14 +44,14 @@ void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv )
      */
     {
         chameleon_starpu_tag_init();
-        ipiv->mpitag_ipiv = chameleon_starpu_tag_book( (int64_t)(ipiv->mt) * 5 );
+        ipiv->mpitag_ipiv = chameleon_starpu_tag_book( nbhandles );
         if ( ipiv->mpitag_ipiv == -1 ) {
             chameleon_fatal_error("RUNTIME_ipiv_create", "Can't pursue computation since no more tags are available for ipiv structure");
             return;
         }
         ipiv->mpitag_nextpiv = ipiv->mpitag_ipiv    + ipiv->mt;
-        ipiv->mpitag_prevpiv = ipiv->mpitag_nextpiv + ipiv->mt;
-        ipiv->mpitag_perm    = ipiv->mpitag_prevpiv + ipiv->mt;
+        ipiv->mpitag_prevpiv = ipiv->mpitag_nextpiv + desc->p;
+        ipiv->mpitag_perm    = ipiv->mpitag_prevpiv + desc->p;
         ipiv->mpitag_invp    = ipiv->mpitag_perm    + ipiv->mt;
     }
 #endif
@@ -56,12 +60,14 @@ void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv )
 /**
  *  Destroy ws_pivot runtime structures
  */
-void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv )
+void RUNTIME_ipiv_destroy( CHAM_ipiv_t       *ipiv,
+                           const CHAM_desc_t *desc )
 {
     int                   i;
     starpu_data_handle_t *handle = (starpu_data_handle_t*)(ipiv->ipiv);
+    size_t                nbhandles = 3 * ipiv->mt + 2 * desc->p;
 
-    for(i=0; i<(5 * ipiv->mt); i++) {
+    for(i=0; i<nbhandles; i++) {
         if ( *handle != NULL ) {
             starpu_data_unregister( *handle );
             *handle = NULL;
@@ -107,49 +113,51 @@ void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m )
     return (void*)(*handle);
 }
 
-void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h )
+void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int k, int h )
 {
     starpu_data_handle_t *nextpiv = (starpu_data_handle_t*)(ipiv->nextpiv);
-    int64_t mm = m + (ipiv->i / ipiv->mb);
+    const CHAM_desc_t *A = ipiv->desc;
 
-    nextpiv += mm;
+    nextpiv += rank/A->q; /* handle slot is rank/A->q; k and h do not take part in the slot selection */
     assert( nextpiv );
 
     if ( *nextpiv != NULL ) {
         return (void*)(*nextpiv);
     }
 
-    const CHAM_desc_t *A = ipiv->desc;
-    int     owner = A->get_rankof( A, m, m );
-    int     ncols = (mm == (A->nt-1)) ? A->n - mm * A->nb : A->nb;
-    int64_t tag   = ipiv->mpitag_nextpiv + mm;
+    int64_t kk    = k + (ipiv->i / ipiv->mb); /* k offset by ipiv->i / ipiv->mb; only used to compute ncols below */
+    int     owner = rank; /* the handle is registered with the rank given by the caller as owner */
+    int     ncols = (kk == (A->nt-1)) ? A->n - kk * A->nb : A->nb; /* NOTE(review): only evaluated on the first registration; the cached handle is returned as-is for any later k - confirm the size still fits */
+    int64_t tag   = ipiv->mpitag_nextpiv + owner/A->q; /* tag offset uses the same rank/A->q index as the handle slot */
 
     cppi_register( nextpiv, A->dtyp, ncols, tag, owner );
 
     assert( *nextpiv );
+    (void)h; /* h is not used anywhere else in this function */
     return (void*)(*nextpiv);
 }
 
-void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h )
+void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int rank, int k, int h )
 {
     starpu_data_handle_t *prevpiv = (starpu_data_handle_t*)(ipiv->prevpiv);
-    int64_t mm = m + (ipiv->i / ipiv->mb);
+    const CHAM_desc_t *A = ipiv->desc;
 
-    prevpiv += mm;
+    prevpiv += rank/A->q; /* handle slot is rank/A->q; k and h do not take part in the slot selection */
     assert( prevpiv );
 
     if ( *prevpiv != NULL ) {
         return (void*)(*prevpiv);
     }
 
-    const CHAM_desc_t *A = ipiv->desc;
-    int     owner = A->get_rankof( A, m, m );
-    int     ncols = (mm == (A->nt-1)) ? A->n - mm * A->nb : A->nb;
-    int64_t tag   = ipiv->mpitag_prevpiv + mm;
+    int64_t kk    = k + (ipiv->i / ipiv->mb); /* k offset by ipiv->i / ipiv->mb; only used to compute ncols below */
+    int     owner = rank; /* the handle is registered with the rank given by the caller as owner */
+    int     ncols = (kk == (A->nt-1)) ? A->n - kk * A->nb : A->nb; /* NOTE(review): only evaluated on the first registration; the cached handle is returned as-is for any later k - confirm the size still fits */
+    int64_t tag   = ipiv->mpitag_prevpiv + owner/A->q; /* tag offset uses the same rank/A->q index as the handle slot */
 
     cppi_register( prevpiv, A->dtyp, ncols, tag, owner );
 
     assert( *prevpiv );
+    (void)h; /* h is not used anywhere else in this function */
     return (void*)(*prevpiv);
 }
 
@@ -212,19 +220,18 @@ void *RUNTIME_invp_getaddr( const CHAM_ipiv_t *ipiv, int m )
 }
 
 void RUNTIME_ipiv_flushk( const RUNTIME_sequence_t *sequence,
-                          const CHAM_ipiv_t *ipiv, int m )
+                          const CHAM_ipiv_t *ipiv, int rank )
 {
     starpu_data_handle_t *handle;
     const CHAM_desc_t *A = ipiv->desc;
-    int64_t mm = m + ( ipiv->i / ipiv->mb );
 
     handle = (starpu_data_handle_t*)(ipiv->nextpiv);
-    handle += mm;
+    handle += rank/A->q;
 
     if ( *handle != NULL ) {
 #if defined(CHAMELEON_USE_MPI)
         starpu_mpi_cache_flush( sequence->comm, *handle );
-        if ( starpu_mpi_data_get_rank( *handle ) == A->myrank )
+        if ( starpu_mpi_data_get_rank( *handle ) == rank )
 #endif
         {
             chameleon_starpu_data_wont_use( *handle );
@@ -232,12 +239,12 @@ void RUNTIME_ipiv_flushk( const RUNTIME_sequence_t *sequence,
     }
 
     handle = (starpu_data_handle_t*)(ipiv->prevpiv);
-    handle += mm;
+    handle += rank/A->q;
 
     if ( *handle != NULL ) {
 #if defined(CHAMELEON_USE_MPI)
         starpu_mpi_cache_flush( sequence->comm, *handle );
-        if ( starpu_mpi_data_get_rank( *handle ) == A->myrank )
+        if ( starpu_mpi_data_get_rank( *handle ) == rank )
 #endif
         {
             chameleon_starpu_data_wont_use( *handle );
@@ -246,7 +253,7 @@ void RUNTIME_ipiv_flushk( const RUNTIME_sequence_t *sequence,
 
     (void)sequence;
     (void)ipiv;
-    (void)m;
+    (void)rank;
 }
 
 void RUNTIME_ipiv_flush( const RUNTIME_sequence_t *sequence,
diff --git a/testing/testing_zgetrf.c b/testing/testing_zgetrf.c
index dc978bc6f..4645631a7 100644
--- a/testing/testing_zgetrf.c
+++ b/testing/testing_zgetrf.c
@@ -151,8 +151,8 @@ testing_zgetrf_desc( run_arg_list_t *args, int check )
         CHAMELEON_zgetrf_WS_Free( ws );
     }
 
+    CHAMELEON_Ipiv_Destroy( &descIPIV, descA );
     parameters_desc_destroy( &descA );
-    CHAMELEON_Ipiv_Destroy( &descIPIV );
 
     return hres;
 }
-- 
GitLab