diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c
index 1060432565ee5d581cd82bc74da7c53df0829383..94dee2b5b617dc8d8bb4f247694afdb897226bfa 100644
--- a/compute/pzgetrf.c
+++ b/compute/pzgetrf.c
@@ -16,7 +16,7 @@
  * @author Mathieu Faverge
  * @author Emmanuel Agullo
  * @author Matthieu Kuhn
- * @date 2023-08-22
+ * @date 2023-08-31
  * @precisions normal z -> s d c
  *
  */
@@ -154,6 +154,7 @@ chameleon_pzgetrf_panel_facto_percol( struct chameleon_pzgetrf_s *ws,
     }
 
     /* Flush temporary data used for the pivoting */
+    INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k );
     RUNTIME_ipiv_flushk( options->sequence, ipiv, k );
 }
 
@@ -191,20 +192,59 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws,
 static inline void
 chameleon_pzgetrf_panel_permute( struct chameleon_pzgetrf_s *ws,
                                  CHAM_desc_t                *A,
+                                 CHAM_ipiv_t                *ipiv,
                                  int                         k,
                                  int                         n,
                                  RUNTIME_option_t           *options )
 {
-    (void)ws;
-    (void)A;
-    (void)k;
-    (void)n;
-    (void)options;
+    switch( ws->alg ) {
+    case ChamGetrfPPiv:
+        chameleon_attr_fallthrough;
+    case ChamGetrfPPivPerColumn:
+    {
+        int m;
+        int tempkm, tempkn, tempnn, minmn;
+
+        tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
+        tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
+        tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
+        minmn  = chameleon_min( tempkm, tempkn );
+
+        /* Copy the whole tile A(k, n) into the workspace tile U(k, n) */
+        INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn,
+                            A(k, n), U(k, n) );
+
+        /*
+         * The perm array is used with size tempkm (not minmn) for this first tile row.
+         * Otherwise, the final copy back to the tile may copy only a partial tile
+         */
+        INSERT_TASK_zlaswp_get( options, k*A->mb, tempkm,
+                                ipiv, k, A(k, n), U(k, n) );
+
+        for(m=k+1; m<A->mt; m++){
+            /* Extract selected rows from A(m, n) into U(k, n) */
+            INSERT_TASK_zlaswp_get( options, m*A->mb, minmn,
+                                    ipiv, k, A(m, n), U(k, n) );
+            /* Copy rows from A(k, n) into their final position in A(m, n) */
+            INSERT_TASK_zlaswp_set( options, m*A->mb, minmn,
+                                    ipiv, k, A(k, n), A(m, n) );
+        }
+
+        INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn,
+                            U(k, n), A(k, n) );
+
+        RUNTIME_data_flush( options->sequence, U(k, n) );
+    }
+    break;
+    default:
+        ;
+    }
 }
 
 static inline void
 chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws,
                                 CHAM_desc_t                *A,
+                                CHAM_ipiv_t                *ipiv,
                                 int                         k,
                                 int                         n,
                                 RUNTIME_option_t           *options )
@@ -217,7 +257,7 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws,
     tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
     tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
 
-    chameleon_pzgetrf_panel_permute( ws, A, k, n, options );
+    chameleon_pzgetrf_panel_permute( ws, A, ipiv, k, n, options );
 
     INSERT_TASK_ztrsm(
         options,
@@ -270,7 +310,7 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws,
 
         for (n = k+1; n < A->nt; n++) {
             options.priority = A->nt-n;
-            chameleon_pzgetrf_panel_update( ws, A, k, n, &options );
+            chameleon_pzgetrf_panel_update( ws, A, IPIV, k, n, &options );
         }
 
         /* Flush panel k */
@@ -284,11 +324,12 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws,
     /* Backward pivoting */
     for (k = 1; k < min_mnt; k++) {
         for (n = 0; n < k; n++) {
-            chameleon_pzgetrf_panel_permute( ws, A, k, n, &options );
+            chameleon_pzgetrf_panel_permute( ws, A, IPIV, k, n, &options );
         }
+        RUNTIME_perm_flushk( sequence, IPIV, k );
     }
 
-    /* Initialize IPIV */
+    /* Initialize IPIV with default values if needed */
     if ( (ws->alg == ChamGetrfNoPivPerColumn) ||
          (ws->alg == ChamGetrfNoPiv ) )
     {
diff --git a/compute/zgetrf.c b/compute/zgetrf.c
index 73c810be2c1f3294583b6599aff49e726f8f049d..98d5f0e08f7d14470a81c1898c0a2d096f73fdf9 100644
--- a/compute/zgetrf.c
+++ b/compute/zgetrf.c
@@ -19,7 +19,7 @@
  * @author Florent Pruvost
  * @author Matthieu Kuhn
  * @author Lionel Eyraud-Dubois
- * @date 2023-08-22
+ * @date 2023-08-31
  *
  * @precisions normal z -> s d c
  *
@@ -95,6 +95,15 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A )
                              A->mt, A->nt * A->nb, A->p, A->q,
                              NULL, NULL, A->get_rankof_init, A->get_rankof_init_arg );
     }
+    else if ( ( ws->alg == ChamGetrfPPiv ) ||
+              ( ws->alg == ChamGetrfPPivPerColumn ) )
+    {
+        chameleon_desc_init( &(ws->U), CHAMELEON_MAT_ALLOC_TILE,
+                             ChamComplexDouble, A->mb, A->nb, A->mb*A->nb,
+                             A->m, A->n, 0, 0,
+                             A->m, A->n, A->p, A->q,
+                             NULL, NULL, A->get_rankof_init, A->get_rankof_init_arg );
+    }
 
     /* Set ib to 1 if per column algorithm */
     if ( ( ws->alg == ChamGetrfNoPivPerColumn ) ||
@@ -130,7 +139,10 @@ CHAMELEON_zgetrf_WS_Free( void *user_ws )
 {
     struct chameleon_pzgetrf_s *ws = (struct chameleon_pzgetrf_s *)user_ws;
 
-    if ( ws->alg == ChamGetrfNoPivPerColumn ) {
+    if ( ( ws->alg == ChamGetrfNoPivPerColumn ) ||
+         ( ws->alg == ChamGetrfPPiv           ) ||
+         ( ws->alg == ChamGetrfPPivPerColumn  ) )
+    {
         chameleon_desc_destroy( &(ws->U) );
     }
     free( ws );
diff --git a/coreblas/compute/CMakeLists.txt b/coreblas/compute/CMakeLists.txt
index 7f89dc29ed96d240f310ac7032ba98d17fe2d90c..bec6c5aaf474aae765f84ac8f60e6d00c950fc62 100644
--- a/coreblas/compute/CMakeLists.txt
+++ b/coreblas/compute/CMakeLists.txt
@@ -17,14 +17,14 @@
 #     Univ. of California Berkeley,
 #     Univ. of Colorado Denver.
 #
-# @version 1.2.0
+# @version 1.3.0
 #  @author Cedric Castagnede
 #  @author Emmanuel Agullo
 #  @author Mathieu Faverge
 #  @author Florent Pruvost
 #  @author Guillaume Sylvand
 #  @author Matthieu Kuhn
-#  @date 2022-02-22
+#  @date 2023-08-31
 #
 ###
 
@@ -68,8 +68,9 @@ set(ZSRC
     core_zlanhe.c
     core_zlansy.c
     core_zlantr.c
-    core_zlaset2.c
     core_zlaset.c
+    core_zlaset2.c
+    core_zlaswp.c
     core_zlatro.c
     core_zlauum.c
     core_zpamm.c
@@ -132,9 +133,10 @@ precisions_rules_py(COREBLAS_SRCS_GENERATED "${ZSRC}"
                     PRECISIONS "${CHAMELEON_PRECISION}")
 
 set(COREBLAS_SRCS
-    global.c
-    ${COREBLAS_SRCS_GENERATED}
-    )
+  global.c
+  core_ipiv_to_perm.c
+  ${COREBLAS_SRCS_GENERATED}
+)
 
 # Force generation of sources
 # ---------------------------
diff --git a/coreblas/compute/core_ipiv_to_perm.c b/coreblas/compute/core_ipiv_to_perm.c
new file mode 100644
index 0000000000000000000000000000000000000000..290d1d1f801ebb9e5e29afcb76bdec8cb999f07a
--- /dev/null
+++ b/coreblas/compute/core_ipiv_to_perm.c
@@ -0,0 +1,97 @@
+/**
+ *
+ * @file core_ipiv_to_perm.c
+ *
+ * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon core_ipiv_to_perm CPU kernel
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @date 2023-08-31
+ */
+#include "coreblas.h"
+
+/**
+ *******************************************************************************
+ *
+ * The idea here is to generate a permutation from the sequence of
+ * pivot.  To avoid storing one whole column at each step, we keep
+ * track of two vectors of nb elements, the first one contains the
+ * permutation of the first nb elements, and the second one contains
+ * the inverse permutation of those same elements.
+ *
+ * Let i be the element to pivot with ip, i.e. ipiv[i] = ip.
+ * We define i_1 such that invp[ i_1  ] = i
+ *  and  ip_1 such that invp[ ip_1 ] = ip
+ *
+ * At each step we want to:
+ *   - swap perm[i] and perm[ip]
+ *   - set invp[i_1] to ip
+ *   - set invp[ip_1] to i
+ *
+ *******************************************************************************
+ *
+ * @param[in] m0
+ *          The base index for all values in ipiv, perm and invp. m0 >= 0.
+ *
+ * @param[in] m
+ *          The number of elements in perm and invp. m >= 0.
+ *
+ * @param[in] k
+ *          The number of elements in ipiv. k >= 0.
+ *
+ * @param[in] ipiv
+ *          The pivot array of size k. This is a (m0+1)-based indices array to follow
+ *          the Fortran standard.
+ *
+ * @param[out] perm
+ *          The permutation array of the destination row indices (m0-based) of the m tracked rows.
+ *
+ * @param[out] invp
+ *          The permutation array of the origin row indices (m0-based) of the m tracked rows.
+ *
+ */
+void CORE_ipiv_to_perm( int m0, int m, int k, int *ipiv, int *perm, int *invp )
+{
+    int i, j, ip;
+    int i_1, ip_1;
+
+    for(i=0; i < m; i++) { /* Both arrays start as the identity, shifted by m0 */
+        perm[i] = i + m0;
+        invp[i] = i + m0;
+    }
+
+    for(i = 0; i < k; i++) {
+        ip = ipiv[i]-1; /* ipiv is 1-based: convert to a 0-based index */
+        assert( ip - m0 >= i );
+
+        if ( ip - m0 > i ) {
+            /* Row i is exchanged with a different row ip; nothing is done when they are equal */
+            i_1 = perm[i];
+            /* Use the perm entry of ip when ip lies inside the m tracked rows */
+            if (ip-m0 < m) {
+                ip_1 = perm[ip-m0];
+                perm[ip-m0] = i_1;
+            } else {
+                ip_1 = ip; /* ip is beyond the tracked rows: keep ip unless invp already refers to it */
+                for(j=0; j < m; j++) {
+                    if( invp[j] == ip ) {
+                        ip_1 = j + m0;
+                        break;
+                    }
+                }
+            }
+            /* Finish the swap in perm, then update invp for local indices below m */
+            perm[i] = ip_1;
+            i_1  -= m0;
+            ip_1 -= m0;
+
+            if (i_1  < m) invp[i_1 ] = ip;
+            if (ip_1 < m) invp[ip_1] = i + m0;
+        }
+    }
+}
diff --git a/coreblas/compute/core_zgetrf.c b/coreblas/compute/core_zgetrf.c
index 3c65462504d3792ca61a48d423b99b9efff0d89d..3089359dafb7dad2c4472cc0a03b527288e337f9 100644
--- a/coreblas/compute/core_zgetrf.c
+++ b/coreblas/compute/core_zgetrf.c
@@ -19,14 +19,13 @@
  * @author Cedric Castagnede
  * @author Florent Pruvost
  * @author Matthieu Kuhn
- * @date 2023-07-26
+ * @date 2023-08-31
  * @precisions normal z -> c d s
  *
  */
 #include "coreblas/lapacke.h"
 #include "coreblas.h"
 
-
 int CORE_zgetrf( int m, int n,
                  CHAMELEON_Complex64_t *A, int lda,
                  int *IPIV, int *info )
diff --git a/coreblas/compute/core_zgetrf_panel.c b/coreblas/compute/core_zgetrf_panel.c
index 68911699b39b62aa2e12007048bab72311a620f6..f3467b1a3f750b0d87263fa2acda337ac3c00820 100644
--- a/coreblas/compute/core_zgetrf_panel.c
+++ b/coreblas/compute/core_zgetrf_panel.c
@@ -134,13 +134,7 @@ CORE_zgetrf_panel_diag( int m, int n, int h, int m0,
             cblas_zscal( m-h, CBLAS_SADDR( alpha ), L, 1 );
         }
 
-        /*
-         * h is compared only to n, because if we are on the last column of a
-         * tile, m might be much smaller than n, and still we need to apply
-         * the geru call. If this is the diagonal tile, we will just look for
-         * the next maximum for nothing.
-         */
-        if ( h < n ) {
+        if ( h < chameleon_min( m, n ) ) {
             /* Applying the update */
             cblas_zgeru(CblasColMajor, m-h, n-h,
                         CBLAS_SADDR(mzone),
diff --git a/coreblas/compute/core_zlaswp.c b/coreblas/compute/core_zlaswp.c
new file mode 100644
index 0000000000000000000000000000000000000000..e28c82b1785c778b704eccab116b2be81bd93859
--- /dev/null
+++ b/coreblas/compute/core_zlaswp.c
@@ -0,0 +1,223 @@
+/**
+ *
+ * @file core_zlaswp.c
+ *
+ * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon core_zlaswp_get and core_zlaswp_set CPU kernels (row copies for partial pivoting)
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @author Matthieu Kuhn
+ * @date 2023-08-31
+ * @precisions normal z -> c d s
+ *
+ */
+#include "coreblas/lapacke.h"
+#include "coreblas.h"
+
+/**
+ ******************************************************************************
+ *
+ * @ingroup CORE_CHAMELEON_Complex64_t
+ *
+ * CORE_zlaswp_get extracts the rows from the tile A that have been selected as
+ * pivot into the tile B.
+ *
+ *******************************************************************************
+ *
+ * @param[in] m0
+ *         The index of the first row of the tile A into the larger matrix it
+ *         belongs to.
+ *
+ * @param[in] m
+ *          The number of rows of the matrix A.
+ *
+ * @param[in] n
+ *         The number of columns of the matrices A and B.
+ *
+ * @param[in] k
+ *         The number of rows of the matrix B. This is the number of potential
+ *         pivot that can be extracted from A.
+ *
+ * @param[in] A
+ *          On entry, the matrix A of dimension lda-by-n where to extract the
+ *          pivot rows if some are selected in the range m0..m0+m.
+ *
+ * @param[in] lda
+ *          The leading dimension of the array A. lda >= max(1,m).
+ *
+ * @param[inout] B
+ *          On entry, a matrix of size ldb-by-n with 0s or already collected
+ *          rows.
+ *          On exit, B is filled with the selected rows from A, such that for
+ *          each row i with 0 <= perm[i]-m0 < m, B[i] = A[perm[i]-m0].
+ *
+ * @param[in] ldb
+ *          The leading dimension of the array B. ldb >= max(1,k).
+ *
+ * @param[in] perm
+ *          The permutation array of dimension k.
+ *
+ *******************************************************************************
+ *
+ * @retval CHAMELEON_SUCCESS successful exit
+ * @retval <0 if -i, the i-th argument had an illegal value
+ *
+ */
+int
+CORE_zlaswp_get( int m0, int m, int n, int k,
+                 const CHAMELEON_Complex64_t *A, int lda,
+                 CHAMELEON_Complex64_t       *B, int ldb,
+                 const int *perm )
+{
+    int i;
+
+    /* Check input arguments: the returned -i is the position of the faulty argument */
+    if (m0 < 0) {
+        coreblas_error(1, "Illegal value of m0");
+        return -1;
+    }
+    if (m < 0) {
+        coreblas_error(2, "Illegal value of m");
+        return -2;
+    }
+    if (n < 0) {
+        coreblas_error(3, "Illegal value of n");
+        return -3;
+    }
+    if (k < 0) {
+        coreblas_error(4, "Illegal value of k");
+        return -4;
+    }
+    if ((lda < chameleon_max(1,m)) && (m > 0)) {
+        coreblas_error(6, "Illegal value of lda");
+        return -6;
+    }
+    if ((ldb < chameleon_max(1,k)) && (k > 0)) {
+        coreblas_error(8, "Illegal value of ldb");
+        return -8;
+    }
+
+    /* Quick return: nothing to copy when one of the dimensions is zero */
+    if ((m == 0) || (n == 0) || (k == 0)) {
+        return CHAMELEON_SUCCESS;
+    }
+
+    for( i=0; i<k; i++ )
+    {
+        int idx = perm[i] - m0; /* Row of A, relative to m0, that is copied to row i of B */
+
+        if ( ( idx >= 0 ) && (idx < m ) ) /* Skip entries that fall outside the m rows of A */
+        {
+            cblas_zcopy( n, A + idx, lda,
+                            B + i,   ldb );
+        }
+    }
+
+    return CHAMELEON_SUCCESS;
+}
+
+/**
+ ******************************************************************************
+ *
+ * @ingroup CORE_CHAMELEON_Complex64_t
+ *
+ * CORE_zlaswp_set copies the rows from the tile A into the tile B when they are
+ * the destination of the pivoted rows.
+ *
+ *******************************************************************************
+ *
+ * @param[in] m0
+ *         The index of the first row of the tile B into the larger matrix it
+ *         belongs to.
+ *
+ * @param[in] m
+ *          The number of rows of the matrix B.
+ *
+ * @param[in] n
+ *         The number of columns of the matrices A and B.
+ *
+ * @param[in] k
+ *         The number of rows of the matrix A. This is the number of potential
+ *         pivot that can be inserted into B.
+ *
+ * @param[in] A
+ *          On entry, the matrix A of dimension lda-by-n where to read the
+ *          pivoted rows.
+ *
+ * @param[in] lda
+ *          The leading dimension of the array A. lda >= max(1,k).
+ *
+ * @param[inout] B
+ *          On entry, a matrix of size ldb-by-n that may require some pivoted rows.
+ *          On exit, B is updated with the pivoted rows it needs to receive, such that for
+ *          each row i with 0 <= invp[i]-m0 < m, B[invp[i]-m0] = A[i].
+ *
+ * @param[in] ldb
+ *          The leading dimension of the array B. ldb >= max(1,m).
+ *
+ * @param[in] invp
+ *          The inverse permutation array of dimension k.
+ *
+ *******************************************************************************
+ *
+ * @retval CHAMELEON_SUCCESS successful exit
+ * @retval <0 if -i, the i-th argument had an illegal value
+ *
+ */
+int
+CORE_zlaswp_set( int m0, int m, int n, int k,
+                 const CHAMELEON_Complex64_t *A, int lda,
+                 CHAMELEON_Complex64_t *B, int ldb,
+                 const int *invp )
+{
+    int i;
+
+    /* Check input arguments: the returned -i is the position of the faulty argument */
+    if (m0 < 0) {
+        coreblas_error(1, "Illegal value of m0");
+        return -1;
+    }
+    if (m < 0) {
+        coreblas_error(2, "Illegal value of m");
+        return -2;
+    }
+    if (n < 0) {
+        coreblas_error(3, "Illegal value of n");
+        return -3;
+    }
+    if (k < 0) {
+        coreblas_error(4, "Illegal value of k");
+        return -4;
+    }
+    if ((lda < chameleon_max(1,k)) && (k > 0)) {
+        coreblas_error(6, "Illegal value of lda");
+        return -6;
+    }
+    if ((ldb < chameleon_max(1,m)) && (m > 0)) {
+        coreblas_error(8, "Illegal value of ldb");
+        return -8;
+    }
+
+    /* Quick return: nothing to copy when one of the dimensions is zero */
+    if ((m == 0) || (n == 0) || (k == 0)) {
+        return CHAMELEON_SUCCESS;
+    }
+
+    for( i=0; i<k; i++ )
+    {
+        int idx = invp[i] - m0; /* Row of B, relative to m0, that receives row i of A */
+
+        if ( ( idx >= 0 ) && (idx < m ) ) /* Skip entries that fall outside the m rows of B */
+        {
+            cblas_zcopy( n, A + i,   lda,
+                            B + idx, ldb );
+        }
+    }
+
+    return CHAMELEON_SUCCESS;
+}
diff --git a/coreblas/compute/core_ztile.c b/coreblas/compute/core_ztile.c
index 6de9b2c63cd546ddb8216fa0f5c0bdfd522fdb1d..0383290fa13b9b02bb321c939d73fa72720ad2ad 100644
--- a/coreblas/compute/core_ztile.c
+++ b/coreblas/compute/core_ztile.c
@@ -9,11 +9,11 @@
  *
  * @brief Chameleon CPU kernel interface from CHAM_tile_t layout to the real one.
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Mathieu Faverge
  * @author Florent Pruvost
  * @author Alycia Lisito
- * @date 2022-02-22
+ * @date 2023-08-31
  * @precisions normal z -> c d s
  *
  */
@@ -464,6 +464,24 @@ TCORE_zlaset2( cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, CH
     CORE_zlaset2( uplo, n1, n2, alpha, CHAM_tile_get_ptr( A ), A->ld );
 }
 
+int
+TCORE_zlaswp_get( int m0, int m, int n, int k, CHAM_tile_t *A, CHAM_tile_t *B, const int *perm )
+{
+    coreblas_kernel_trace( A, B );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return CORE_zlaswp_get( m0, m, n, k, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld, perm );
+}
+
+int
+TCORE_zlaswp_set( int m0, int m, int n, int k, CHAM_tile_t *A, CHAM_tile_t *B, const int *invp )
+{
+    coreblas_kernel_trace( A, B );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return CORE_zlaswp_set( m0, m, n, k, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld, invp );
+}
+
 int
 TCORE_zlatro( cham_uplo_t        uplo,
               cham_trans_t       trans,
diff --git a/coreblas/compute/core_ztile_empty.c b/coreblas/compute/core_ztile_empty.c
index c5055a5084520992a023759b3126eb5171f0314a..036c8326311cb1646690e9becc3c539158e33036 100644
--- a/coreblas/compute/core_ztile_empty.c
+++ b/coreblas/compute/core_ztile_empty.c
@@ -9,10 +9,10 @@
  *
  * @brief Chameleon CPU kernel interface from CHAM_tile_t layout to the real one.
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Mathieu Faverge
  * @author Alycia Lisito
- * @date 2022-02-22
+ * @date 2023-08-31
  * @precisions normal z -> c d s
  *
  */
@@ -62,6 +62,7 @@ TCORE_zaxpy( int                   M,
     coreblas_kernel_trace( A, B );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 
 int
@@ -74,6 +75,7 @@ TCORE_zgeadd( __attribute__((unused)) cham_trans_t          trans,
               __attribute__((unused)) CHAM_tile_t *         B )
 {
     coreblas_kernel_trace( A, B );
+    return 0;
 }
 
 int
@@ -88,6 +90,7 @@ TCORE_zgelqt( __attribute__((unused)) int                    M,
     coreblas_kernel_trace( A, T );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 
 void
@@ -129,6 +132,7 @@ TCORE_zgeqrt( int                    M,
     coreblas_kernel_trace( A, T );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 
 int
@@ -137,6 +141,7 @@ TCORE_zgessm( int M, int N, int K, int IB, const int *IPIV, const CHAM_tile_t *L
     coreblas_kernel_trace( L, A );
     assert( L->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 
 int
@@ -145,6 +150,7 @@ TCORE_zgessq( cham_store_t storev, int M, int N, const CHAM_tile_t *A, CHAM_tile
     coreblas_kernel_trace( A, sclssq );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( sclssq->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 
 int
@@ -242,6 +248,7 @@ TCORE_zherfb( cham_uplo_t            uplo,
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 
 #if defined( PRECISION_z ) || defined( PRECISION_c )
@@ -255,6 +262,7 @@ TCORE_zhessq( cham_store_t       storev,
     coreblas_kernel_trace( A, sclssq );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( sclssq->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 #endif
 
@@ -348,6 +356,24 @@ TCORE_zlaset2( cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, CH
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
 }
 
+int
+TCORE_zlaswp_get( int m0, int m, int n, int k, CHAM_tile_t *A, CHAM_tile_t *B, const int *perm )
+{
+    coreblas_kernel_trace( A, B );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
+}
+
+int
+TCORE_zlaswp_set( int m0, int m, int n, int k, CHAM_tile_t *A, CHAM_tile_t *B, const int *invp )
+{
+    coreblas_kernel_trace( A, B );
+    assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
+}
+
 int
 TCORE_zlatro( cham_uplo_t        uplo,
               cham_trans_t       trans,
@@ -359,6 +385,7 @@ TCORE_zlatro( cham_uplo_t        uplo,
     coreblas_kernel_trace( A, B );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 
 void
@@ -435,6 +462,7 @@ TCORE_zssssm( int                M1,
     assert( A2->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( L1->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( L2->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 
 void
@@ -496,6 +524,7 @@ TCORE_zsyssq( cham_store_t       storev,
     coreblas_kernel_trace( A, sclssq );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( sclssq->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 
 #if defined( PRECISION_z ) || defined( PRECISION_c )
@@ -504,6 +533,7 @@ TCORE_zsytf2_nopiv( cham_uplo_t uplo, int n, CHAM_tile_t *A )
 {
     coreblas_kernel_trace( A );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 #endif
 
@@ -521,6 +551,7 @@ TCORE_ztplqt( int                    M,
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 
 int
@@ -542,6 +573,7 @@ TCORE_ztpmlqt( cham_side_t            side,
     assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 
 int
@@ -563,6 +595,7 @@ TCORE_ztpmqrt( cham_side_t            side,
     assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 
 int
@@ -579,6 +612,7 @@ TCORE_ztpqrt( int                    M,
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 
 int
@@ -592,6 +626,7 @@ TCORE_ztradd( cham_uplo_t           uplo,
               CHAM_tile_t *         B )
 {
     coreblas_kernel_trace( A, B );
+    return 0;
 }
 
 void
@@ -648,6 +683,7 @@ TCORE_ztrssq( cham_uplo_t        uplo,
     coreblas_kernel_trace( A, sclssq );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( sclssq->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 
 void
@@ -677,6 +713,7 @@ TCORE_ztsmlq_hetra1( cham_side_t            side,
     assert( A2->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( V->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 
 int
@@ -700,6 +737,7 @@ TCORE_ztsmqr_hetra1( cham_side_t            side,
     assert( A2->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( V->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 
 int
@@ -719,6 +757,7 @@ TCORE_ztstrf( int                    M,
     assert( U->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( L->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 
 int
@@ -738,6 +777,7 @@ TCORE_zunmlq( cham_side_t            side,
     assert( V->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 
 int
@@ -757,13 +797,15 @@ TCORE_zunmqr( cham_side_t            side,
     assert( V->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( T->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( C->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 
 int
 TCORE_zgesum( cham_store_t storev, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *sum )
 {
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( sum->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 
 int
@@ -787,6 +829,7 @@ TCORE_zcesca( int center,
     assert( Di->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( Dj->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
 
 int
@@ -805,4 +848,5 @@ TCORE_zgram( cham_uplo_t        uplo,
     assert( Dj->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( D->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
+    return 0;
 }
diff --git a/coreblas/include/coreblas.h b/coreblas/include/coreblas.h
index f1c461f29b5e3c9ff55ce65caab1a129bda57c6c..10cc8cc280dc0145367b850b33670beffaf93c53 100644
--- a/coreblas/include/coreblas.h
+++ b/coreblas/include/coreblas.h
@@ -11,14 +11,14 @@
  *
  * @brief Chameleon CPU kernels main header
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Jakub Kurzak
  * @author Hatem Ltaief
  * @author Florent Pruvost
  * @author Guillaume Sylvand
  * @author Mathieu Faverge
  * @author Raphael Boucherie
- * @date 2022-02-22
+ * @date 2023-08-31
  *
  */
 #ifndef _coreblas_h_
@@ -87,6 +87,8 @@ void __coreblas_kernel_trace( const char *func, ... );
 
 #endif
 
+void CORE_ipiv_to_perm( int m0, int m, int k, int *ipiv, int *perm, int *invp );
+
 END_C_DECLS
 
 #endif /* _coreblas_h_ */
diff --git a/coreblas/include/coreblas/coreblas_z.h b/coreblas/include/coreblas/coreblas_z.h
index 74382a8df5509d1ca8f2a9420fd7415a58e9dfc1..e5e68e2989f11f56517714e2692adb4ab95771a9 100644
--- a/coreblas/include/coreblas/coreblas_z.h
+++ b/coreblas/include/coreblas/coreblas_z.h
@@ -11,7 +11,7 @@
  *
  * @brief Chameleon CPU CHAMELEON_Complex64_t kernels header
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @comment This file has been automatically generated
  *          from Plasma 2.5.0 for CHAMELEON 0.9.2
  * @author Jakub Kurzak
@@ -22,7 +22,7 @@
  * @author Cedric Castagnede
  * @author Florent Pruvost
  * @author Matthieu Kuhn
- * @date 2022-02-22
+ * @date 2023-08-31
  * @precisions normal z -> c d s
  *
  */
@@ -178,6 +178,14 @@ void CORE_zlaset2(cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha,
                   CHAMELEON_Complex64_t *tileA, int ldtilea);
 void CORE_zlaswp(int N, CHAMELEON_Complex64_t *A, int LDA,
                  int I1,  int I2, const int *IPIV, int INC);
+int CORE_zlaswp_get( int m0, int m, int n, int k,
+                     const CHAMELEON_Complex64_t *A, int lda,
+                     CHAMELEON_Complex64_t *B, int ldb,
+                     const int *perm );
+int CORE_zlaswp_set( int m0, int m, int n, int k,
+                     const CHAMELEON_Complex64_t *A, int lda,
+                     CHAMELEON_Complex64_t *B, int ldb,
+                     const int *invp );
 int  CORE_zlaswp_ontile( CHAM_desc_t descA, int i1, int i2, const int *ipiv, int inc);
 int  CORE_zlaswpc_ontile(CHAM_desc_t descA, int i1, int i2, const int *ipiv, int inc);
 int  CORE_zlatro(cham_uplo_t uplo, cham_trans_t trans,
diff --git a/coreblas/include/coreblas/coreblas_ztile.h b/coreblas/include/coreblas/coreblas_ztile.h
index 74cd413168d44653f27c408eee7dea77eca603ad..88d80d053b12e792beb61ea38d3a9171b3d8b2eb 100644
--- a/coreblas/include/coreblas/coreblas_ztile.h
+++ b/coreblas/include/coreblas/coreblas_ztile.h
@@ -7,11 +7,11 @@
  *
  * @brief Chameleon CPU kernel CHAM_tile_t interface
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Mathieu Faverge
  * @author Florent Pruvost
  * @author Alycia Lisito
- * @date 2022-02-22
+ * @date 2023-08-31
  * @precisions normal z -> c d s
  *
  */
@@ -54,6 +54,8 @@ void TCORE_zlantr( cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, int
 int  TCORE_zlascal( cham_uplo_t uplo, int m, int n, CHAMELEON_Complex64_t alpha, CHAM_tile_t *A );
 void TCORE_zlaset( cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta, CHAM_tile_t *A );
 void TCORE_zlaset2( cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, CHAM_tile_t *A );
+int TCORE_zlaswp_get( int m0, int m, int n, int k, CHAM_tile_t *A, CHAM_tile_t *B, const int *perm );
+int TCORE_zlaswp_set( int m0, int m, int n, int k, CHAM_tile_t *A, CHAM_tile_t *B, const int *invp );
 int  TCORE_zlatro( cham_uplo_t uplo, cham_trans_t trans, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *B );
 void TCORE_zlauum( cham_uplo_t uplo, int N, CHAM_tile_t *A );
 #if defined(PRECISION_z) || defined(PRECISION_c)
diff --git a/include/chameleon/runtime.h b/include/chameleon/runtime.h
index a8aaaef56a42b2dbaa25664d86022c51c2f4cd09..2655ef34669642404a79a566db7f0020323f9c65 100644
--- a/include/chameleon/runtime.h
+++ b/include/chameleon/runtime.h
@@ -18,7 +18,7 @@
  * @author Samuel Thibault
  * @author Philippe Swartvagher
  * @author Matthieu Kuhn
- * @date 2023-08-22
+ * @date 2023-08-31
  *
  */
 #ifndef _chameleon_runtime_h_
@@ -710,9 +710,11 @@ void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv );
 void RUNTIME_ipiv_init   ( CHAM_ipiv_t *ipiv );
 void RUNTIME_ipiv_gather ( CHAM_ipiv_t *desc, int *ipiv, int node );
 
-void *RUNTIME_ipiv_getaddr   ( CHAM_ipiv_t *ipiv, int m );
-void *RUNTIME_nextpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h );
-void *RUNTIME_prevpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h );
+void *RUNTIME_ipiv_getaddr   ( const CHAM_ipiv_t *ipiv, int m );
+void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h );
+void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h );
+void *RUNTIME_perm_getaddr   ( const CHAM_ipiv_t *ipiv, int m );
+void *RUNTIME_invp_getaddr   ( const CHAM_ipiv_t *ipiv, int m );
 
 static inline void *
 RUNTIME_pivot_getaddr( CHAM_ipiv_t *ipiv, int m, int h ) {
@@ -730,6 +732,8 @@ void RUNTIME_ipiv_flush  ( const CHAM_ipiv_t *ipiv,
                            const RUNTIME_sequence_t *sequence );
 void RUNTIME_ipiv_reducek( const RUNTIME_option_t *options,
                            CHAM_ipiv_t *ws, int k, int h );
+void RUNTIME_perm_flushk( const RUNTIME_sequence_t *sequence,
+                          const CHAM_ipiv_t *ipiv, int m );
 
 /**
  * @}
diff --git a/include/chameleon/struct.h b/include/chameleon/struct.h
index d7dd07f48dc6fad1cff9359bf3410defdd657357..e995dcbe18321d0b9bfef2c375318cee8c336b1b 100644
--- a/include/chameleon/struct.h
+++ b/include/chameleon/struct.h
@@ -19,7 +19,7 @@
  * @author Samuel Thibault
  * @author Matthieu Kuhn
  * @author Lionel Eyraud-Dubois
- * @date 2023-08-22
+ * @date 2023-08-31
  *
  */
 #ifndef _chameleon_struct_h_
@@ -143,13 +143,17 @@ struct chameleon_desc_s {
 typedef struct chameleon_piv_s {
     const CHAM_desc_t *desc;   /**> Reference descriptor to compute data mapping based on diagonal tiles,
                               and get floating reference type                                        */
-    int    *data;        /**> Pointer to the data                                                    */
-    void   *ipiv;        /**> Opaque array of pointers for the runtimes to handle the ipiv array     */
-    void   *nextpiv;     /**> Opaque array of pointers for the runtimes to handle the pivot computation structure */
-    void   *prevpiv;     /**> Opaque array of pointers for the runtimes to handle the pivot computation structure */
+    int    *data;    /**> Pointer to the data                                                    */
+    void   *ipiv;    /**> Opaque array of pointers for the runtimes to handle the ipiv array     */
+    void   *nextpiv; /**> Opaque array of pointers for the runtimes to handle the pivot computation structure */
+    void   *prevpiv; /**> Opaque array of pointers for the runtimes to handle the pivot computation structure */
+    void   *perm;    /**> Opaque array of pointers for the runtimes to handle the temporary permutation array */
+    void   *invp;    /**> Opaque array of pointers for the runtimes to handle the temporary inverse permutation array */
     int64_t mpitag_ipiv;    /**> Initial mpi tag values for the ipiv handles    */
     int64_t mpitag_nextpiv; /**> Initial mpi tag values for the nextpiv handles */
     int64_t mpitag_prevpiv; /**> Initial mpi tag values for the prevpiv handles */
+    int64_t mpitag_perm;    /**> Initial mpi tag values for the perm handles    */
+    int64_t mpitag_invp;    /**> Initial mpi tag values for the invp handles    */
     int     i;              /**> row index to the beginning of the submatrix    */
     int     m;              /**> The number of row in the vector ipiv           */
     int     mb;             /**> The number of row per block                    */
diff --git a/include/chameleon/tasks.h b/include/chameleon/tasks.h
index bc7a59e6f0b36ae28602218d6579c1f47ddbf142..e4131d38409d99f13ffb05c50d4345cb4976078e 100644
--- a/include/chameleon/tasks.h
+++ b/include/chameleon/tasks.h
@@ -15,7 +15,8 @@
  * @author Mathieu Faverge
  * @author Cedric Augonnet
  * @author Florent Pruvost
- * @date 2023-07-06
+ * @author Matthieu Kuhn
+ * @date 2023-08-31
  *
  */
 #ifndef _chameleon_tasks_h_
@@ -121,6 +122,10 @@ void INSERT_TASK_hgemm( const RUNTIME_option_t *options,
                                                   const CHAM_desc_t *B, int Bm, int Bn,
                         CHAMELEON_Real16_t beta,  const CHAM_desc_t *C, int Cm, int Cn );
 
+void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options,
+                               int m0, int m, int k,
+                               const CHAM_ipiv_t *ipivdesc, int ipivk );
+
 #include "chameleon/tasks_z.h"
 #include "chameleon/tasks_d.h"
 #include "chameleon/tasks_c.h"
diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h
index c5704884e1e11331008519bff1a2b955fd6e4321..c5fdfdf4eb7bbbb1374dac51452782b37858c84b 100644
--- a/include/chameleon/tasks_z.h
+++ b/include/chameleon/tasks_z.h
@@ -24,7 +24,7 @@
  * @author Alycia Lisito
  * @author Romain Peressoni
  * @author Matthieu Kuhn
- * @date 2023-08-22
+ * @date 2023-08-31
  * @precisions normal z -> c d s
  *
  */
@@ -186,6 +186,16 @@ void INSERT_TASK_zlaset( const RUNTIME_option_t *options,
 void INSERT_TASK_zlaset2( const RUNTIME_option_t *options,
                           cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha,
                           const CHAM_desc_t *tileA, int tileAm, int tileAn );
+void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
+                             int m0, int k,
+                             const CHAM_ipiv_t *tIPIV, int tIPIVk,
+                             const CHAM_desc_t *tileA, int tileAm, int tileAn,
+                             const CHAM_desc_t *tileB, int tileBm, int tileBn );
+void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
+                             int m0, int k,
+                             const CHAM_ipiv_t *tIPIV, int tIPIVk,
+                             const CHAM_desc_t *tileA, int tileAm, int tileAn,
+                             const CHAM_desc_t *tileB, int tileBm, int tileBn );
 void INSERT_TASK_zlatro( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, cham_trans_t trans, int m, int n, int mb,
                          const CHAM_desc_t *A, int Am, int An,
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index e63a4dd5e7203333b4890a2aa09c27f71fda66c4..f011e6d9693672653d8301a4743b314b65ed8d2b 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -24,7 +24,7 @@
 #  @author Florent Pruvost
 #  @author Philippe Virouleau
 #  @author Matthieu Kuhn
-#  @date 2023-08-22
+#  @date 2023-08-31
 #
 ###
 
@@ -76,8 +76,9 @@ set(CODELETS_ZSRC
     codelets/codelet_zlanhe.c
     codelets/codelet_zlansy.c
     codelets/codelet_zlantr.c
-    codelets/codelet_zlaset2.c
     codelets/codelet_zlaset.c
+    codelets/codelet_zlaset2.c
+    codelets/codelet_zlaswp.c
     codelets/codelet_zlatro.c
     codelets/codelet_zlauum.c
     codelets/codelet_zplghe.c
@@ -124,6 +125,7 @@ set(CODELETS_ZSRC
 
 set(CODELETS_SRC
   codelets/codelet_map.c
+  codelets/codelet_ipiv_to_perm.c
 )
 
 # Check for the subdirectories
diff --git a/runtime/openmp/codelets/codelet_ipiv_to_perm.c b/runtime/openmp/codelets/codelet_ipiv_to_perm.c
new file mode 100644
index 0000000000000000000000000000000000000000..c2fb60bccb08f7b1eaf980b2a0810cbf628254e3
--- /dev/null
+++ b/runtime/openmp/codelets/codelet_ipiv_to_perm.c
@@ -0,0 +1,37 @@
+/**
+ *
+ * @file openmp/codelet_ipiv_to_perm.c
+ *
+ * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon OpenMP codelets to convert pivot to permutations
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @author Matthieu Kuhn
+ * @date 2023-08-31
+ *
+ */
+#include "chameleon_openmp.h"
+#include "chameleon/tasks.h"
+#include "coreblas.h"
+
+void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options,
+                               int m0, int m, int k,
+                               const CHAM_ipiv_t *ipivdesc, int ipivk )
+{
+    int *ipiv = NULL; // TODO: placeholder, the pivot array must be fetched from ipivdesc
+    int *perm = NULL; // TODO: placeholder, the permutation array must be fetched from ipivdesc
+    int *invp = NULL; // TODO: placeholder, the inverse permutation must be fetched from ipivdesc
+    /* NOTE(review): the three pointers are still NULL when the task below is created */
+#pragma omp task firstprivate( m0, m, k ) depend( in:ipiv[0] ) depend( inout:perm[0] ) depend( inout:invp[0] )
+    {
+        CORE_ipiv_to_perm( m0, m, k, ipiv, perm, invp ); /* task body: m0, m and k are forwarded unchanged */
+    }
+
+    (void)options;
+    (void)ipivk;
+}
diff --git a/runtime/openmp/codelets/codelet_zlaswp.c b/runtime/openmp/codelets/codelet_zlaswp.c
new file mode 100644
index 0000000000000000000000000000000000000000..452b73926bc301f9f4f7a1d4ac10ace129a46490
--- /dev/null
+++ b/runtime/openmp/codelets/codelet_zlaswp.c
@@ -0,0 +1,84 @@
+/**
+ *
+ * @file openmp/codelet_zlaswp.c
+ *
+ * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon OpenMP codelets to apply zlaswp on a panel
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @date 2023-08-31
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_openmp.h"
+#include "chameleon/tasks_z.h"
+#include "coreblas/coreblas_ztile.h"
+
+/**
+ * @brief Insert the OpenMP task calling TCORE_zlaswp_get() on tiles A and U.
+ *
+ * Tile A(Am, An) is an input dependency of the task and tile U(Um, Un) an
+ * inout dependency. The kernel is given the dimensions of tile A rather than
+ * those of the whole descriptor A.
+ *
+ * @warning perm is still a NULL placeholder: it is not fetched from
+ *          ipiv/ipivk yet.
+ */
+void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
+                             int m0, int k,
+                             const CHAM_ipiv_t *ipiv, int ipivk,
+                             const CHAM_desc_t *A, int Am, int An,
+                             const CHAM_desc_t *U, int Um, int Un )
+{
+    CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
+    CHAM_tile_t *tileU = U->get_blktile( U, Um, Un );
+    int         *perm  = NULL; /* TODO: fetch perm from ipiv/ipivk */
+
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileU->format & CHAMELEON_TILE_FULLRANK );
+
+#pragma omp task firstprivate( m0, k, ipiv, tileA, tileU ) depend( in:perm ) depend( in:tileA[0] ) depend( inout:tileU[0] )
+    {
+        TCORE_zlaswp_get( m0, tileA->m, tileA->n, k, tileA, tileU, perm );
+    }
+
+    (void)options;
+    (void)ipivk;
+}
+
+/**
+ * @brief Insert the OpenMP task calling TCORE_zlaswp_set() on tiles A and B.
+ *
+ * Tile A(Am, An) is an input dependency of the task and tile B(Bm, Bn) an
+ * inout dependency. The kernel is given the dimensions of the updated tile B,
+ * as the PaRSEC codelet does, rather than those of the whole descriptor A.
+ *
+ * @warning invp is still a NULL placeholder: it is not fetched from
+ *          ipiv/ipivk yet.
+ */
+void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
+                             int m0, int k,
+                             const CHAM_ipiv_t *ipiv, int ipivk,
+                             const CHAM_desc_t *A, int Am, int An,
+                             const CHAM_desc_t *B, int Bm, int Bn )
+{
+    CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
+    CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn );
+    int         *invp  = NULL; /* TODO: fetch invp from ipiv/ipivk */
+
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileB->format & CHAMELEON_TILE_FULLRANK );
+
+#pragma omp task firstprivate( m0, k, ipiv, tileA, tileB ) depend( in:invp ) depend( in:tileA[0] ) depend( inout:tileB[0] )
+    {
+        TCORE_zlaswp_set( m0, tileB->m, tileB->n, k, tileA, tileB, invp );
+    }
+
+    (void)options;
+    (void)ipivk;
+}
diff --git a/runtime/openmp/control/runtime_descriptor_ipiv.c b/runtime/openmp/control/runtime_descriptor_ipiv.c
index 03886ca650340279207c8163bc30eac81f4a1054..f10c4156d83f3e50d4b523f3942b0757475b913f 100644
--- a/runtime/openmp/control/runtime_descriptor_ipiv.c
+++ b/runtime/openmp/control/runtime_descriptor_ipiv.c
@@ -12,7 +12,7 @@
  * @version 1.3.0
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
- * @date 2023-08-22
+ * @date 2023-08-31
  *
  */
 #include "chameleon_openmp.h"
@@ -29,7 +29,7 @@ void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv )
     (void)ipiv;
 }
 
-void *RUNTIME_ipiv_getaddr( CHAM_ipiv_t *ipiv, int m )
+void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m )
 {
     assert( 0 );
     (void)ipiv;
@@ -37,7 +37,7 @@ void *RUNTIME_ipiv_getaddr( CHAM_ipiv_t *ipiv, int m )
     return NULL;
 }
 
-void *RUNTIME_nextpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h )
+void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h )
 {
     assert( 0 );
     (void)ipiv;
@@ -46,7 +46,7 @@ void *RUNTIME_nextpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h )
     return NULL;
 }
 
-void *RUNTIME_prevpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h )
+void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h )
 {
     assert( 0 );
     (void)ipiv;
@@ -55,6 +55,22 @@ void *RUNTIME_prevpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h )
     return NULL;
 }
 
+void *RUNTIME_perm_getaddr( const CHAM_ipiv_t *ipiv, int k )
+{
+    assert( 0 ); /* Placeholder for the OpenMP runtime: always fails the assertion */
+    (void)ipiv;
+    (void)k;
+    return NULL; /* Only reached when assertions are compiled out */
+}
+
+void *RUNTIME_invp_getaddr( const CHAM_ipiv_t *ipiv, int k )
+{
+    assert( 0 ); /* Placeholder for the OpenMP runtime: always fails the assertion */
+    (void)ipiv;
+    (void)k;
+    return NULL; /* Only reached when assertions are compiled out */
+}
+
 void RUNTIME_ipiv_flushk( const RUNTIME_sequence_t *sequence,
                           const CHAM_ipiv_t *ipiv, int m )
 {
@@ -72,6 +88,15 @@ void RUNTIME_ipiv_flush( const CHAM_ipiv_t        *ipiv,
     (void)sequence;
 }
 
+void RUNTIME_perm_flushk( const RUNTIME_sequence_t *sequence,
+                          const CHAM_ipiv_t *ipiv, int m )
+{
+    assert( 0 ); /* Placeholder for the OpenMP runtime: always fails the assertion, nothing is flushed */
+    (void)sequence;
+    (void)ipiv;
+    (void)m;
+}
+
 void RUNTIME_ipiv_reducek( const RUNTIME_option_t *options,
                            CHAM_ipiv_t *ipiv, int k, int h )
 {
diff --git a/runtime/parsec/codelets/codelet_ipiv_to_perm.c b/runtime/parsec/codelets/codelet_ipiv_to_perm.c
new file mode 100644
index 0000000000000000000000000000000000000000..9a972d879ede836a34dd66017274cc50e71792ee
--- /dev/null
+++ b/runtime/parsec/codelets/codelet_ipiv_to_perm.c
@@ -0,0 +1,78 @@
+/**
+ *
+ * @file parsec/codelet_ipiv_to_perm.c
+ *
+ * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon Parsec codelets to convert pivot to permutations
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @author Matthieu Kuhn
+ * @date 2023-08-31
+ *
+ */
+#include "chameleon_parsec.h"
+#include "chameleon/tasks.h"
+#include "coreblas.h"
+
+/**
+ * @brief PaRSEC task body of the ipiv to permutation conversion.
+ *
+ * Unpacks the arguments in the order used by INSERT_TASK_ipiv_to_perm() and
+ * forwards them unchanged to CORE_ipiv_to_perm().
+ *
+ * @param[in] context   PaRSEC execution stream (unused).
+ * @param[in] this_task Task holding the packed arguments.
+ *
+ * @return PARSEC_HOOK_RETURN_DONE after the kernel call.
+ */
+static inline int
+CORE_ipiv_to_perm_parsec( parsec_execution_stream_t *context,
+                          parsec_task_t             *this_task )
+{
+    int m0, m, k;
+    int *ipiv, *perm, *invp;
+
+    parsec_dtd_unpack_args(
+        this_task, &m0, &m, &k, &ipiv, &perm, &invp );
+
+    CORE_ipiv_to_perm( m0, m, k, ipiv, perm, invp );
+
+    /* The hook is declared to return int: report completion explicitly */
+    (void)context;
+    return PARSEC_HOOK_RETURN_DONE;
+}
+
+/**
+ * @brief Insert the task converting a pivot array into permutation arrays.
+ *
+ * m0, m and k are packed by value; the ipiv handle is an INPUT, the perm and
+ * invp handles are OUTPUTs of the task.
+ *
+ * @param[in] options  Runtime options (sequence task pool and priority).
+ * @param[in] m0       Forwarded to CORE_ipiv_to_perm().
+ * @param[in] m        Forwarded to CORE_ipiv_to_perm().
+ * @param[in] k        Forwarded to CORE_ipiv_to_perm().
+ * @param[in] ipivdesc Pivot descriptor providing the ipiv, perm and invp handles.
+ * @param[in] ipivk    Index selecting the handles within ipivdesc.
+ */
+void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options,
+                               int m0, int m, int k,
+                               const CHAM_ipiv_t *ipivdesc, int ipivk )
+{
+    parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
+
+    parsec_dtd_taskpool_insert_task(
+        PARSEC_dtd_taskpool, CORE_ipiv_to_perm_parsec, options->priority, "ipiv_to_perm",
+        sizeof(int),         &m0,           VALUE,
+        sizeof(int),         &m,            VALUE,
+        sizeof(int),         &k,            VALUE,
+        PASSED_BY_REF, RUNTIME_ipiv_getaddr( ipivdesc, ipivk ), chameleon_parsec_get_arena_index_ipiv( ipivdesc ) | INPUT,
+        PASSED_BY_REF, RUNTIME_perm_getaddr( ipivdesc, ipivk ), chameleon_parsec_get_arena_index_perm( ipivdesc ) | OUTPUT,
+        PASSED_BY_REF, RUNTIME_invp_getaddr( ipivdesc, ipivk ), chameleon_parsec_get_arena_index_invp( ipivdesc ) | OUTPUT,
+        PARSEC_DTD_ARG_END );
+}
diff --git a/runtime/parsec/codelets/codelet_zlaswp.c b/runtime/parsec/codelets/codelet_zlaswp.c
new file mode 100644
index 0000000000000000000000000000000000000000..284c450aaee61dd71603034b0531f0503de2de5a
--- /dev/null
+++ b/runtime/parsec/codelets/codelet_zlaswp.c
@@ -0,0 +1,130 @@
+/**
+ *
+ * @file parsec/codelet_zlaswp.c
+ *
+ * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon PaRSEC codelets to apply zlaswp on a panel
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @date 2023-08-31
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_parsec.h"
+#include "chameleon/tasks_z.h"
+#include "coreblas/coreblas_z.h"
+
+/**
+ * @brief PaRSEC task body wrapping CORE_zlaswp_get().
+ *
+ * Arguments are unpacked in the order they are packed by
+ * INSERT_TASK_zlaswp_get(). Every destination is passed by address,
+ * including the leading dimensions lda and ldb.
+ *
+ * @return PARSEC_HOOK_RETURN_DONE after the kernel call.
+ */
+static inline int
+CORE_zlaswp_get_parsec( parsec_execution_stream_t *context,
+                        parsec_task_t             *this_task )
+{
+    int          m0, m, n, k, lda, ldb, *perm;
+    CHAMELEON_Complex64_t *A, *B;
+
+    parsec_dtd_unpack_args( this_task, &m0, &m, &n, &k, &A, &lda, &B, &ldb, &perm );
+
+    CORE_zlaswp_get( m0, m, n, k, A, lda, B, ldb, perm );
+
+    (void)context;
+    return PARSEC_HOOK_RETURN_DONE;
+}
+
+/**
+ * @brief Insert the PaRSEC task calling CORE_zlaswp_get() on tiles A and U.
+ *
+ * Tile A(Am, An) is an INPUT, tile U(Um, Un) is INOUT, and the perm handle of
+ * ipiv selected by ipivk is an INPUT. The m and n values given to the kernel
+ * are the dimensions of tile A.
+ */
+void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
+                             int m0, int k,
+                             const CHAM_ipiv_t *ipiv, int ipivk,
+                             const CHAM_desc_t *A, int Am, int An,
+                             const CHAM_desc_t *U, int Um, int Un )
+{
+    parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
+    CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
+    CHAM_tile_t *tileU = U->get_blktile( U, Um, Un );
+
+    parsec_dtd_taskpool_insert_task(
+        PARSEC_dtd_taskpool, CORE_zlaswp_get_parsec, options->priority, "laswp_get",
+        sizeof(int),          &m0,         VALUE,
+        sizeof(int),          &(tileA->m), VALUE,
+        sizeof(int),          &(tileA->n), VALUE,
+        sizeof(int),          &k,          VALUE,
+        PASSED_BY_REF, RTBLKADDR(A, ChamComplexDouble, Am, An), chameleon_parsec_get_arena_index( A ) | INPUT,
+        sizeof(int),         &(tileA->ld), VALUE,
+        PASSED_BY_REF, RTBLKADDR(U, ChamComplexDouble, Um, Un), chameleon_parsec_get_arena_index( U ) | INOUT,
+        sizeof(int),         &(tileU->ld), VALUE,
+        PASSED_BY_REF, RUNTIME_perm_getaddr( ipiv, ipivk ),     chameleon_parsec_get_arena_index_perm( ipiv ) | INPUT,
+        PARSEC_DTD_ARG_END );
+}
+
+/**
+ * @brief PaRSEC task body wrapping CORE_zlaswp_set().
+ *
+ * Arguments are unpacked in the order they are packed by
+ * INSERT_TASK_zlaswp_set(). Every destination is passed by address,
+ * including the leading dimensions lda and ldb.
+ *
+ * @return PARSEC_HOOK_RETURN_DONE after the kernel call.
+ */
+static inline int
+CORE_zlaswp_set_parsec( parsec_execution_stream_t *context,
+                        parsec_task_t             *this_task )
+{
+    int          m0, m, n, k, lda, ldb, *invp;
+    CHAMELEON_Complex64_t *A, *B;
+
+    parsec_dtd_unpack_args( this_task, &m0, &m, &n, &k, &A, &lda, &B, &ldb, &invp );
+
+    CORE_zlaswp_set( m0, m, n, k, A, lda, B, ldb, invp );
+
+    (void)context;
+    return PARSEC_HOOK_RETURN_DONE;
+}
+
+/**
+ * @brief Insert the PaRSEC task calling CORE_zlaswp_set() on tiles A and B.
+ *
+ * Tile A(Am, An) is an INPUT, tile B(Bm, Bn) is INOUT, and the invp handle of
+ * ipiv selected by ipivk is an INPUT. The m and n values given to the kernel
+ * are the dimensions of tile B.
+ */
+void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
+                             int m0, int k,
+                             const CHAM_ipiv_t *ipiv, int ipivk,
+                             const CHAM_desc_t *A, int Am, int An,
+                             const CHAM_desc_t *B, int Bm, int Bn )
+{
+    parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
+    CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
+    CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn );
+
+    parsec_dtd_taskpool_insert_task(
+        PARSEC_dtd_taskpool, CORE_zlaswp_set_parsec, options->priority, "laswp_set",
+        sizeof(int),          &m0,         VALUE,
+        sizeof(int),          &(tileB->m), VALUE,
+        sizeof(int),          &(tileB->n), VALUE,
+        sizeof(int),          &k,          VALUE,
+        PASSED_BY_REF, RTBLKADDR(A, ChamComplexDouble, Am, An), chameleon_parsec_get_arena_index( A ) | INPUT,
+        sizeof(int),         &(tileA->ld), VALUE,
+        PASSED_BY_REF, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), chameleon_parsec_get_arena_index( B ) | INOUT,
+        sizeof(int),         &(tileB->ld), VALUE,
+        PASSED_BY_REF, RUNTIME_invp_getaddr( ipiv, ipivk ),     chameleon_parsec_get_arena_index_invp( ipiv ) | INPUT,
+        PARSEC_DTD_ARG_END );
+}
diff --git a/runtime/parsec/control/runtime_descriptor_ipiv.c b/runtime/parsec/control/runtime_descriptor_ipiv.c
index 04a0b791139d5c6a247b25630e126d4a3eb467bf..fefb42abf9aaa65f98e2959bf09ca24779c95a7d 100644
--- a/runtime/parsec/control/runtime_descriptor_ipiv.c
+++ b/runtime/parsec/control/runtime_descriptor_ipiv.c
@@ -12,7 +12,7 @@
  * @version 1.3.0
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
- * @date 2023-08-22
+ * @date 2023-08-31
  *
  */
 #include "chameleon_parsec.h"
@@ -29,7 +29,7 @@ void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv )
     (void)ipiv;
 }
 
-void *RUNTIME_ipiv_getaddr( CHAM_ipiv_t *ipiv, int m )
+void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m )
 {
     assert( 0 );
     (void)ipiv;
@@ -37,7 +37,7 @@ void *RUNTIME_ipiv_getaddr( CHAM_ipiv_t *ipiv, int m )
     return NULL;
 }
 
-void *RUNTIME_nextpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h )
+void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h )
 {
     assert( 0 );
     (void)ipiv;
@@ -46,7 +46,7 @@ void *RUNTIME_nextpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h )
     return NULL;
 }
 
-void *RUNTIME_prevpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h )
+void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h )
 {
     assert( 0 );
     (void)ipiv;
@@ -55,6 +55,22 @@ void *RUNTIME_prevpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h )
     return NULL;
 }
 
+void *RUNTIME_perm_getaddr( const CHAM_ipiv_t *ipiv, int k )
+{
+    assert( 0 ); /* Placeholder for the PaRSEC runtime: always fails the assertion */
+    (void)ipiv;
+    (void)k;
+    return NULL; /* Only reached when assertions are compiled out */
+}
+
+void *RUNTIME_invp_getaddr( const CHAM_ipiv_t *ipiv, int k )
+{
+    assert( 0 ); /* Placeholder for the PaRSEC runtime: always fails the assertion */
+    (void)ipiv;
+    (void)k;
+    return NULL; /* Only reached when assertions are compiled out */
+}
+
 void RUNTIME_ipiv_flushk( const RUNTIME_sequence_t *sequence,
                           const CHAM_ipiv_t *ipiv, int m )
 {
@@ -72,6 +88,15 @@ void RUNTIME_ipiv_flush( const CHAM_ipiv_t        *ipiv,
     (void)sequence;
 }
 
+void RUNTIME_perm_flushk( const RUNTIME_sequence_t *sequence,
+                          const CHAM_ipiv_t *ipiv, int m )
+{
+    assert( 0 ); /* Placeholder for the PaRSEC runtime: always fails the assertion, nothing is flushed */
+    (void)sequence;
+    (void)ipiv;
+    (void)m;
+}
+
 void RUNTIME_ipiv_reducek( const RUNTIME_option_t *options,
                            CHAM_ipiv_t *ipiv, int k, int h )
 {
diff --git a/runtime/parsec/include/chameleon_parsec.h b/runtime/parsec/include/chameleon_parsec.h
index 30518fb809779ec0ba9c5ce45701a1187fb07621..23d19fd3f904ab28638d610e11abd564205fc924 100644
--- a/runtime/parsec/include/chameleon_parsec.h
+++ b/runtime/parsec/include/chameleon_parsec.h
@@ -11,12 +11,12 @@
  *
  * @brief Chameleon PaRSEC runtime header
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Mathieu Faverge
  * @author Reazul Hoque
  * @author Florent Pruvost
  * @author Samuel Thibault
- * @date 2022-02-22
+ * @date 2023-08-31
  *
  */
 #ifndef _chameleon_parsec_h_
@@ -38,10 +38,28 @@ struct chameleon_parsec_desc_s {
 typedef struct chameleon_parsec_desc_s chameleon_parsec_desc_t;
 
 static inline int
-chameleon_parsec_get_arena_index(const CHAM_desc_t *desc) {
+chameleon_parsec_get_arena_index( const CHAM_desc_t *desc ) {
     return ((chameleon_parsec_desc_t *)desc->schedopt)->arena_index;
 }
 
+static inline int
+chameleon_parsec_get_arena_index_ipiv( const CHAM_ipiv_t *ipiv ) {
+    assert(0);  /* Placeholder: always fails the assertion, ipiv is not inspected */
+    return -1;  /* NOTE(review): only reached without assertions; -1 OR-ed with an access flag sets every bit */
+}
+
+static inline int
+chameleon_parsec_get_arena_index_perm( const CHAM_ipiv_t *ipiv ) {
+    assert(0);  /* Placeholder: always fails the assertion, ipiv is not inspected */
+    return -1;  /* NOTE(review): only reached without assertions; -1 OR-ed with an access flag sets every bit */
+}
+
+static inline int
+chameleon_parsec_get_arena_index_invp( const CHAM_ipiv_t *ipiv ) {
+    assert(0);  /* Placeholder: always fails the assertion, ipiv is not inspected */
+    return -1;  /* NOTE(review): only reached without assertions; -1 OR-ed with an access flag sets every bit */
+}
+
 static inline int cham_to_parsec_access( cham_access_t accessA ) {
     if ( accessA == ChamR ) {
         return INPUT;
diff --git a/runtime/quark/codelets/codelet_ipiv_to_perm.c b/runtime/quark/codelets/codelet_ipiv_to_perm.c
new file mode 100644
index 0000000000000000000000000000000000000000..8ccc7ddff26438bfc67e8160c5bb4de8419237f4
--- /dev/null
+++ b/runtime/quark/codelets/codelet_ipiv_to_perm.c
@@ -0,0 +1,48 @@
+/**
+ *
+ * @file quark/codelet_ipiv_to_perm.c
+ *
+ * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon Quark codelets to convert pivot to permutations
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @author Matthieu Kuhn
+ * @date 2023-08-31
+ *
+ */
+#include "chameleon_quark.h"
+#include "chameleon/tasks.h"
+#include "coreblas.h"
+
+/* Quark task body: unpack the six arguments packed by INSERT_TASK_ipiv_to_perm
+ * and forward them to CORE_ipiv_to_perm. */
+static inline void CORE_ipiv_to_perm_quark( Quark *quark )
+{
+    int *ipiv, *perm, *invp;
+    int  m0, m, k;
+
+    quark_unpack_args_6( quark, m0, m, k, ipiv, perm, invp );
+    CORE_ipiv_to_perm( m0, m, k, ipiv, perm, invp );
+}
+
+void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options,
+                               int m0, int m, int k,
+                               const CHAM_ipiv_t *ipivdesc, int ipivk )
+{
+    quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    /* Arguments are packed in the order read by quark_unpack_args_6 in CORE_ipiv_to_perm_quark */
+    QUARK_Insert_Task(
+        opt->quark, CORE_ipiv_to_perm_quark, (Quark_Task_Flags*)opt,
+        sizeof(int),  &m0,  VALUE,
+        sizeof(int),  &m,   VALUE,
+        sizeof(int),  &k,   VALUE,
+        sizeof(int*), RUNTIME_ipiv_getaddr( ipivdesc, ipivk ), INPUT,  /* pivot handle, declared as input */
+        sizeof(int*), RUNTIME_perm_getaddr( ipivdesc, ipivk ), OUTPUT, /* NOTE(review): Quark perm_getaddr is an assert(0) stub */
+        sizeof(int*), RUNTIME_invp_getaddr( ipivdesc, ipivk ), OUTPUT, /* NOTE(review): Quark invp_getaddr is an assert(0) stub */
+        0 );
+}
diff --git a/runtime/quark/codelets/codelet_zlaswp.c b/runtime/quark/codelets/codelet_zlaswp.c
new file mode 100644
index 0000000000000000000000000000000000000000..117d6761882bcbd7222a807bc98d3a741d4712a0
--- /dev/null
+++ b/runtime/quark/codelets/codelet_zlaswp.c
@@ -0,0 +1,78 @@
+/**
+ *
+ * @file quark/codelet_zlaswp.c
+ *
+ * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon Quark codelets to apply zlaswp on a panel
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @date 2023-08-31
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_quark.h"
+#include "chameleon/tasks_z.h"
+#include "coreblas/coreblas_ztile.h"
+
+/* Quark task body: unpack the five task arguments, then run TCORE_zlaswp_get */
+static void CORE_zlaswp_get_quark( Quark *quark )
+{
+    CHAM_tile_t *A, *B;
+    int          m0, k, *perm;
+
+    quark_unpack_args_5( quark, m0, k, perm, A, B );
+    TCORE_zlaswp_get( m0, A->m, A->n, k, A, B, perm );
+}
+
+void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
+                             int m0, int k,
+                             const CHAM_ipiv_t *ipiv, int ipivk,
+                             const CHAM_desc_t *A, int Am, int An,
+                             const CHAM_desc_t *U, int Um, int Un )
+{
+    quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    DAG_CORE_LASWP;
+    /* Arguments are packed in the order read by quark_unpack_args_5 in CORE_zlaswp_get_quark */
+    QUARK_Insert_Task(
+        opt->quark, CORE_zlaswp_get_quark, (Quark_Task_Flags*)opt,
+        sizeof(int),          &m0, VALUE,
+        sizeof(int),          &k,  VALUE,
+        sizeof(int*),         RUNTIME_perm_getaddr( ipiv, ipivk ),     INPUT, /* NOTE(review): Quark perm_getaddr is an assert(0) stub */
+        sizeof(CHAM_tile_t*), RTBLKADDR(A, ChamComplexDouble, Am, An), INPUT, /* tile A(Am, An), declared as input */
+        sizeof(CHAM_tile_t*), RTBLKADDR(U, ChamComplexDouble, Um, Un), INOUT, /* tile U(Um, Un), declared as inout */
+        0 );
+}
+
+/* Quark task body: unpack the five task arguments, then run TCORE_zlaswp_set */
+static void CORE_zlaswp_set_quark( Quark *quark )
+{
+    CHAM_tile_t *A, *B;
+    int          m0, k, *invp;
+
+    quark_unpack_args_5( quark, m0, k, invp, A, B );
+    TCORE_zlaswp_set( m0, A->m, A->n, k, A, B, invp ); /* NOTE(review): PaRSEC passes the dimensions of tile B here - confirm */
+}
+
+void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
+                             int m0, int k,
+                             const CHAM_ipiv_t *ipiv, int ipivk,
+                             const CHAM_desc_t *A, int Am, int An,
+                             const CHAM_desc_t *B, int Bm, int Bn )
+{
+    quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    DAG_CORE_LASWP;
+    /* Arguments are packed in the order read by quark_unpack_args_5 in CORE_zlaswp_set_quark */
+    QUARK_Insert_Task(
+        opt->quark, CORE_zlaswp_set_quark, (Quark_Task_Flags*)opt,
+        sizeof(int),          &m0, VALUE,
+        sizeof(int),          &k,  VALUE,
+        sizeof(int*),         RUNTIME_invp_getaddr( ipiv, ipivk ),     INPUT, /* NOTE(review): Quark invp_getaddr is an assert(0) stub */
+        sizeof(CHAM_tile_t*), RTBLKADDR(A, ChamComplexDouble, Am, An), INPUT, /* tile A(Am, An), declared as input */
+        sizeof(CHAM_tile_t*), RTBLKADDR(B, ChamComplexDouble, Bm, Bn), INOUT, /* tile B(Bm, Bn), declared as inout */
+        0 );
+}
diff --git a/runtime/quark/control/runtime_descriptor_ipiv.c b/runtime/quark/control/runtime_descriptor_ipiv.c
index 34706a55518f95f0e4b229a772534e3f062d05d2..88e8f886e8578f99e066868e6dfb2880fc4035d0 100644
--- a/runtime/quark/control/runtime_descriptor_ipiv.c
+++ b/runtime/quark/control/runtime_descriptor_ipiv.c
@@ -12,7 +12,7 @@
  * @version 1.3.0
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
- * @date 2023-08-22
+ * @date 2023-08-31
  *
  */
 #include "chameleon_quark.h"
@@ -29,7 +29,7 @@ void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv )
     (void)ipiv;
 }
 
-void *RUNTIME_ipiv_getaddr( CHAM_ipiv_t *ipiv, int m )
+void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m )
 {
     assert( 0 );
     (void)ipiv;
@@ -37,7 +37,7 @@ void *RUNTIME_ipiv_getaddr( CHAM_ipiv_t *ipiv, int m )
     return NULL;
 }
 
-void *RUNTIME_nextpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h )
+void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h )
 {
     assert( 0 );
     (void)ipiv;
@@ -46,7 +46,7 @@ void *RUNTIME_nextpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h )
     return NULL;
 }
 
-void *RUNTIME_prevpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h )
+void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h )
 {
     assert( 0 );
     (void)ipiv;
@@ -55,6 +55,22 @@ void *RUNTIME_prevpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h )
     return NULL;
 }
 
+void *RUNTIME_perm_getaddr( const CHAM_ipiv_t *ipiv, int k )
+{
+    assert( 0 ); /* Placeholder for the Quark runtime: always fails the assertion */
+    (void)ipiv;
+    (void)k;
+    return NULL; /* Only reached when assertions are compiled out */
+}
+
+void *RUNTIME_invp_getaddr( const CHAM_ipiv_t *ipiv, int k )
+{
+    assert( 0 ); /* Placeholder for the Quark runtime: always fails the assertion */
+    (void)ipiv;
+    (void)k;
+    return NULL; /* Only reached when assertions are compiled out */
+}
+
 void RUNTIME_ipiv_flushk( const RUNTIME_sequence_t *sequence,
                           const CHAM_ipiv_t *ipiv, int m )
 {
@@ -72,6 +88,15 @@ void RUNTIME_ipiv_flush( const CHAM_ipiv_t        *ipiv,
     (void)sequence;
 }
 
+void RUNTIME_perm_flushk( const RUNTIME_sequence_t *sequence,
+                          const CHAM_ipiv_t *ipiv, int m )
+{
+    assert( 0 ); /* Placeholder for the Quark runtime: always fails the assertion, nothing is flushed */
+    (void)sequence;
+    (void)ipiv;
+    (void)m;
+}
+
 void RUNTIME_ipiv_reducek( const RUNTIME_option_t *options,
                            CHAM_ipiv_t *ipiv, int k, int h )
 {
diff --git a/runtime/quark/include/chameleon_quark.h b/runtime/quark/include/chameleon_quark.h
index 8e415b7c564c486fa9ffd9f4de9585adc3a9410e..bb454e22972857fae4dbd0840696edfd3a978e59 100644
--- a/runtime/quark/include/chameleon_quark.h
+++ b/runtime/quark/include/chameleon_quark.h
@@ -49,7 +49,7 @@ static inline int cham_to_quark_access( cham_access_t accessA ) {
 /*
  * Access to block pointer and leading dimension
  */
-#define RTBLKADDR( desc, type, m, n ) ( (type*)RUNTIME_data_getaddr( desc, m, n ) )
+#define RTBLKADDR( desc, type, m, n ) ( RUNTIME_data_getaddr( desc, m, n ) )
 
 #define RUNTIME_BEGIN_ACCESS_DECLARATION
 
diff --git a/runtime/starpu/codelets/codelet_ipiv_to_perm.c b/runtime/starpu/codelets/codelet_ipiv_to_perm.c
new file mode 100644
index 0000000000000000000000000000000000000000..31183c11505a0f19fa3505691684c37810c0f10e
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_ipiv_to_perm.c
@@ -0,0 +1,69 @@
+/**
+ *
+ * @file starpu/codelet_ipiv_to_perm.c
+ *
+ * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon StarPU codelets to convert pivot to permutations
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @author Matthieu Kuhn
+ * @date 2023-08-31
+ *
+ */
+#include "chameleon_starpu.h"
+#include "runtime_codelets.h"
+
+#if !defined(CHAMELEON_SIMULATION)
+static void cl_ipiv_to_perm_cpu_func( void *descr[], void *cl_arg )
+{
+    /* Scalars, unpacked in the order INSERT_TASK_ipiv_to_perm() packs them */
+    int arg_m0, arg_m, arg_k;
+
+    starpu_codelet_unpack_args( cl_arg, &arg_m0, &arg_m, &arg_k );
+
+    /* Vector buffers: descr[0] -> ipiv, descr[1] -> perm, descr[2] -> invp */
+    CORE_ipiv_to_perm( arg_m0, arg_m, arg_k,
+                       (int*)STARPU_VECTOR_GET_PTR(descr[0]),
+                       (int*)STARPU_VECTOR_GET_PTR(descr[1]),
+                       (int*)STARPU_VECTOR_GET_PTR(descr[2]) );
+}
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
+/*
+ * Codelet definition: CPU-only, three data buffers, no performance model
+ */
+static struct starpu_codelet cl_ipiv_to_perm = {
+    .name         = "ipiv_to_perm",
+    .where        = STARPU_CPU,
+    .nbuffers     = 3,
+    .model        = NULL,
+#if !defined(CHAMELEON_SIMULATION)
+    .cpu_funcs[0] = cl_ipiv_to_perm_cpu_func,
+#else
+    .cpu_funcs[0] = (starpu_cpu_func_t)1,
+#endif
+};
+
+void INSERT_TASK_ipiv_to_perm( const RUNTIME_option_t *options,
+                               int m0, int m, int k,
+                               const CHAM_ipiv_t *ipivdesc, int ipivk )
+{
+    /* Submit one task: reads ipiv(ipivk), overwrites perm(ipivk) and invp(ipivk) */
+    struct starpu_codelet *cl = &cl_ipiv_to_perm;
+    rt_starpu_insert_task(
+        cl,
+        STARPU_VALUE, &m0, sizeof(int),
+        STARPU_VALUE, &m,  sizeof(int),
+        STARPU_VALUE, &k,  sizeof(int),
+        STARPU_R, RUNTIME_ipiv_getaddr( ipivdesc, ipivk ),
+        STARPU_W, RUNTIME_perm_getaddr( ipivdesc, ipivk ),
+        STARPU_W, RUNTIME_invp_getaddr( ipivdesc, ipivk ),
+        STARPU_PRIORITY,          options->priority,
+        STARPU_EXECUTE_ON_WORKER, options->workerid,
+        0 );
+}
diff --git a/runtime/starpu/codelets/codelet_zlaswp.c b/runtime/starpu/codelets/codelet_zlaswp.c
new file mode 100644
index 0000000000000000000000000000000000000000..2d8fc31d422fa39da13db5f2b2240cd7096d64e3
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_zlaswp.c
@@ -0,0 +1,108 @@
+/**
+ *
+ * @file starpu/codelet_zlaswp.c
+ *
+ * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon StarPU codelets to apply zlaswp on a panel
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @author Matthieu Kuhn
+ * @date 2023-08-31
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_starpu.h"
+#include "runtime_codelet_z.h"
+
+#if !defined(CHAMELEON_SIMULATION)
+static void cl_zlaswp_get_cpu_func( void *descr[], void *cl_arg )
+{
+    /* Buffers: [0] perm vector, [1] tile A (read), [2] tile U (updated) */
+    int         *perm  = (int *)STARPU_VECTOR_GET_PTR( descr[0] );
+    CHAM_tile_t *tileA = (CHAM_tile_t *) cti_interface_get( descr[1] );
+    CHAM_tile_t *tileU = (CHAM_tile_t *) cti_interface_get( descr[2] );
+    int          arg_m0, arg_k;
+
+    starpu_codelet_unpack_args( cl_arg, &arg_m0, &arg_k );
+
+    /* The dimensions forwarded to the kernel are those of the first tile */
+    TCORE_zlaswp_get( arg_m0, tileA->m, tileA->n, arg_k, tileA, tileU, perm );
+}
+#endif
+
+/*
+ * Codelet definition: presumably expands to the cl_zlaswp_get codelet submitted below
+ */
+CODELETS_CPU( zlaswp_get, cl_zlaswp_get_cpu_func )
+
+void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
+                             int m0, int k,
+                             const CHAM_ipiv_t *ipiv, int ipivk,
+                             const CHAM_desc_t *A, int Am, int An,
+                             const CHAM_desc_t *U, int Um, int Un )
+{
+    /*
+     * Task accesses: perm(ipivk) and A(Am, An) are read-only, while U(Um, Un)
+     * is updated in commute mode. No profiling callback is attached.
+     */
+    struct starpu_codelet *cl = &cl_zlaswp_get;
+    rt_starpu_insert_task(
+        cl,
+        STARPU_VALUE, &m0, sizeof(int),
+        STARPU_VALUE, &k,  sizeof(int),
+        STARPU_R,                   RUNTIME_perm_getaddr( ipiv, ipivk ),
+        STARPU_R,                   RTBLKADDR(A, ChamComplexDouble, Am, An),
+        STARPU_RW | STARPU_COMMUTE, RTBLKADDR(U, ChamComplexDouble, Um, Un),
+        STARPU_PRIORITY,            options->priority,
+        STARPU_EXECUTE_ON_WORKER,   options->workerid,
+        0 );
+}
+
+#if !defined(CHAMELEON_SIMULATION)
+static void cl_zlaswp_set_cpu_func( void *descr[], void *cl_arg )
+{
+    /* Buffers: [0] invp vector, [1] tile A (read), [2] tile B (read-write) */
+    int         *invp  = (int *)STARPU_VECTOR_GET_PTR( descr[0] );
+    CHAM_tile_t *tileA = (CHAM_tile_t *) cti_interface_get( descr[1] );
+    CHAM_tile_t *tileB = (CHAM_tile_t *) cti_interface_get( descr[2] );
+    int          arg_m0, arg_k;
+
+    starpu_codelet_unpack_args( cl_arg, &arg_m0, &arg_k );
+
+    /* The dimensions forwarded to the kernel are those of the second tile */
+    TCORE_zlaswp_set( arg_m0, tileB->m, tileB->n, arg_k, tileA, tileB, invp );
+}
+#endif
+
+/*
+ * Codelet definition: presumably expands to the cl_zlaswp_set codelet submitted below
+ */
+CODELETS_CPU( zlaswp_set, cl_zlaswp_set_cpu_func )
+
+void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
+                             int m0, int k,
+                             const CHAM_ipiv_t *ipiv, int ipivk,
+                             const CHAM_desc_t *A, int Am, int An,
+                             const CHAM_desc_t *B, int Bm, int Bn )
+{
+    /*
+     * Task accesses: invp(ipivk) and A(Am, An) are read-only, while B(Bm, Bn)
+     * is accessed in read-write mode. No profiling callback is attached.
+     */
+    struct starpu_codelet *cl = &cl_zlaswp_set;
+    rt_starpu_insert_task(
+        cl,
+        STARPU_VALUE, &m0, sizeof(int),
+        STARPU_VALUE, &k,  sizeof(int),
+        STARPU_R,                 RUNTIME_invp_getaddr( ipiv, ipivk ),
+        STARPU_R,                 RTBLKADDR(A, ChamComplexDouble, Am, An),
+        STARPU_RW,                RTBLKADDR(B, ChamComplexDouble, Bm, Bn),
+        STARPU_PRIORITY,          options->priority,
+        STARPU_EXECUTE_ON_WORKER, options->workerid,
+        0 );
+}
diff --git a/runtime/starpu/control/runtime_descriptor_ipiv.c b/runtime/starpu/control/runtime_descriptor_ipiv.c
index 4131f7d6c79858624ed0b324f6785aebfb195d7e..efd5afb3637fb65cb8b0dd49acdf14a5c5bf83a1 100644
--- a/runtime/starpu/control/runtime_descriptor_ipiv.c
+++ b/runtime/starpu/control/runtime_descriptor_ipiv.c
@@ -12,7 +12,7 @@
  * @version 1.3.0
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
- * @date 2023-08-22
+ * @date 2023-08-31
  *
  */
 #include "chameleon_starpu.h"
@@ -23,10 +23,16 @@
 void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv )
 {
     assert( ipiv );
-
-    ipiv->ipiv    = (void*)calloc( ipiv->mt, sizeof(starpu_data_handle_t) );
-    ipiv->nextpiv = (void*)calloc( ipiv->mt, sizeof(starpu_data_handle_t) );
-    ipiv->prevpiv = (void*)calloc( ipiv->mt, sizeof(starpu_data_handle_t) );
+    starpu_data_handle_t *handles = calloc( 5 * ipiv->mt, sizeof(starpu_data_handle_t) );
+    if ( (handles == NULL) && (ipiv->mt > 0) ) { /* calloc(0, ...) may legitimately return NULL */
+        chameleon_fatal_error("RUNTIME_ipiv_create", "Cannot allocate the data handles of the ipiv structure");
+        return;
+    }
+    ipiv->ipiv    = handles;                /* One allocation split into five arrays of */
+    ipiv->nextpiv = handles +     ipiv->mt; /* mt handles each; ipiv->ipiv is its base  */
+    ipiv->prevpiv = handles + 2 * ipiv->mt; /* address, the only pointer that           */
+    ipiv->perm    = handles + 3 * ipiv->mt; /* RUNTIME_ipiv_destroy() passes to free(). */
+    ipiv->invp    = handles + 4 * ipiv->mt;
 #if defined(CHAMELEON_USE_MPI)
     /*
      * Book the number of tags required to describe pivot structure
@@ -34,13 +40,15 @@ void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv )
      */
     {
         chameleon_starpu_tag_init();
-        ipiv->mpitag_ipiv = chameleon_starpu_tag_book( (int64_t)(ipiv->mt) * 3 );
+        ipiv->mpitag_ipiv = chameleon_starpu_tag_book( (int64_t)(ipiv->mt) * 5 );
         if ( ipiv->mpitag_ipiv == -1 ) {
             chameleon_fatal_error("RUNTIME_ipiv_create", "Can't pursue computation since no more tags are available for ipiv structure");
             return;
         }
         ipiv->mpitag_nextpiv = ipiv->mpitag_ipiv    + ipiv->mt;
         ipiv->mpitag_prevpiv = ipiv->mpitag_nextpiv + ipiv->mt;
+        ipiv->mpitag_perm    = ipiv->mpitag_prevpiv + ipiv->mt;
+        ipiv->mpitag_invp    = ipiv->mpitag_perm    + ipiv->mt;
     }
 #endif
 }
@@ -51,37 +59,26 @@ void RUNTIME_ipiv_create( CHAM_ipiv_t *ipiv )
 void RUNTIME_ipiv_destroy( CHAM_ipiv_t *ipiv )
 {
     int                   i;
-    starpu_data_handle_t *ipiv_handle    = (starpu_data_handle_t*)(ipiv->ipiv);
-    starpu_data_handle_t *nextpiv_handle = (starpu_data_handle_t*)(ipiv->nextpiv);
-    starpu_data_handle_t *prevpiv_handle = (starpu_data_handle_t*)(ipiv->prevpiv);
-
-    for(i=0; i<ipiv->mt; i++) {
-        if ( *ipiv_handle != NULL ) {
-            starpu_data_unregister( *ipiv_handle );
-            *ipiv_handle = NULL;
-        }
-        ipiv_handle++;
-
-        if ( *nextpiv_handle != NULL ) {
-            starpu_data_unregister( *nextpiv_handle );
-            *nextpiv_handle = NULL;
-        }
-        nextpiv_handle++;
+    starpu_data_handle_t *handles = (starpu_data_handle_t*)(ipiv->ipiv);
 
-        if ( *prevpiv_handle != NULL ) {
-            starpu_data_unregister( *prevpiv_handle );
-            *prevpiv_handle = NULL;
+    for ( i = 0; i < (5 * ipiv->mt); i++ ) {
+        if ( handles[i] != NULL ) {
+            starpu_data_unregister( handles[i] );
+            handles[i] = NULL;
         }
-        prevpiv_handle++;
+        /* ipiv, nextpiv, prevpiv, perm and invp all live in this one array */
     }
 
     free( ipiv->ipiv    );
-    free( ipiv->nextpiv );
-    free( ipiv->prevpiv );
+    ipiv->invp    = NULL;
+    ipiv->perm    = NULL;
+    ipiv->prevpiv = NULL;
+    ipiv->nextpiv = NULL;
+    ipiv->ipiv    = NULL;
     chameleon_starpu_tag_release( ipiv->mpitag_ipiv );
 }
 
-void *RUNTIME_ipiv_getaddr( CHAM_ipiv_t *ipiv, int m )
+void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m )
 {
     starpu_data_handle_t *handle = (starpu_data_handle_t*)(ipiv->ipiv);
     int64_t mm = m + (ipiv->i / ipiv->mb);
@@ -110,7 +107,7 @@ void *RUNTIME_ipiv_getaddr( CHAM_ipiv_t *ipiv, int m )
     return *handle;
 }
 
-void *RUNTIME_nextpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h )
+void *RUNTIME_nextpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h )
 {
     starpu_data_handle_t *nextpiv = (starpu_data_handle_t*)(ipiv->nextpiv);
     int64_t mm = m + (ipiv->i / ipiv->mb);
@@ -124,7 +121,7 @@ void *RUNTIME_nextpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h )
 
     const CHAM_desc_t *A = ipiv->desc;
     int     owner = A->get_rankof( A, m, m );
-    int     ncols = (mm == (ipiv->mt-1)) ? ipiv->m - mm * ipiv->mb : ipiv->mb;
+    int     ncols = (mm == (A->nt-1)) ? A->n - mm * A->nb : A->nb;
     int64_t tag   = ipiv->mpitag_nextpiv + mm;
 
     cppi_register( nextpiv, A->dtyp, ncols, tag, owner );
@@ -133,7 +130,7 @@ void *RUNTIME_nextpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h )
     return *nextpiv;
 }
 
-void *RUNTIME_prevpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h )
+void *RUNTIME_prevpiv_getaddr( const CHAM_ipiv_t *ipiv, int m, int h )
 {
     starpu_data_handle_t *prevpiv = (starpu_data_handle_t*)(ipiv->prevpiv);
     int64_t mm = m + (ipiv->i / ipiv->mb);
@@ -147,7 +144,7 @@ void *RUNTIME_prevpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h )
 
     const CHAM_desc_t *A = ipiv->desc;
     int     owner = A->get_rankof( A, m, m );
-    int     ncols = (mm == (ipiv->mt-1)) ? ipiv->m - mm * ipiv->mb : ipiv->mb;
+    int     ncols = (mm == (A->nt-1)) ? A->n - mm * A->nb : A->nb;
     int64_t tag   = ipiv->mpitag_prevpiv + mm;
 
     cppi_register( prevpiv, A->dtyp, ncols, tag, owner );
@@ -156,6 +153,64 @@ void *RUNTIME_prevpiv_getaddr( CHAM_ipiv_t *ipiv, int m, int h )
     return *prevpiv;
 }
 
+void *RUNTIME_perm_getaddr( const CHAM_ipiv_t *ipiv, int m )
+{
+    /* Slot index: m shifted by ipiv->i / ipiv->mb */
+    int64_t               mm     = m + (ipiv->i / ipiv->mb);
+    starpu_data_handle_t *handle = (starpu_data_handle_t*)(ipiv->perm) + mm;
+    assert( handle );
+
+    /* The handle is registered lazily, on the first request for this slot */
+    if ( *handle != NULL ) {
+        return *handle;
+    }
+
+    const CHAM_desc_t *A = ipiv->desc;
+    int owner = A->get_rankof( A, m, m );
+    int ncols = ipiv->mb;
+
+    /* Vector of mb integers registered with home node -1 and a NULL pointer */
+    starpu_vector_data_register( handle, -1, (uintptr_t)NULL, ncols, sizeof(int) );
+#if defined(CHAMELEON_USE_MPI)
+    {
+        int64_t tag = ipiv->mpitag_perm + mm;
+        starpu_mpi_data_register( *handle, tag, owner );
+    }
+#endif /* defined(CHAMELEON_USE_MPI) */
+    (void)owner; /* Only consumed by the MPI registration above */
+    assert( *handle );
+    return *handle;
+}
+
+void *RUNTIME_invp_getaddr( const CHAM_ipiv_t *ipiv, int m )
+{
+    /* Slot index: m shifted by ipiv->i / ipiv->mb */
+    int64_t               mm     = m + (ipiv->i / ipiv->mb);
+    starpu_data_handle_t *handle = (starpu_data_handle_t*)(ipiv->invp) + mm;
+    assert( handle );
+
+    /* The handle is registered lazily, on the first request for this slot */
+    if ( *handle != NULL ) {
+        return *handle;
+    }
+
+    const CHAM_desc_t *A = ipiv->desc;
+    int owner = A->get_rankof( A, m, m );
+    int ncols = ipiv->mb;
+
+    /* Vector of mb integers registered with home node -1 and a NULL pointer */
+    starpu_vector_data_register( handle, -1, (uintptr_t)NULL, ncols, sizeof(int) );
+#if defined(CHAMELEON_USE_MPI)
+    {
+        int64_t tag = ipiv->mpitag_invp + mm;
+        starpu_mpi_data_register( *handle, tag, owner );
+    }
+#endif /* defined(CHAMELEON_USE_MPI) */
+    (void)owner; /* Only consumed by the MPI registration above */
+    assert( *handle );
+    return *handle;
+}
+
 void RUNTIME_ipiv_flushk( const RUNTIME_sequence_t *sequence,
                           const CHAM_ipiv_t *ipiv, int m )
 {
@@ -205,6 +260,44 @@ void RUNTIME_ipiv_flush( const CHAM_ipiv_t        *ipiv,
     }
 }
 
+void RUNTIME_perm_flushk( const RUNTIME_sequence_t *sequence,
+                          const CHAM_ipiv_t *ipiv, int m )
+{
+    /*
+     * The perm and invp handles of a panel get the same treatment, so both
+     * arrays are walked by a single loop (perm first, then invp).
+     */
+    starpu_data_handle_t *arrays[2] = {
+        (starpu_data_handle_t*)(ipiv->perm),
+        (starpu_data_handle_t*)(ipiv->invp),
+    };
+    const CHAM_desc_t *A  = ipiv->desc;
+    int64_t            mm = m + ( ipiv->i / ipiv->mb );
+    int                i;
+
+    for ( i = 0; i < 2; i++ ) {
+        starpu_data_handle_t *handle = arrays[i] + mm;
+
+        /* A NULL slot was never registered: there is nothing to flush */
+        if ( *handle == NULL ) {
+            continue;
+        }
+
+#if defined(CHAMELEON_USE_MPI)
+        /* Flush the MPI cache everywhere; only the owner rank goes further */
+        starpu_mpi_cache_flush( MPI_COMM_WORLD, *handle );
+        if ( starpu_mpi_data_get_rank( *handle ) == A->myrank )
+#endif
+        {
+            chameleon_starpu_data_wont_use( *handle );
+        }
+    }
+
+    /* sequence is never read; A is only read in MPI builds */
+    (void)sequence;
+    (void)A;
+}
+
 void RUNTIME_ipiv_reducek( const RUNTIME_option_t *options,
                            CHAM_ipiv_t *ipiv, int k, int h )
 {
@@ -276,14 +369,38 @@ void RUNTIME_ipiv_gather( CHAM_ipiv_t *desc, int *ipiv, int node )
     int64_t mb   = desc->mb;
     int64_t tag  = chameleon_starpu_tag_book( (int64_t)(desc->mt) );
     int     rank = CHAMELEON_Comm_rank();
+    int     owner = rank;
     int     m;
 
     for (m = 0; m < mt; m++, ipiv += mb) {
         starpu_data_handle_t ipiv_src = RUNTIME_ipiv_getaddr( desc, m );
 
 #if defined(CHAMELEON_USE_MPI)
-        if ( (rank == node) ||
-             (rank == starpu_mpi_data_get_rank(ipiv_src)) )
+        owner = starpu_mpi_data_get_rank( ipiv_src );
+        if ( node != owner ) {
+            starpu_mpi_tag_t tag = starpu_mpi_data_get_tag( ipiv_src );
+
+            if ( rank == node )
+            {
+                /* Need to receive the data */
+                int already_received = starpu_mpi_cached_receive_set( ipiv_src );
+                if (already_received == 0)
+                {
+                    MPI_Status status;
+                    starpu_mpi_recv( ipiv_src, owner, tag, MPI_COMM_WORLD, &status );
+                }
+            }
+            else if ( rank == owner )
+            {
+                /* Need to send the data */
+                int already_sent = starpu_mpi_cached_send_set( ipiv_src, node );
+                if (already_sent == 0)
+                {
+                    starpu_mpi_send( ipiv_src, node, tag, MPI_COMM_WORLD );
+                }
+            }
+        }
+        if ( rank == node )
 #endif
         {
             starpu_data_handle_t ipiv_dst;
diff --git a/runtime/starpu/include/cppi_interface.h b/runtime/starpu/include/cppi_interface.h
index 537bc9cd807c9e27f0cf550d6611e2bc974255d3..7a77784656291b3f2b91ccd265f950a7d8889d8d 100644
--- a/runtime/starpu/include/cppi_interface.h
+++ b/runtime/starpu/include/cppi_interface.h
@@ -82,8 +82,11 @@ cppi_display_dbg( cppi_interface_t *cppi_interface, FILE *f, const char *title )
 }
 #else
 static inline void
-cppi_display_dbg( cppi_interface_t *, FILE *, const char * )
+cppi_display_dbg( cppi_interface_t *cppi_interface, FILE *f, const char *title )
 {
+    /* No-op variant: every argument is deliberately ignored */
+    (void)title; (void)f;
+    (void)cppi_interface;
     return;
 }
 #endif
diff --git a/runtime/starpu/include/runtime_codelets.h b/runtime/starpu/include/runtime_codelets.h
index c27d6b913bb231c4815dca09e67b7201e12697c7..72c7edc8cb46cc035e693bd6c653715d91990495 100644
--- a/runtime/starpu/include/runtime_codelets.h
+++ b/runtime/starpu/include/runtime_codelets.h
@@ -27,6 +27,8 @@
 #include "runtime_codelet_profile.h"
 
 #if !defined(CHAMELEON_SIMULATION)
+#include "coreblas.h"
+
 #if defined(CHAMELEON_USE_CUDA)
 #include "gpucublas.h"
 #endif
diff --git a/runtime/starpu/include/runtime_mpi.h b/runtime/starpu/include/runtime_mpi.h
index 6d307bc6ae597ec075caf05c7dcbd382a16c4043..cd9841e10d8994c3652bd1a968df3be6841a7937 100644
--- a/runtime/starpu/include/runtime_mpi.h
+++ b/runtime/starpu/include/runtime_mpi.h
@@ -23,7 +23,7 @@
 #if defined(CHAMELEON_USE_MPI)
 
 #if !defined(HAVE_STARPU_MPI_DATA_REGISTER)
-static inline starpu_mpi_data_register( starpu_data_handle_t handle, int64_t tag, int owner )
+static inline void starpu_mpi_data_register( starpu_data_handle_t handle, int64_t tag, int owner )
 {
     starpu_data_set_rank( handle, owner );
     starpu_data_set_tag( handle, tag );
@@ -32,8 +32,11 @@ static inline starpu_mpi_data_register( starpu_data_handle_t handle, int64_t tag
 
 #else
 
-static inline starpu_mpi_data_register( starpu_data_handle_t, int64_t, int )
+static inline void starpu_mpi_data_register( starpu_data_handle_t handle, int64_t tag, int owner )
 {
+    /* Empty variant of the #else branch: every argument is ignored */
+    (void)owner; (void)tag;
+    (void)handle;
 }
 
 #endif