diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c
index 706f9fbcb92e9b65c410d67cf0372ad9fa0e3755..d95b415d9bbe5473f38be0d3b304c7c3c898adf1 100644
--- a/compute/pzgetrf.c
+++ b/compute/pzgetrf.c
@@ -447,6 +447,58 @@ chameleon_pzgetrf_panel_permute( struct chameleon_pzgetrf_s *ws,
     }
 }
 
+static inline void
+chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws,
+                                         CHAM_desc_t                *A,
+                                         CHAM_ipiv_t                *ipiv,
+                                         int                         k,
+                                         int                         n,
+                                         RUNTIME_option_t           *options )
+{
+    switch( ws->alg ) {
+    case ChamGetrfPPiv:
+        chameleon_attr_fallthrough;
+    case ChamGetrfPPivPerColumn:
+    {
+        int m;
+        int tempkm, tempkn, tempnn, minmn;
+        void **clargs = malloc( sizeof(char *) );
+        *clargs = NULL;
+
+        tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
+        tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
+        tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
+        minmn  = chameleon_min( tempkm, tempkn );
+
+        /* Extract selected rows into U */
+        INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn,
+                            A(k, n), U(k, n) );
+
+        /*
+         * perm array is made of size tempkm for the first row especially.
+         * Otherwise, the final copy back to the tile may copy only a partial tile
+         */
+        INSERT_TASK_zlaswp_get( options, k*A->mb, tempkm,
+                                ipiv, k, A(k, n), U(k, n) );
+
+        for(m=k+1; m<A->mt; m++){
+            INSERT_TASK_zlaswp_batched( options, m*A->mb, minmn, k, m, n, (void *)ws,
+                                        ipiv, k, A, &(ws->U), clargs );
+        }
+        INSERT_TASK_zlaswp_batched_flush( options, k, n, ipiv, k, A, &(ws->U), clargs );
+
+        INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn,
+                            U(k, n), A(k, n) );
+
+        RUNTIME_data_flush( options->sequence, U(k, n) );
+        free( clargs );
+    }
+    break;
+    default:
+        ;
+    }
+}
+
 static inline void
 chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws,
                                 CHAM_desc_t                *A,
@@ -463,7 +515,12 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws,
     tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
     tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
 
-    chameleon_pzgetrf_panel_permute( ws, A, ipiv, k, n, options );
+    if ( ws->batch_size > 0 ) {
+        chameleon_pzgetrf_panel_permute_batched( ws, A, ipiv, k, n, options );
+    }
+    else {
+        chameleon_pzgetrf_panel_permute( ws, A, ipiv, k, n, options );
+    }
 
     INSERT_TASK_ztrsm(
         options,
@@ -536,11 +593,21 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws,
     }
 
     /* Backward pivoting */
-    for (k = 1; k < min_mnt; k++) {
-        for (n = 0; n < k; n++) {
-            chameleon_pzgetrf_panel_permute( ws, A, IPIV, k, n, &options );
+    if ( ws->batch_size > 0 ) {
+        for (k = 1; k < min_mnt; k++) {
+            for (n = 0; n < k; n++) {
+                chameleon_pzgetrf_panel_permute_batched( ws, A, IPIV, k, n, &options );
+            }
+            RUNTIME_perm_flushk( sequence, IPIV, k );
+        }
+    }
+    else {
+        for (k = 1; k < min_mnt; k++) {
+            for (n = 0; n < k; n++) {
+                chameleon_pzgetrf_panel_permute( ws, A, IPIV, k, n, &options );
+            }
+            RUNTIME_perm_flushk( sequence, IPIV, k );
         }
-        RUNTIME_perm_flushk( sequence, IPIV, k );
     }
 
     /* Initialize IPIV with default values if needed */
diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h
index 93a5f6e303c8b00076e78fbe7faf58fc59dfe4f7..236482682266032654bcc6a8e6050b617134fa98 100644
--- a/include/chameleon/tasks_z.h
+++ b/include/chameleon/tasks_z.h
@@ -198,6 +198,19 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
                              const CHAM_ipiv_t *tIPIV, int tIPIVk,
                              const CHAM_desc_t *tileA, int tileAm, int tileAn,
                              const CHAM_desc_t *tileB, int tileBm, int tileBn );
+void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options,
+                                 int m0, int minmn, int k, int m, int n,
+                                 void *ws,
+                                 const CHAM_ipiv_t *ipiv, int ipivk,
+                                 const CHAM_desc_t *A,
+                                 const CHAM_desc_t *U,
+                                 void **clargs_ptr );
+void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options,
+                                       int k, int n,
+                                       const CHAM_ipiv_t *ipiv, int ipivk,
+                                       const CHAM_desc_t *A,
+                                       const CHAM_desc_t *U,
+                                       void **clargs_ptr );
 void INSERT_TASK_zlatro( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, cham_trans_t trans, int m, int n, int mb,
                          const CHAM_desc_t *A, int Am, int An,
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index 08279345b7f5d95ba633f3143f7c1b39fe2d6352..6b24081b2bd7f58f330e28b142f8c714ba208009 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -83,6 +83,7 @@ set(CODELETS_ZSRC
     codelets/codelet_zlaset.c
     codelets/codelet_zlaset2.c
     codelets/codelet_zlaswp.c
+    codelets/codelet_zlaswp_batched.c
     codelets/codelet_zlatro.c
     codelets/codelet_zlauum.c
     codelets/codelet_zplghe.c
diff --git a/runtime/openmp/codelets/codelet_zlaswp_batched.c b/runtime/openmp/codelets/codelet_zlaswp_batched.c
new file mode 100644
index 0000000000000000000000000000000000000000..49ac5381ca1d1e4fe3bfb562811675b3d909765b
--- /dev/null
+++ b/runtime/openmp/codelets/codelet_zlaswp_batched.c
@@ -0,0 +1,65 @@
+/**
+ *
+ * @file openmp/codelet_zlaswp_batched.c
+ *
+ * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon OpenMP codelets to apply zlaswp on a panel
+ *
+ * @version 1.3.0
+ * @author Alycia Lisito
+ * @date 2024-10-21
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_openmp.h"
+#include "chameleon/tasks_z.h"
+
+void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options,
+                                 int                     m0,
+                                 int                     minmn,
+                                 int                     k,
+                                 int                     m,
+                                 int                     n,
+                                 void                   *ws,
+                                 const CHAM_ipiv_t      *ipiv,
+                                 int                     ipivk,
+                                 const CHAM_desc_t      *A,
+                                 const CHAM_desc_t      *Wu,
+                                 void                  **clargs_ptr )
+{
+    (void)options;
+    (void)m0;
+    (void)minmn;
+    (void)k;
+    (void)m;
+    (void)n;
+    (void)ws;
+    (void)ipiv;
+    (void)ipivk;
+    (void)A;
+    (void)Wu;
+    (void)clargs_ptr;
+}
+
+void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options,
+                                       int                     k,
+                                       int                     n,
+                                       const CHAM_ipiv_t      *ipiv,
+                                       int                     ipivk,
+                                       const CHAM_desc_t      *A,
+                                       const CHAM_desc_t      *U,
+                                       void                  **clargs_ptr )
+{
+    (void)options;
+    (void)k;
+    (void)n;
+    (void)ipiv;
+    (void)ipivk;
+    (void)A;
+    (void)U;
+    (void)clargs_ptr;
+}
diff --git a/runtime/parsec/codelets/codelet_zlaswp_batched.c b/runtime/parsec/codelets/codelet_zlaswp_batched.c
new file mode 100644
index 0000000000000000000000000000000000000000..aa8726690b25d23b6cdd3ea6ff525b9c36be12d3
--- /dev/null
+++ b/runtime/parsec/codelets/codelet_zlaswp_batched.c
@@ -0,0 +1,65 @@
+/**
+ *
+ * @file parsec/codelet_zlaswp_batched.c
+ *
+ * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon Parsec codelets to apply zlaswp on a panel
+ *
+ * @version 1.3.0
+ * @author Alycia Lisito
+ * @date 2024-10-21
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_parsec.h"
+#include "chameleon/tasks_z.h"
+
+void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options,
+                                 int                     m0,
+                                 int                     minmn,
+                                 int                     k,
+                                 int                     m,
+                                 int                     n,
+                                 void                   *ws,
+                                 const CHAM_ipiv_t      *ipiv,
+                                 int                     ipivk,
+                                 const CHAM_desc_t      *A,
+                                 const CHAM_desc_t      *Wu,
+                                 void                  **clargs_ptr )
+{
+    (void)options;
+    (void)m0;
+    (void)minmn;
+    (void)k;
+    (void)m;
+    (void)n;
+    (void)ws;
+    (void)ipiv;
+    (void)ipivk;
+    (void)A;
+    (void)Wu;
+    (void)clargs_ptr;
+}
+
+void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options,
+                                       int                     k,
+                                       int                     n,
+                                       const CHAM_ipiv_t      *ipiv,
+                                       int                     ipivk,
+                                       const CHAM_desc_t      *A,
+                                       const CHAM_desc_t      *U,
+                                       void                  **clargs_ptr )
+{
+    (void)options;
+    (void)k;
+    (void)n;
+    (void)ipiv;
+    (void)ipivk;
+    (void)A;
+    (void)U;
+    (void)clargs_ptr;
+}
diff --git a/runtime/quark/codelets/codelet_zlaswp_batched.c b/runtime/quark/codelets/codelet_zlaswp_batched.c
new file mode 100644
index 0000000000000000000000000000000000000000..f96414f27d29f448b7856d1e913e42cc4e15fcff
--- /dev/null
+++ b/runtime/quark/codelets/codelet_zlaswp_batched.c
@@ -0,0 +1,65 @@
+/**
+ *
+ * @file quark/codelet_zlaswp_batched.c
+ *
+ * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon quark codelets to apply zlaswp on a panel
+ *
+ * @version 1.3.0
+ * @author Alycia Lisito
+ * @date 2024-10-21
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_quark.h"
+#include "chameleon/tasks_z.h"
+
+void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options,
+                                 int                     m0,
+                                 int                     minmn,
+                                 int                     k,
+                                 int                     m,
+                                 int                     n,
+                                 void                   *ws,
+                                 const CHAM_ipiv_t      *ipiv,
+                                 int                     ipivk,
+                                 const CHAM_desc_t      *A,
+                                 const CHAM_desc_t      *Wu,
+                                 void                  **clargs_ptr )
+{
+    (void)options;
+    (void)m0;
+    (void)minmn;
+    (void)k;
+    (void)m;
+    (void)n;
+    (void)ws;
+    (void)ipiv;
+    (void)ipivk;
+    (void)A;
+    (void)Wu;
+    (void)clargs_ptr;
+}
+
+void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options,
+                                       int                     k,
+                                       int                     n,
+                                       const CHAM_ipiv_t      *ipiv,
+                                       int                     ipivk,
+                                       const CHAM_desc_t      *A,
+                                       const CHAM_desc_t      *U,
+                                       void                  **clargs_ptr )
+{
+    (void)options;
+    (void)k;
+    (void)n;
+    (void)ipiv;
+    (void)ipivk;
+    (void)A;
+    (void)U;
+    (void)clargs_ptr;
+}
diff --git a/runtime/starpu/codelets/codelet_zlaswp_batched.c b/runtime/starpu/codelets/codelet_zlaswp_batched.c
new file mode 100644
index 0000000000000000000000000000000000000000..6af43659c2768c2443684de411297fab9a68e003
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_zlaswp_batched.c
@@ -0,0 +1,141 @@
+/**
+ *
+ * @file starpu/codelet_zlaswp_batched.c
+ *
+ * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon StarPU codelets to apply zlaswp on a panel
+ *
+ * @version 1.3.0
+ * @author Alycia Lisito
+ * @date 2024-10-21
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_starpu_internal.h"
+#include "runtime_codelet_z.h"
+
+struct cl_laswp_batched_args_t {
+    int                      tasks_nbr;
+    int                      minmn;
+    int                      m0[CHAMELEON_BATCH_SIZE];
+    struct starpu_data_descr handle_mode[CHAMELEON_BATCH_SIZE];
+};
+
+#if !defined(CHAMELEON_SIMULATION)
+static void
+cl_zlaswp_batched_cpu_func( void *descr[],
+                            void *cl_arg )
+{
+    int          i, m0, minmn, *perm, *invp;
+    CHAM_tile_t *A, *U, *B;
+    struct cl_laswp_batched_args_t *clargs = ( struct cl_laswp_batched_args_t * ) cl_arg;
+
+    minmn = clargs->minmn;
+    perm = (int *)STARPU_VECTOR_GET_PTR( descr[0] );
+    invp = (int *)STARPU_VECTOR_GET_PTR( descr[1] );
+    U    = (CHAM_tile_t *) cti_interface_get( descr[2] );
+    B    = (CHAM_tile_t *) cti_interface_get( descr[3] );
+
+    for ( i = 0; i < clargs->tasks_nbr; i++ ) {
+        A  = (CHAM_tile_t *) cti_interface_get( descr[ i + 4 ] );
+        m0 = clargs->m0[ i ];
+        TCORE_zlaswp_get( m0, A->m, A->n, minmn, A, U, perm );
+        TCORE_zlaswp_set( m0, A->m, A->n, minmn, B, A, invp );
+    }
+}
+#endif
+
+/*
+ * Codelet definition
+ */
+CODELETS_CPU( zlaswp_batched, cl_zlaswp_batched_cpu_func )
+
+void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options,
+                                 int                     m0,
+                                 int                     minmn,
+                                 int                     k,
+                                 int                     m,
+                                 int                     n,
+                                 void                   *ws,
+                                 const CHAM_ipiv_t      *ipiv,
+                                 int                     ipivk,
+                                 const CHAM_desc_t      *A,
+                                 const CHAM_desc_t      *Wu,
+                                 void                  **clargs_ptr )
+{
+    int task_num   = 0;
+    int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size;
+    int nhandles;
+    struct cl_laswp_batched_args_t *clargs = *clargs_ptr;
+    if ( A->get_rankof( A, m, n) != A->myrank ) {
+        return;
+    }
+
+    if( clargs == NULL ) {
+        clargs = malloc( sizeof( struct cl_laswp_batched_args_t ) ) ;
+        clargs->tasks_nbr = 0;
+        clargs->minmn     = minmn;
+        *clargs_ptr       = clargs;
+    }
+
+    task_num               = clargs->tasks_nbr;
+    clargs->m0[ task_num ] = m0;
+    clargs->handle_mode[ task_num ].handle = RTBLKADDR(A, CHAMELEON_Complex64_t, m, n);
+    clargs->handle_mode[ task_num ].mode   = STARPU_RW;
+    clargs->tasks_nbr ++;
+
+    if ( clargs->tasks_nbr == batch_size ) {
+        nhandles = clargs->tasks_nbr;
+        rt_starpu_insert_task(
+            &cl_zlaswp_batched,
+            STARPU_CL_ARGS,             clargs, sizeof(struct cl_laswp_batched_args_t),
+            STARPU_R,                   RUNTIME_perm_getaddr( ipiv, ipivk ),
+            STARPU_R,                   RUNTIME_invp_getaddr( ipiv, ipivk ),
+            STARPU_RW | STARPU_COMMUTE, RTBLKADDR(Wu, ChamComplexDouble, A->myrank, n),
+            STARPU_R,                   RTBLKADDR(A, ChamComplexDouble, k, n),
+            STARPU_DATA_MODE_ARRAY,     clargs->handle_mode, nhandles,
+            STARPU_PRIORITY,            options->priority,
+            STARPU_EXECUTE_ON_WORKER,   options->workerid,
+            0 );
+
+        /* clargs is freed by starpu. */
+        *clargs_ptr = NULL;
+    }
+}
+
+void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options,
+                                       int                     k,
+                                       int                     n,
+                                       const CHAM_ipiv_t      *ipiv,
+                                       int                     ipivk,
+                                       const CHAM_desc_t      *A,
+                                       const CHAM_desc_t      *U,
+                                       void                  **clargs_ptr )
+{
+    struct cl_laswp_batched_args_t *clargs   = *clargs_ptr;
+    int                             nhandles;
+
+    if( clargs == NULL ) {
+        return;
+    }
+
+    nhandles = clargs->tasks_nbr;
+    rt_starpu_insert_task(
+        &cl_zlaswp_batched,
+        STARPU_CL_ARGS,             clargs, sizeof(struct cl_laswp_batched_args_t),
+        STARPU_R,                   RUNTIME_perm_getaddr( ipiv, ipivk ),
+        STARPU_R,                   RUNTIME_invp_getaddr( ipiv, ipivk ),
+        STARPU_RW | STARPU_COMMUTE, RTBLKADDR(U, ChamComplexDouble, k, n),
+        STARPU_R,                   RTBLKADDR(A, ChamComplexDouble, k, n),
+        STARPU_DATA_MODE_ARRAY,     clargs->handle_mode, nhandles,
+        STARPU_PRIORITY,            options->priority,
+        STARPU_EXECUTE_ON_WORKER,   options->workerid,
+        0 );
+
+    /* clargs is freed by starpu. */
+    *clargs_ptr = NULL;
+}