diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c
index 6db9a8a40148a67fcaeda74f9a718c949b12e59d..635bbbb84a3436564d04476f4e366f9ac5edb10b 100644
--- a/compute/pzgetrf.c
+++ b/compute/pzgetrf.c
@@ -150,7 +150,7 @@ chameleon_pzgetrf_panel_facto_percol( struct chameleon_pzgetrf_s *ws,
         }
 
         /* Reduce globally (between MPI processes) */
-        INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, h, tempkn );
+        INSERT_TASK_zipiv_allreduce( options, A, ipiv, k, h, tempkn, ws );
     }
 
     /* Flush temporary data used for the pivoting */
@@ -196,7 +196,7 @@ chameleon_pzgetrf_panel_facto_percol_batched( struct chameleon_pzgetrf_s *ws,
         }
         INSERT_TASK_zgetrf_panel_offdiag_batched_flush( options, A, k, clargs, ipiv );
 
-        INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, h, tempkn );
+        INSERT_TASK_zipiv_allreduce( options, A, ipiv, k, h, tempkn, ws );
     }
 
     free( clargs );
@@ -250,7 +250,7 @@ chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws,
 
             assert( j <= minmn );
             /* Reduce globally (between MPI processes) */
-            INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, j, tempkn );
+            INSERT_TASK_zipiv_allreduce( options, A, ipiv, k, j, tempkn, ws );
 
             if ( ( b < (nbblock-1) ) && ( h == hmax-1 ) ) {
                 INSERT_TASK_zgetrf_blocked_trsm(
@@ -312,7 +312,7 @@ chameleon_pzgetrf_panel_facto_blocked_batched( struct chameleon_pzgetrf_s *ws,
 
             assert( j <= minmn );
             /* Reduce globally (between MPI processes) */
-            INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, j, tempkn );
+            INSERT_TASK_zipiv_allreduce( options, A, ipiv, k, j, tempkn, ws );
 
             if ( (b < (nbblock-1)) && (h == hmax-1) ) {
                 INSERT_TASK_zgetrf_blocked_trsm(
@@ -338,10 +338,12 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws,
                                int                         k,
                                RUNTIME_option_t           *options )
 {
+#if defined(CHAMELEON_USE_MPI)
     chameleon_get_proc_involved_in_panelk_2dbc( A, k, k, ws );
     if ( !ws->involved ) {
         return;
     }
+#endif
 
     /* TODO: Should be replaced by a function pointer */
     switch( ws->alg ) {
@@ -350,7 +352,7 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws,
         break;
 
     case ChamGetrfPPivPerColumn:
-        if ( ws->batch_size > 0 ) {
+        if ( ws->batch_size_blas2 > 0 ) {
             chameleon_pzgetrf_panel_facto_percol_batched( ws, A, ipiv, k, options );
         }
         else {
@@ -359,7 +361,7 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws,
         break;
 
     case ChamGetrfPPiv:
-        if ( ws->batch_size > 0 ) {
+        if ( ws->batch_size_blas2 > 0 ) {
             chameleon_pzgetrf_panel_facto_blocked_batched( ws, A, ipiv, k, options );
         }
         else {
@@ -392,19 +394,6 @@ chameleon_pzgetrf_panel_permute( struct chameleon_pzgetrf_s *ws,
         int tempkm, tempkn, tempnn, minmn;
         int withlacpy;
 
-        chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws );
-        if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) {
-            INSERT_TASK_zperm_allreduce_send_perm( options, ipiv, k, A->myrank, ws->np_involved, ws->proc_involved );
-            INSERT_TASK_zperm_allreduce_send_invp( options, ipiv, k, A, k, n );
-        }
-        if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) {
-            INSERT_TASK_zperm_allreduce_send_A( options, A, k, n, A->myrank, ws->np_involved, ws->proc_involved );
-        }
-
-        if ( !ws->involved ) {
-            return;
-        }
-
         tempkm = A->get_blkdim( A, k, DIM_m, A->m );
         tempkn = A->get_blkdim( A, k, DIM_n, A->n );
         tempnn = A->get_blkdim( A, n, DIM_n, A->n );
@@ -433,8 +422,7 @@ chameleon_pzgetrf_panel_permute( struct chameleon_pzgetrf_s *ws,
                                     ipiv, k, A(k, n), A(m, n) );
         }
 
-        INSERT_TASK_zperm_allreduce( options, A, ipiv, k, k, n,
-                                     Wu(A->myrank, n), ws );
+        INSERT_TASK_zperm_allreduce( options, A, Wu(A->myrank, n), ipiv, k, k, n, ws );
     }
     break;
     default:
@@ -458,19 +446,6 @@ chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws,
         int tempkm, tempkn, tempnn, minmn;
         int withlacpy;
 
-        chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws );
-        if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) {
-            INSERT_TASK_zperm_allreduce_send_perm( options, ipiv, k, A->myrank, ws->np_involved, ws->proc_involved );
-            INSERT_TASK_zperm_allreduce_send_invp( options, ipiv, k, A, k, n );
-        }
-        if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) {
-            INSERT_TASK_zperm_allreduce_send_A( options, A, k, n, A->myrank, ws->np_involved, ws->proc_involved );
-        }
-
-        if ( !ws->involved ) {
-            return;
-        }
-
         void **clargs = malloc( sizeof(char *) );
         *clargs = NULL;
 
@@ -499,7 +474,7 @@ chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws,
         }
         INSERT_TASK_zlaswp_batched_flush( options, ipiv, k, A(k, n), Wu(A->myrank, n), clargs );
 
-        INSERT_TASK_zperm_allreduce( options, A, ipiv, k, k, n, Wu(A->myrank, n), ws );
+        INSERT_TASK_zperm_allreduce( options, A, Wu(A->myrank, n), ipiv, k, k, n, ws );
 
         free( clargs );
     }
@@ -509,6 +484,80 @@ chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws,
     }
 }
 
+static inline void
+chameleon_pzgetrf_panel_permute_forward( struct chameleon_pzgetrf_s *ws,
+                                         CHAM_desc_t                *A,
+                                         CHAM_ipiv_t                *ipiv,
+                                         int                         k,
+                                         int                         n,
+                                         RUNTIME_option_t           *options )
+{
+#if defined(CHAMELEON_USE_MPI)
+    chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws ); /* presumably refreshes ws->involved, ws->np_involved and ws->proc_involved for (k, n) - TODO confirm */
+    if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { /* rank of the diagonal tile (k, k) */
+        INSERT_TASK_zperm_allreduce_send_perm( options, ipiv, k, A->myrank, ws->np_involved, ws->proc_involved );
+        INSERT_TASK_zperm_allreduce_send_invp( options, ipiv, k, A, k, n );
+    }
+    if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { /* rank of the tile (k, n) */
+        INSERT_TASK_zperm_allreduce_send_A( options, A, k, n, A->myrank, ws->np_involved, ws->proc_involved );
+    }
+    /* The send tasks above are submitted first; a rank with ws->involved == 0 then submits nothing else */
+    if ( !ws->involved ) {
+        return;
+    }
+#endif
+    /* Submit the permutation for (k, n): batched variant when batch_size_swap (CHAMELEON_GETRF_BATCH_SIZE_SWAP) is positive */
+    if ( ws->batch_size_swap > 0 ) {
+        chameleon_pzgetrf_panel_permute_batched( ws, A, ipiv, k, n, options );
+    }
+    else {
+        chameleon_pzgetrf_panel_permute( ws, A, ipiv, k, n, options );
+    }
+}
+
+static inline void
+chameleon_pzgetrf_panel_permute_backward( struct chameleon_pzgetrf_s *ws,
+                                          CHAM_desc_t                *A,
+                                          CHAM_ipiv_t                *ipiv,
+                                          int                         k,
+                                          int                         n,
+                                          RUNTIME_option_t           *options,
+                                          RUNTIME_sequence_t         *sequence )
+{
+    int tempkm, tempnn;
+    /* Same submission as the forward permutation, followed by a copy of Wu(A->myrank, n) into A(k, n) */
+#if defined(CHAMELEON_USE_MPI)
+    chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws ); /* presumably refreshes ws->involved, ws->np_involved and ws->proc_involved for (k, n) - TODO confirm */
+    if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { /* rank of the diagonal tile (k, k) */
+        INSERT_TASK_zperm_allreduce_send_perm( options, ipiv, k, A->myrank, ws->np_involved, ws->proc_involved );
+        INSERT_TASK_zperm_allreduce_send_invp( options, ipiv, k, A, k, n );
+    }
+    if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { /* rank of the tile (k, n) */
+        INSERT_TASK_zperm_allreduce_send_A( options, A, k, n, A->myrank, ws->np_involved, ws->proc_involved );
+    }
+
+    if ( !ws->involved ) { /* note: this early return also skips the copy-back below */
+        return;
+    }
+#endif
+    /* Batched variant when batch_size_swap (CHAMELEON_GETRF_BATCH_SIZE_SWAP) is positive */
+    if ( ws->batch_size_swap > 0 ) {
+        chameleon_pzgetrf_panel_permute_batched( ws, A, ipiv, k, n, options );
+    }
+    else {
+        chameleon_pzgetrf_panel_permute( ws, A, ipiv, k, n, options );
+    }
+
+    if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) {
+        /* Only the rank of tile (k, n) copies the workspace tile into the matrix, then flushes A(k, n) */
+        tempkm = A->get_blkdim( A, k, DIM_m, A->m );
+        tempnn = A->get_blkdim( A, n, DIM_n, A->n );
+        INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn,
+                            Wu(A->myrank, n), A(k, n) );
+        RUNTIME_data_flush( sequence, A(k, n) );
+    }
+}
+
 static inline void
 chameleon_pzgetrf_panel_update_ws( struct chameleon_pzgetrf_s *ws,
                                    CHAM_desc_t                *A,
@@ -516,7 +565,7 @@ chameleon_pzgetrf_panel_update_ws( struct chameleon_pzgetrf_s *ws,
                                    RUNTIME_option_t           *options )
 {
     CHAM_context_t  *chamctxt = chameleon_context_self();
-    int m, tempmm, tempkn, q;
+    int m, n, tempmm, tempkn, tempkm, p, q, involved, np;
     int lookahead = chamctxt->lookahead;
     int P         = chameleon_desc_datadist_get_iparam(A, 0);
     int Q         = chameleon_desc_datadist_get_iparam(A, 1);
@@ -561,6 +610,44 @@ chameleon_pzgetrf_panel_update_ws( struct chameleon_pzgetrf_s *ws,
             RUNTIME_data_flush( options->sequence, A(m, k) );
         }
     }
+
+    tempkm = A->get_blkdim( A, k, DIM_m, A->m );
+    np = chameleon_desc_datadist_get_iparam(A, 1) * chameleon_desc_datadist_get_iparam(A, 0);
+#if defined(CHAMELEON_USE_MPI)
+    /* Send Akk for replicated trsm */
+    if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) {
+        for ( p = 0; p < np; p++ ) {
+            involved = 0;
+            for ( n = k+1; n < A->nt; n++ ) {
+                if ( chameleon_p_involved_in_panelk_2dbc( A, n, p ) ) {
+                    involved = 1;
+                    break;
+                }
+            }
+            if ( involved ) {
+                INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempkn,
+                                    A(k, k), Wu(p, k) );
+            }
+        }
+    }
+    else {
+        involved = 0;
+        for ( n = k+1; n < A->nt; n++ ) {
+            if ( chameleon_involved_in_panelk_2dbc( A, n ) ) {
+                involved = 1;
+                break;
+            }
+        }
+        if ( involved ) {
+            INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempkn,
+                                A(k, k), Wu(A->myrank, k) );
+        }
+    }
+#else
+    INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempkn,
+                        A(k, k), Wu(A->myrank, k) );
+#endif
+    RUNTIME_data_flush( options->sequence, A(k, k) );
 }
 
 static inline void
@@ -584,30 +671,17 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws,
     tempkm = A->get_blkdim( A, k, DIM_m, A->m );
     tempnn = A->get_blkdim( A, n, DIM_n, A->n );
 
-    if ( ws->batch_size > 0 ) {
-        chameleon_pzgetrf_panel_permute_batched( ws, A, ipiv, k, n, options );
-    }
-    else {
-        chameleon_pzgetrf_panel_permute( ws, A, ipiv, k, n, options );
-    }
+    chameleon_pzgetrf_panel_permute_forward( ws, A, ipiv, k, n, options );
 
-    if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) {
-        for ( p = 0; p < ws->np_involved; p++ ) {
-            INSERT_TASK_ztrsm(
-                options,
-                ChamLeft, ChamLower, ChamNoTrans, ChamUnit,
-                tempkm, tempnn, A->mb,
-                zone, A(k, k),
-                      Wu(ws->proc_involved[p], n) );
-            RUNTIME_data_flush( options->sequence, Wu(ws->proc_involved[p], n) );
-        }
-    }
-    else if ( ws->involved ) {
+#if defined(CHAMELEON_USE_MPI)
+    if ( ws->involved )
+#endif
+    {
         INSERT_TASK_ztrsm(
             options,
             ChamLeft, ChamLower, ChamNoTrans, ChamUnit,
             tempkm, tempnn, A->mb,
-            zone, A(k, k),
+            zone, Wu(A->myrank, k),
                   Wu(A->myrank, n) );
     }
 
@@ -632,7 +706,6 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws,
     }
 
     RUNTIME_data_flush( options->sequence, Wu(A->myrank, n) );
-    RUNTIME_data_flush( options->sequence, A(k, k) );
     RUNTIME_data_flush( options->sequence, A(k, n) );
 }
 
@@ -683,54 +756,26 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws,
         }
 
         /* Flush panel k */
-        for (m = k; m < A->mt; m++) {
+        for (m = k+1; m < A->mt; m++) {
             RUNTIME_data_flush( sequence, A(m, k) );
         }
+        RUNTIME_data_flush( sequence, Wu(A->myrank, k) );
 
         RUNTIME_iteration_pop( chamctxt );
     }
     CHAMELEON_Desc_Flush( &(ws->Wl), sequence );
 
     /* Backward pivoting */
-    if ( ws->batch_size > 0 ) {
-        for (k = 1; k < min_mnt; k++) {
-            for (n = 0; n < k; n++) {
-                if ( chameleon_involved_in_panelk_2dbc( A, k ) ||
-                    chameleon_involved_in_panelk_2dbc( A, n ) )
-                {
-                    chameleon_pzgetrf_panel_permute_batched( ws, A, IPIV, k, n, &options );
-                    if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) {
-                        tempkm = A->get_blkdim( A, k, DIM_m, A->m );
-                        tempnn = A->get_blkdim( A, n, DIM_n, A->n );
-                        INSERT_TASK_zlacpy( &options, ChamUpperLower, tempkm, tempnn,
-                                            Wu(A->myrank, n), A(k, n) );
-                        RUNTIME_data_flush( sequence, A(k, n) );
-                    }
-                }
-                RUNTIME_data_flush( sequence, Wu(A->myrank, n) );
-            }
-            RUNTIME_perm_flushk( sequence, IPIV, k );
-        }
-    }
-    else {
-        for (k = 1; k < min_mnt; k++) {
-            for (n = 0; n < k; n++) {
-                if ( chameleon_involved_in_panelk_2dbc( A, k ) ||
-                    chameleon_involved_in_panelk_2dbc( A, n ) )
-                {
-                    chameleon_pzgetrf_panel_permute( ws, A, IPIV, k, n, &options );
-                    if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) {
-                        tempkm = A->get_blkdim( A, k, DIM_m, A->m );
-                        tempnn = A->get_blkdim( A, n, DIM_n, A->n );
-                        INSERT_TASK_zlacpy( &options, ChamUpperLower, tempkm, tempnn,
-                                            Wu(A->myrank, n), A(k, n) );
-                        RUNTIME_data_flush( sequence, A(k, n) );
-                    }
-                }
-                RUNTIME_data_flush( sequence, Wu(A->myrank, n) );
+    for (k = 1; k < min_mnt; k++) {
+        for (n = 0; n < k; n++) {
+            if ( chameleon_involved_in_panelk_2dbc( A, k ) ||
+                 chameleon_involved_in_panelk_2dbc( A, n ) )
+            {
+                chameleon_pzgetrf_panel_permute_backward( ws, A, IPIV, k, n, &options, sequence );
             }
-            RUNTIME_perm_flushk( sequence, IPIV, k );
+            RUNTIME_data_flush( sequence, Wu(A->myrank, n) );
         }
+        RUNTIME_perm_flushk( sequence, IPIV, k );
     }
     CHAMELEON_Desc_Flush( &(ws->Wu), sequence );
 
diff --git a/compute/zgetrf.c b/compute/zgetrf.c
index 514e89d3e375a38d487efcddf6aee07505660f00..254020a55c478dcb6982d5b002fff2d5e69c9902 100644
--- a/compute/zgetrf.c
+++ b/compute/zgetrf.c
@@ -98,10 +98,35 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A )
         chameleon_cleanenv( algostr );
     }
 
-    ws->batch_size = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE", 0 );
-    if ( ws->batch_size > CHAMELEON_BATCH_SIZE ) {
-        chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE value\n" );
-        ws->batch_size = CHAMELEON_BATCH_SIZE;
+    {
+        /* All-reduce variant: "cham_spu_tasks" (case-insensitive) is the only value parsed here */
+        char *allreduce = chameleon_getenv( "CHAMELEON_GETRF_ALL_REDUCE" );
+
+        /* Explicit default, also used when the variable is unset or not recognized */
+        ws->alg_allreduce = ChamStarPUTasks;
+        if ( ( allreduce != NULL ) &&
+             ( strcasecmp( allreduce, "cham_spu_tasks" ) != 0 ) )
+        {
+            /* The message names the value that is actually accepted by the test above */
+            chameleon_error( "CHAMELEON_zgetrf_WS_Alloc", "CHAMELEON_GETRF_ALL_REDUCE is not one of cham_spu_tasks => Switch back to cham_spu_tasks\n" );
+        }
+        chameleon_cleanenv( allreduce );
+    }
+
+    ws->batch_size_blas2 = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_BLAS2", 0 );
+    if ( ws->batch_size_blas2 > CHAMELEON_BATCH_SIZE ) {
+        chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE_BLAS2 must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE_BLAS2 value\n" );
+        ws->batch_size_blas2 = CHAMELEON_BATCH_SIZE;
+    }
+    ws->batch_size_blas3 = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_BLAS3", 0 );
+    if ( ws->batch_size_blas3 > CHAMELEON_BATCH_SIZE ) {
+        chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE_BLAS3 must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE_BLAS3 value\n" );
+        ws->batch_size_blas3 = CHAMELEON_BATCH_SIZE;
+    }
+    ws->batch_size_swap = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_SWAP", 0 );
+    if ( ws->batch_size_swap > CHAMELEON_BATCH_SIZE ) {
+        chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE_SWAP must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE_SWAP value\n" );
+        ws->batch_size_swap = CHAMELEON_BATCH_SIZE;
     }
 
     ws->ringswitch = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_RINGSWITCH", INT_MAX );
diff --git a/control/compute_z.h b/control/compute_z.h
index b75c303a1131e5c9f142803e66a227fe0c04de91..1229a1797915be3d358cf018f3b779bca8aeefe2 100644
--- a/control/compute_z.h
+++ b/control/compute_z.h
@@ -43,17 +43,20 @@ struct chameleon_pzgemm_s {
  * @brief Data structure to handle the GETRF workspaces with partial pivoting
  */
 struct chameleon_pzgetrf_s {
-    cham_getrf_t   alg;
-    int            ib;         /**< Internal blocking parameter                   */
-    int            batch_size; /**< Batch size for the panel                      */
-    int            ringswitch; /**< Define when to switch to ring bcast           */
-    CHAM_desc_t    U;
-    CHAM_desc_t    Up;         /**< Workspace used for the panel factorization    */
-    CHAM_desc_t    Wu;         /**< Workspace used for the permutation and update */
-    CHAM_desc_t    Wl;         /**< Workspace used the update                     */
-    int           *proc_involved;
-    unsigned int   involved;
-    int            np_involved;
+    cham_getrf_t            alg;              /**< Panel factorization variant, switched on in chameleon_pzgetrf_panel_facto */
+    cham_getrf_allreduce_t  alg_allreduce;    /**< All-reduce variant, read from CHAMELEON_GETRF_ALL_REDUCE */
+    int                     ib;               /**< Internal blocking parameter */
+    int                     batch_size_blas2; /**< Batch size for the blas 2 operations of the panel factorization */
+    int                     batch_size_blas3; /**< Batch size for the blas 3 operations of the panel factorization */
+    int                     batch_size_swap;  /**< Batch size for the permutation */
+    int                     ringswitch;       /**< Define when to switch to ring bcast */
+    CHAM_desc_t             U;
+    CHAM_desc_t             Up;               /**< Workspace used for the panel factorization */
+    CHAM_desc_t             Wu;               /**< Workspace used for the permutation and update */
+    CHAM_desc_t             Wl;               /**< Workspace used for the update */
+    int                    *proc_involved;    /**< List of ranks, passed with np_involved to the zperm_allreduce_send_* tasks */
+    unsigned int            involved;         /**< Tested after chameleon_get_proc_involved_in_panelk_2dbc(); when zero the callers return early */
+    int                     np_involved;      /**< Number of ranks listed in proc_involved */
 };
 
 /**
diff --git a/control/descriptor_helpers.c b/control/descriptor_helpers.c
index d5e1430638e8b507e4ac6163869d87b3021d97f4..6a0492111203815999797bc0d40243af1d6391d3 100644
--- a/control/descriptor_helpers.c
+++ b/control/descriptor_helpers.c
@@ -100,6 +100,26 @@ int chameleon_involved_in_panelk_2dbc( const CHAM_desc_t *A, int k ) {
     return ( myrank % chameleon_desc_datadist_get_iparam(A,1) == k % chameleon_desc_datadist_get_iparam(A,1) );
 }
 
+/**
+ * @brief Test if the MPI process p is involved in the panel k for 2DBC distributions.
+ *
+ * @param[in] A
+ *        The matrix descriptor.
+ *
+ * @param[in] k
+ *        The index of the panel to test.
+ *
+ * @param[in] p
+ *        The rank of the MPI process to test (it does not have to be the calling rank).
+ *
+ * @return 1 if the MPI process p contributes to the panel k.
+ *         0 if the MPI process p doesn't contribute to the panel k.
+ *
+ */
+int chameleon_p_involved_in_panelk_2dbc( const CHAM_desc_t *A, int k, int p ) {
+    return ( p % chameleon_desc_datadist_get_iparam(A,1) == k % chameleon_desc_datadist_get_iparam(A,1) );
+}
+
 /**
  * @brief Test if the current MPI process is involved in the panel k for 2DBC distributions.
  *
diff --git a/include/chameleon/constants.h b/include/chameleon/constants.h
index 09bfc942f5fc1d2e829616b61b79b878324bf61c..88c1f653b66eaca7aa597e0e34de9b0b9ad416b2 100644
--- a/include/chameleon/constants.h
+++ b/include/chameleon/constants.h
@@ -290,6 +290,13 @@ typedef enum chameleon_getrf_e {
     ChamGetrfPPivPerColumn  = 3,
 } cham_getrf_t;
 
+/**
+ * @brief Chameleon GETRF all reduce algorithm variants
+ */
+typedef enum chameleon_getrf_allreduce_e {
+    ChamStarPUTasks, /**< Selected by CHAMELEON_GETRF_ALL_REDUCE=cham_spu_tasks; also the fallback value */
+} cham_getrf_allreduce_t;
+
 #define ChameleonTrd            1001
 #define ChameleonBrd            1002
 
diff --git a/include/chameleon/descriptor_helpers.h b/include/chameleon/descriptor_helpers.h
index 9e60ef27dda1d76e3af3c8f5fe3a2b427ae719c5..f8caf508060d4a2f5aaaf7a1ef12ce43d31505de 100644
--- a/include/chameleon/descriptor_helpers.h
+++ b/include/chameleon/descriptor_helpers.h
@@ -64,6 +64,7 @@ int chameleon_getrankof_custom        ( const CHAM_desc_t *A, int m, int n );
  */
 
 int chameleon_involved_in_panelk_2dbc( const CHAM_desc_t *A, int An );
+int chameleon_p_involved_in_panelk_2dbc( const CHAM_desc_t *A, int k, int p );
 void chameleon_get_proc_involved_in_panelk_2dbc( const CHAM_desc_t *A,
                                                  int                k,
                                                  int                n,
diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h
index 5f1bbcd322293e3104fca313974096bcf711de71..bf3831af524b2cbea33377a4ff8ab4ce1e124bb6 100644
--- a/include/chameleon/tasks_z.h
+++ b/include/chameleon/tasks_z.h
@@ -575,13 +575,13 @@ void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options,
                                       CHAM_desc_t *U, int Um, int Un,
                                       CHAM_ipiv_t *ws );
 
-void INSERT_TASK_zipiv_allreduce( CHAM_desc_t            *A,
-                                  const RUNTIME_option_t *options,
+void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options,
+                                  CHAM_desc_t            *A,
                                   CHAM_ipiv_t            *ipiv,
-                                  int                    *proc_involved,
                                   int                     k,
                                   int                     h,
-                                  int                     n );
+                                  int                     n,
+                                  void                   *ws );
 
 /**
  ********************************************************************************
@@ -600,6 +600,16 @@ void INSERT_TASK_zipiv_allreduce( CHAM_desc_t            *A,
  * @param[in] A
  *          The descriptor of the matrix A.
  *
+ * @param[inout] U
+ *          The descriptor of the workspace used for the permutation in the LU
+ *          factorization with partial pivoting.
+ *
+ * @param[in] Um
+ *          The row index of the tile used in U.
+ *
+ * @param[in] Un
+ *          The column index of the tile used in U.
+ *
  * @param[in] ipiv
  *          The pivot structure that contains the informations for the LU
  *          factorization with partial pivoting.
@@ -613,16 +623,6 @@ void INSERT_TASK_zipiv_allreduce( CHAM_desc_t            *A,
  * @param[in] n
  *          The number of columns in the tile U(Um, Un).
  *
- * @param[inout] U
- *          The descriptor of the worskpace used for the permutation in the LU
- *          factorization with partial pivoting.
- *
- * @param[in] Um
- *          The row index of the tile used in U.
- *
- * @param[in] Un
- *          The column index of the tile used in U.
- *
  * @param[in] ws
  *          The workspace to handle the data in the LU factorization with
  *          partial pivoting.
@@ -631,13 +631,13 @@ void INSERT_TASK_zipiv_allreduce( CHAM_desc_t            *A,
  */
 void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
                                   const CHAM_desc_t      *A,
+                                  CHAM_desc_t            *U,
+                                  int                     Um,
+                                  int                     Un,
                                   CHAM_ipiv_t            *ipiv,
                                   int                     ipivk,
                                   int                     k,
                                   int                     n,
-                                  CHAM_desc_t            *U,
-                                  int                     Um,
-                                  int                     Un,
                                   void                   *ws );
 
 /**
diff --git a/runtime/openmp/codelets/codelet_zipiv_allreduce.c b/runtime/openmp/codelets/codelet_zipiv_allreduce.c
index b088283254cd64e1bada1628939436327b8a2789..197842ea3e96fdba1a9e1d67152a8a5b3e6196ea 100644
--- a/runtime/openmp/codelets/codelet_zipiv_allreduce.c
+++ b/runtime/openmp/codelets/codelet_zipiv_allreduce.c
@@ -17,19 +17,19 @@
  */
 #include "chameleon_openmp.h"
 
-void INSERT_TASK_zipiv_allreduce( CHAM_desc_t            *A,
-                                  const RUNTIME_option_t *options,
+void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options,
+                                  CHAM_desc_t            *A,
                                   CHAM_ipiv_t            *ipiv,
-                                  int                    *proc_involved,
                                   int                     k,
                                   int                     h,
-                                  int                     n )
+                                  int                     n,
+                                  void                   *ws )
 {
-    (void)A;
     (void)options;
+    (void)A;
     (void)ipiv;
-    (void)proc_involved;
     (void)k;
     (void)h;
     (void)n;
+    (void)ws; /* OpenMP stub: no task is submitted, every argument is only silenced */
 }
diff --git a/runtime/openmp/codelets/codelet_zperm_allreduce.c b/runtime/openmp/codelets/codelet_zperm_allreduce.c
index cb77c806bcb8ce47a62e7b4e19b2dad3dafc8218..7aeb24faebda059ad96dec2819b8793d467eae05 100644
--- a/runtime/openmp/codelets/codelet_zperm_allreduce.c
+++ b/runtime/openmp/codelets/codelet_zperm_allreduce.c
@@ -71,23 +71,23 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
 void
 INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
                              const CHAM_desc_t      *A,
+                             CHAM_desc_t            *U,
+                             int                     Um,
+                             int                     Un,
                              CHAM_ipiv_t            *ipiv,
                              int                     ipivk,
                              int                     k,
                              int                     n,
-                             CHAM_desc_t            *U,
-                             int                     Um,
-                             int                     Un,
                              void                   *ws )
 {
     (void)options;
     (void)A;
+    (void)U; /* OpenMP stub: no task is submitted, every argument is only silenced */
+    (void)Um;
+    (void)Un;
     (void)ipiv;
     (void)ipivk;
     (void)k;
     (void)n;
-    (void)U;
-    (void)Um;
-    (void)Un;
     (void)ws;
 }
diff --git a/runtime/parsec/codelets/codelet_zipiv_allreduce.c b/runtime/parsec/codelets/codelet_zipiv_allreduce.c
index 75e0611647a464cad9c37e59a5619ebefaae19ed..d6bd3f4c06baf9b1c44e4db6971c88c09acd432f 100644
--- a/runtime/parsec/codelets/codelet_zipiv_allreduce.c
+++ b/runtime/parsec/codelets/codelet_zipiv_allreduce.c
@@ -17,19 +17,19 @@
  */
 #include "chameleon_parsec.h"
 
-void INSERT_TASK_zipiv_allreduce( CHAM_desc_t            *A,
-                                  const RUNTIME_option_t *options,
+void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options,
+                                  CHAM_desc_t            *A,
                                   CHAM_ipiv_t            *ipiv,
-                                  int                    *proc_involved,
                                   int                     k,
                                   int                     h,
-                                  int                     n )
+                                  int                     n,
+                                  void                   *ws )
 {
-    (void)A;
     (void)options;
+    (void)A;
     (void)ipiv;
-    (void)proc_involved;
     (void)k;
     (void)h;
     (void)n;
+    (void)ws; /* PaRSEC stub: no task is submitted, every argument is only silenced */
 }
diff --git a/runtime/parsec/codelets/codelet_zperm_allreduce.c b/runtime/parsec/codelets/codelet_zperm_allreduce.c
index 30890f8114b857b7c12804c526f4aa4c875b63a1..5acfa4a2b099785e7397807309d104d5421c34fb 100644
--- a/runtime/parsec/codelets/codelet_zperm_allreduce.c
+++ b/runtime/parsec/codelets/codelet_zperm_allreduce.c
@@ -71,23 +71,23 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
 void
 INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
                              const CHAM_desc_t      *A,
+                             CHAM_desc_t            *U,
+                             int                     Um,
+                             int                     Un,
                              CHAM_ipiv_t            *ipiv,
                              int                     ipivk,
                              int                     k,
                              int                     n,
-                             CHAM_desc_t            *U,
-                             int                     Um,
-                             int                     Un,
                              void                   *ws )
 {
     (void)options;
     (void)A;
+    (void)U; /* PaRSEC stub: no task is submitted, every argument is only silenced */
+    (void)Um;
+    (void)Un;
     (void)ipiv;
     (void)ipivk;
     (void)k;
     (void)n;
-    (void)U;
-    (void)Um;
-    (void)Un;
     (void)ws;
 }
diff --git a/runtime/quark/codelets/codelet_zipiv_allreduce.c b/runtime/quark/codelets/codelet_zipiv_allreduce.c
index e88269e931f3f210282a1382d44a6ff9516c7453..0186fd142b67d08dcfca01e9b8184b471362ce1c 100644
--- a/runtime/quark/codelets/codelet_zipiv_allreduce.c
+++ b/runtime/quark/codelets/codelet_zipiv_allreduce.c
@@ -17,19 +17,19 @@
  */
 #include "chameleon_quark.h"
 
-void INSERT_TASK_zipiv_allreduce( CHAM_desc_t            *A,
-                                  const RUNTIME_option_t *options,
+void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options,
+                                  CHAM_desc_t            *A,
                                   CHAM_ipiv_t            *ipiv,
-                                  int                    *proc_involved,
                                   int                     k,
                                   int                     h,
-                                  int                     n )
+                                  int                     n,
+                                  void                   *ws )
 {
-    (void)A;
     (void)options;
+    (void)A;
     (void)ipiv;
-    (void)proc_involved;
     (void)k;
     (void)h;
     (void)n;
+    (void)ws; /* stub: every argument is discarded and no task is inserted */
 }
diff --git a/runtime/quark/codelets/codelet_zperm_allreduce.c b/runtime/quark/codelets/codelet_zperm_allreduce.c
index 52281451dd038a9276a2040b9f4c08f7effa63f7..f6c5f98e6d59ed67db6ae9ca7dbe37abca31d617 100644
--- a/runtime/quark/codelets/codelet_zperm_allreduce.c
+++ b/runtime/quark/codelets/codelet_zperm_allreduce.c
@@ -71,23 +71,23 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
 void
 INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
                              const CHAM_desc_t      *A,
+                             CHAM_desc_t            *U,
+                             int                     Um,
+                             int                     Un,
                              CHAM_ipiv_t            *ipiv,
                              int                     ipivk,
                              int                     k,
                              int                     n,
-                             CHAM_desc_t            *U,
-                             int                     Um,
-                             int                     Un,
                              void                   *ws )
 {
     (void)options;
     (void)A;
+    (void)U; /* stub: every argument is discarded and no task is inserted */
+    (void)Um;
+    (void)Un;
     (void)ipiv;
     (void)ipivk;
     (void)k;
     (void)n;
-    (void)U;
-    (void)Um;
-    (void)Un;
     (void)ws;
 }
diff --git a/runtime/starpu/codelets/codelet_zgetrf_batched.c b/runtime/starpu/codelets/codelet_zgetrf_batched.c
index 2e04493df242f90fb18499abfab703724e90d197..011785aa2f459629adaabe457e77a908786ac14d 100644
--- a/runtime/starpu/codelets/codelet_zgetrf_batched.c
+++ b/runtime/starpu/codelets/codelet_zgetrf_batched.c
@@ -74,7 +74,7 @@ INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options,
                                           CHAM_ipiv_t *ipiv )
 {
     int          task_num   = 0;
-    int          batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size;
+    int          batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size_blas2;
     void (*callback)(void*) = NULL;
     struct cl_getrf_batched_args_t *clargs = *clargs_ptr;
     int rankA = A->get_rankof( A, Am, An );
@@ -241,8 +241,9 @@ INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options,
                                           void **clargs_ptr,
                                           CHAM_ipiv_t *ipiv )
 {
-    int          batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size;
-    int          ib         = ((struct chameleon_pzgetrf_s *)ws)->ib;
+    struct chameleon_pzgetrf_s *tmp = (struct chameleon_pzgetrf_s *) ws;
+    int          ib         = tmp->ib;
+    int          batch_size = ( (h % ib) != 0 ) ? tmp->batch_size_blas2 : tmp->batch_size_blas3;
     int          task_num   = 0;
     void (*callback)(void*) = NULL;
     int accessU, access_npiv, access_ipiv, access_ppiv;
diff --git a/runtime/starpu/codelets/codelet_zipiv_allreduce.c b/runtime/starpu/codelets/codelet_zipiv_allreduce.c
index a81f0d08eef1fb94b6846606b5e63aae64ab075c..48ecdd0c33fa07f9cfd326f775b0f31fb48a67b8 100644
--- a/runtime/starpu/codelets/codelet_zipiv_allreduce.c
+++ b/runtime/starpu/codelets/codelet_zipiv_allreduce.c
@@ -22,18 +22,18 @@
 struct cl_redux_args_t {
     int h;
     int n;
-    int k;
 };
 
-static void cl_zipiv_allreduce_cpu_func( void *descr[], void *cl_arg )
+static void
+zipiv_allreduce_cpu_func( cppi_interface_t *cppi_me,
+                          cppi_interface_t *cppi_src,
+                          int               h,
+                          int               n )
 {
-    struct cl_redux_args_t *clargs      = (struct cl_redux_args_t *) cl_arg;
-    cppi_interface_t       *cppi_me     = ((cppi_interface_t *) descr[0]);
-    cppi_interface_t       *cppi_src    = ((cppi_interface_t *) descr[1]);
-    CHAM_pivot_t           *nextpiv_me  = &(cppi_me->pivot);
-    CHAM_pivot_t           *nextpiv_src = &(cppi_src->pivot);
-    CHAMELEON_Complex64_t  *pivrow_me   = (CHAMELEON_Complex64_t *)(nextpiv_me->pivrow);
-    CHAMELEON_Complex64_t  *pivrow_src  = (CHAMELEON_Complex64_t *)(nextpiv_src->pivrow);
+    CHAM_pivot_t          *nextpiv_me  = &(cppi_me->pivot);
+    CHAM_pivot_t          *nextpiv_src = &(cppi_src->pivot);
+    CHAMELEON_Complex64_t *pivrow_me   = (CHAMELEON_Complex64_t *)(nextpiv_me->pivrow);
+    CHAMELEON_Complex64_t *pivrow_src  = (CHAMELEON_Complex64_t *)(nextpiv_src->pivrow);
 
     cppi_display_dbg( cppi_me,  stderr, "Global redux Inout: ");
     cppi_display_dbg( cppi_src, stderr, "Global redux Input: ");
@@ -43,33 +43,42 @@ static void cl_zipiv_allreduce_cpu_func( void *descr[], void *cl_arg )
     assert( cppi_me->flttype   == cppi_src->flttype   );
     assert( cppi_me->arraysize == cppi_src->arraysize );
 
-    if ( cabs( pivrow_src[ clargs->h ] ) > cabs( pivrow_me[ clargs->h ] ) ) {
+    if ( cabs( pivrow_src[ h ] ) > cabs( pivrow_me[ h ] ) ) {
         nextpiv_me->blkm0  = nextpiv_src->blkm0;
         nextpiv_me->blkidx = nextpiv_src->blkidx;
-        cblas_zcopy( clargs->n, pivrow_src, 1, pivrow_me, 1 );
+        cblas_zcopy( n, pivrow_src, 1, pivrow_me, 1 );
     }
 
     /* Let's copy the diagonal row if needed */
     if ( ( cppi_src->has_diag == 1 ) &&
          ( cppi_me->has_diag  == -1 ) )
     {
-        cblas_zcopy( clargs->n, nextpiv_src->diagrow, 1, nextpiv_me->diagrow, 1 );
-        assert( cppi_src->arraysize == sizeof(CHAMELEON_Complex64_t) * clargs->n );
+        cblas_zcopy( n, nextpiv_src->diagrow, 1, nextpiv_me->diagrow, 1 );
+        assert( cppi_src->arraysize == sizeof(CHAMELEON_Complex64_t) * n );
         cppi_me->has_diag = 1;
     }
 
     cppi_display_dbg( cppi_me,  stderr, "Global redux Inout(After): ");
 }
 
+/* CPU codelet body: descr[0] is the in/out pivot buffer and descr[1] the input one;
+ * h and n are read from the struct cl_redux_args_t received through cl_arg. */
+static void
+cl_zipiv_allreduce_cpu_func( void *descr[], void *cl_arg )
+{
+    const struct cl_redux_args_t *args = cl_arg;
+    zipiv_allreduce_cpu_func( descr[0], descr[1], args->h, args->n );
+}
+
 CODELETS_CPU( zipiv_allreduce, cl_zipiv_allreduce_cpu_func )
 
-void
-INSERT_TASK_zipiv_allreduce_send( CHAM_ipiv_t *ipiv,
-                                  int          me,
-                                  int          dst,
-                                  int          k,
-                                  int          h,
-                                  const RUNTIME_option_t *options )
+static void
+INSERT_TASK_zipiv_allreduce_send( const RUNTIME_option_t *options,
+                                  CHAM_ipiv_t            *ipiv,
+                                  int                     me,
+                                  int                     dst,
+                                  int                     k,
+                                  int                     h )
 {
     rt_starpu_insert_task(
         NULL,
@@ -79,20 +88,19 @@ INSERT_TASK_zipiv_allreduce_send( CHAM_ipiv_t *ipiv,
         0 );
 }
 
-void
-INSERT_TASK_zipiv_allreduce_recv( CHAM_ipiv_t *ipiv,
-                                  int          me,
-                                  int          src,
-                                  int          k,
-                                  int          h,
-                                  int          n,
-                                  const RUNTIME_option_t *options )
+static void
+INSERT_TASK_zipiv_allreduce_recv( const RUNTIME_option_t *options,
+                                  CHAM_ipiv_t            *ipiv,
+                                  int                     me,
+                                  int                     src,
+                                  int                     k,
+                                  int                     h,
+                                  int                     n )
 {
     struct cl_redux_args_t *clargs;
-    clargs = malloc( sizeof( struct cl_redux_args_t ) );
+    clargs    = malloc( sizeof( struct cl_redux_args_t ) );
     clargs->h = h;
     clargs->n = n;
-    clargs->k = k;
 
     rt_starpu_insert_task(
         &cl_zipiv_allreduce,
@@ -106,16 +114,17 @@ INSERT_TASK_zipiv_allreduce_recv( CHAM_ipiv_t *ipiv,
     starpu_mpi_cache_flush( options->sequence->comm, RUNTIME_pivot_getaddr( ipiv, src, k, h ) );
 }
 
-void INSERT_TASK_zipiv_allreduce( CHAM_desc_t            *A,
-                                  const RUNTIME_option_t *options,
-                                  CHAM_ipiv_t            *ipiv,
-                                  int                    *proc_involved,
-                                  int                     k,
-                                  int                     h,
-                                  int                     n )
+static void
+zipiv_allreduce_chameleon_starpu_task( const RUNTIME_option_t *options,
+                                       CHAM_desc_t            *A,
+                                       CHAM_ipiv_t            *ipiv,
+                                       int                    *proc_involved,
+                                       int                     k,
+                                       int                     h,
+                                       int                     n )
 {
-    int np_involved   = chameleon_min( chameleon_desc_datadist_get_iparam(A, 0), A->mt - k);
-    int np_iter       = np_involved;
+    int np_involved = chameleon_min( chameleon_desc_datadist_get_iparam(A, 0), A->mt - k);
+    int np_iter     = np_involved;
     int p_recv, p_send, me;
     int shift = 1;
 
@@ -140,29 +149,48 @@ void INSERT_TASK_zipiv_allreduce( CHAM_desc_t            *A,
             p_send = proc_involved[ ( me + shift               ) % np_involved ];
             p_recv = proc_involved[ ( me - shift + np_involved ) % np_involved ];
 
-            INSERT_TASK_zipiv_allreduce_send( ipiv, A->myrank, p_send, k, h,    options );
-            INSERT_TASK_zipiv_allreduce_recv( ipiv, A->myrank, p_recv, k, h, n, options );
+            INSERT_TASK_zipiv_allreduce_send( options, ipiv, A->myrank, p_send, k, h    );
+            INSERT_TASK_zipiv_allreduce_recv( options, ipiv, A->myrank, p_recv, k, h, n );
 
             shift   = shift << 1;
             np_iter = chameleon_ceil( np_iter, 2 );
         }
     }
 }
+
+/* Public entry point: forwards to the all-reduce variant selected by the workspace. */
+void
+INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options,
+                             CHAM_desc_t            *A,
+                             CHAM_ipiv_t            *ipiv,
+                             int                     k,
+                             int                     h,
+                             int                     n,
+                             void                   *ws )
+{
+    struct chameleon_pzgetrf_s *getrf_ws = ws;
+    switch ( getrf_ws->alg_allreduce ) {
+    case ChamStarPUTasks: /* handled exactly like the default case */
+    default:
+        zipiv_allreduce_chameleon_starpu_task( options, A, ipiv, getrf_ws->proc_involved, k, h, n );
+    }
+}
 #else
-void INSERT_TASK_zipiv_allreduce( CHAM_desc_t            *A,
-                                  const RUNTIME_option_t *options,
-                                  CHAM_ipiv_t            *ipiv,
-                                  int                    *proc_involved,
-                                  int                     k,
-                                  int                     h,
-                                  int                     n )
+void
+INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options,
+                             CHAM_desc_t            *A,
+                             CHAM_ipiv_t            *ipiv,
+                             int                     k,
+                             int                     h,
+                             int                     n,
+                             void                   *ws )
 {
     if ( h > 0 ) {
         starpu_data_invalidate_submit( RUNTIME_pivot_getaddr( ipiv, A->myrank, k, h-1 ) );
     }
 
     (void)options;
-    (void)proc_involved;
+    (void)ws; /* unused in this #else variant: it only submits an invalidation of this rank's pivot data for index h-1 */
     (void)n;
 }
 #endif
diff --git a/runtime/starpu/codelets/codelet_zlaswp_batched.c b/runtime/starpu/codelets/codelet_zlaswp_batched.c
index b17f26a486dc87e5d8dcb807369bfa431e809b06..303e6a674b564a9fbe3833931a5190af9e8ed136 100644
--- a/runtime/starpu/codelets/codelet_zlaswp_batched.c
+++ b/runtime/starpu/codelets/codelet_zlaswp_batched.c
@@ -72,7 +72,7 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options,
                                  void                  **clargs_ptr )
 {
     int task_num   = 0;
-    int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size;
+    int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size_swap;
     int nhandles;
     struct cl_laswp_batched_args_t *clargs = *clargs_ptr;
     if ( Am->get_rankof( Am, Amm, Amn) != Am->myrank ) {
diff --git a/runtime/starpu/codelets/codelet_zperm_allreduce.c b/runtime/starpu/codelets/codelet_zperm_allreduce.c
index 4c33a2e5086199af65219e86189f47aba18c7755..1c8d44164e9a97dec5427f2a9d775cb7f28b9315 100644
--- a/runtime/starpu/codelets/codelet_zperm_allreduce.c
+++ b/runtime/starpu/codelets/codelet_zperm_allreduce.c
@@ -102,14 +102,14 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options,
 {
     struct cl_redux_args_t *clargs;
     clargs = malloc( sizeof( struct cl_redux_args_t ) );
-    clargs->tempmm = tempmm;
-    clargs->n      = n;
-    clargs->p      = p;
-    clargs->q      = q;
-    clargs->p_first  = p_first;
-    clargs->me     = me;
-    clargs->shift  = shift;
-    clargs->np_inv = np;
+    clargs->tempmm  = tempmm;
+    clargs->n       = n;
+    clargs->p       = p;
+    clargs->q       = q;
+    clargs->p_first = p_first;
+    clargs->me      = me;
+    clargs->shift   = shift;
+    clargs->np_inv  = np;
 
     rt_starpu_insert_task(
         &cl_zperm_allreduce,
@@ -124,20 +124,19 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options,
     starpu_mpi_cache_flush( options->sequence->comm, RTBLKADDR(U, CHAMELEON_Complex64_t, src, n) );
 }
 
-void
-INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
-                             const CHAM_desc_t      *A,
-                             CHAM_ipiv_t            *ipiv,
-                             int                     ipivk,
-                             int                     k,
-                             int                     n,
-                             CHAM_desc_t            *U,
-                             int                     Um,
-                             int                     Un,
-                             void                   *ws )
+static void
+zperm_allreduce_chameleon_starpu_task( const RUNTIME_option_t     *options,
+                                       const CHAM_desc_t          *A,
+                                       CHAM_desc_t                *U,
+                                       int                         Um,
+                                       int                         Un,
+                                       CHAM_ipiv_t                *ipiv,
+                                       int                         ipivk,
+                                       int                         k,
+                                       int                         n,
+                                       struct chameleon_pzgetrf_s *ws)
 {
-    struct chameleon_pzgetrf_s *tmp = (struct chameleon_pzgetrf_s *)ws;
-    int *proc_involved = tmp->proc_involved;
+    int *proc_involved = ws->proc_involved;
     int  np_involved   = chameleon_min( chameleon_desc_datadist_get_iparam(A, 0), A->mt - k);
     int  np_iter       = np_involved;
     int  p_recv, p_send, me, p_first;
@@ -169,6 +168,27 @@ INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
     }
 }
 
+/* Public entry point: forwards to the all-reduce variant selected by the workspace. */
+void
+INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
+                             const CHAM_desc_t      *A,
+                             CHAM_desc_t            *U,
+                             int                     Um,
+                             int                     Un,
+                             CHAM_ipiv_t            *ipiv,
+                             int                     ipivk,
+                             int                     k,
+                             int                     n,
+                             void                   *ws )
+{
+    struct chameleon_pzgetrf_s *getrf_ws = ws;
+    switch ( getrf_ws->alg_allreduce ) {
+    case ChamStarPUTasks: /* handled exactly like the default case */
+    default:
+        zperm_allreduce_chameleon_starpu_task( options, A, U, Um, Un, ipiv, ipivk, k, n, getrf_ws );
+    }
+}
+
 void
 INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options,
                                     CHAM_desc_t            *A,
@@ -284,24 +304,24 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options,
 void
 INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options,
                              const CHAM_desc_t      *A,
+                             CHAM_desc_t            *U,
+                             int                     Um,
+                             int                     Un,
                              CHAM_ipiv_t            *ipiv,
                              int                     ipivk,
                              int                     k,
                              int                     n,
-                             CHAM_desc_t            *U,
-                             int                     Um,
-                             int                     Un,
                              void                   *ws )
 {
     (void)options;
     (void)A;
+    (void)U; /* stub: every argument is discarded and no task is inserted */
+    (void)Um;
+    (void)Un;
     (void)ipiv;
     (void)ipivk;
     (void)k;
     (void)n;
-    (void)U;
-    (void)Um;
-    (void)Un;
     (void)ws;
 }
 #endif