diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c
index cfe7f507ede116ca6f0385f0730b4fe96abf4891..b4f1de9ff1848677a1177a609abe3d1c788ef46e 100644
--- a/compute/pzgetrf.c
+++ b/compute/pzgetrf.c
@@ -23,10 +23,11 @@
  */
 #include "control/common.h"
 
-#define A(m,n)  A,        m, n
-#define U(m,n)  &(ws->U), m, n
+#define A(m,n)   A,         m, n
+#define U(m,n)   &(ws->U),  m, n
 #define Up(m,n)  &(ws->Up), m, n
 #define Wu(m,n)  &(ws->Wu), m, n
+#define Wl(m,n)  &(ws->Wl), m, n
 
 /*
  * All the functions below are panel factorization variant.
@@ -389,6 +390,7 @@ chameleon_pzgetrf_panel_permute( struct chameleon_pzgetrf_s *ws,
     {
         int m;
         int tempkm, tempkn, tempnn, minmn;
+        int withlacpy;
 
         chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws );
         if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) {
@@ -409,8 +411,11 @@ chameleon_pzgetrf_panel_permute( struct chameleon_pzgetrf_s *ws,
         minmn  = chameleon_min( tempkm, tempkn );
 
         /* Extract selected rows into U */
+        withlacpy = options->withlacpy;
+        options->withlacpy = 1;
         INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn,
                             A(k, n), Wu(A->myrank, n) );
+        options->withlacpy = withlacpy;
 
         /*
          * perm array is made of size tempkm for the first row especially.
@@ -451,6 +456,7 @@ chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws,
     {
         int m;
         int tempkm, tempkn, tempnn, minmn;
+        int withlacpy;
 
         chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws );
         if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) {
@@ -474,8 +480,11 @@ chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws,
         minmn  = chameleon_min( tempkm, tempkn );
 
         /* Extract selected rows into U */
+        withlacpy = options->withlacpy;
+        options->withlacpy = 1;
         INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn,
                             A(k, n), Wu(A->myrank, n) );
+        options->withlacpy = withlacpy;
 
         /*
          * perm array is made of size tempkm for the first row especially.
@@ -500,6 +509,41 @@ chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws,
     }
 }
 
+static inline void
+chameleon_pzgetrf_panel_update_ws( struct chameleon_pzgetrf_s *ws,
+                                   CHAM_desc_t                *A,
+                                   int                         k,
+                                   RUNTIME_option_t           *options )
+{ /* Copy the tiles A(m,k), m > k, of panel k into the Wl workspace and replicate them across its columns. */
+    CHAM_context_t  *chamctxt = chameleon_context_self();
+    int m, tempmm, tempkn, q;
+    int lookahead = chamctxt->lookahead; /* NOTE(review): used as a modulus below; confirm it is always >= 1 */
+    int lq        = (k % lookahead) * chameleon_desc_datadist_get_iparam(A, 1); /* first Wl tile column of the group used for this k */
+    int myp       = A->myrank / chameleon_desc_datadist_get_iparam(A, 1); /* presumably this rank's process-row index - TODO confirm */
+    /* Width of tile column k: the last tile column takes what remains of A->n, the others are A->nb wide. */
+    tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
+    /* Walk the tile rows below the diagonal, skipping those whose index modulo iparam(A,0) is not myp. */
+    for ( m = k+1; m < A->mt; m++ ) {
+        if ( m % chameleon_desc_datadist_get_iparam(A, 0) != myp ) continue;
+        /* Height of tile row m (the last one may be shorter), then copy A(m,k) into Wl column (k % iparam(A,1)) + lq. */
+        tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
+        INSERT_TASK_zlacpy(
+            options,
+            ChamUpperLower, tempmm, tempkn,
+            A( m, k ),
+            Wl( m, ( k % chameleon_desc_datadist_get_iparam(A, 1) ) + lq ) );
+        /* Chain copies from one Wl column to the next (modulo iparam(A,1)) until every column of the group holds the tile. */
+        for ( q = 1; q < chameleon_desc_datadist_get_iparam(A, 1); q++ ) {
+            INSERT_TASK_zlacpy(
+                options,
+                ChamUpperLower, tempmm, tempkn,
+                Wl( m, ( ( k + q - 1 ) % chameleon_desc_datadist_get_iparam(A, 1) ) + lq ),
+                Wl( m, ( ( k + q )     % chameleon_desc_datadist_get_iparam(A, 1) ) + lq ) );
+        }
+        RUNTIME_data_flush( options->sequence, A(m, k) ); /* flush requested once the copies reading A(m,k) are submitted */
+    }
+}
+
 static inline void
 chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws,
                                 CHAM_desc_t                *A,
@@ -510,9 +554,14 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws,
 {
     const CHAMELEON_Complex64_t zone  = (CHAMELEON_Complex64_t) 1.0;
     const CHAMELEON_Complex64_t mzone = (CHAMELEON_Complex64_t)-1.0;
+    CHAM_context_t  *chamctxt = chameleon_context_self();
 
     int m, tempkm, tempmm, tempnn, rankAmn, p;
 
+    int lookahead = chamctxt->lookahead;
+    int myq       = A->myrank % chameleon_desc_datadist_get_iparam(A, 1);
+    int lq        = (k % lookahead) * chameleon_desc_datadist_get_iparam(A, 1);
+
     tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
     tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
 
@@ -531,6 +580,7 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws,
                 tempkm, tempnn, A->mb,
                 zone, A(k, k),
                       Wu(ws->proc_involved[p], n) );
+            RUNTIME_data_flush( options->sequence, Wu(ws->proc_involved[p], n) );
         }
     }
     else if ( ws->involved ) {
@@ -543,16 +593,18 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws,
     }
 
     for (m = k+1; m < A->mt; m++) {
-        tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
+        tempmm  = m == A->mt-1 ? A->m-m*A->mb : A->mb;
         rankAmn = A->get_rankof( A, m, n );
 
-        INSERT_TASK_zgemm(
-            options,
-            ChamNoTrans, ChamNoTrans,
-            tempmm, tempnn, A->mb, A->mb,
-            mzone, A(m, k),
-                   Wu(rankAmn, n),
-            zone,  A(m, n) );
+        if ( A->myrank == rankAmn ) {
+            INSERT_TASK_zgemm(
+                options,
+                ChamNoTrans, ChamNoTrans,
+                tempmm, tempnn, A->mb, A->mb,
+                mzone, Wl( m, myq + lq ),
+                       Wu( A->myrank, n ),
+                zone,  A( m, n ) );
+        }
     }
 
     if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) {
@@ -561,6 +613,7 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws,
     }
 
     RUNTIME_data_flush( options->sequence, Wu(A->myrank, n) );
+    RUNTIME_data_flush( options->sequence, A(k, k) );
     RUNTIME_data_flush( options->sequence, A(k, n) );
 }
 
@@ -599,6 +652,8 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws,
         }
         options.forcesub = 0;
 
+        chameleon_pzgetrf_panel_update_ws( ws, A, k, &options );
+
         for (n = k+1; n < A->nt; n++) {
             options.priority = A->nt-n;
             if ( chameleon_involved_in_panelk_2dbc( A, k ) ||
@@ -615,6 +670,7 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws,
 
         RUNTIME_iteration_pop( chamctxt );
     }
+    CHAMELEON_Desc_Flush( &(ws->Wl), sequence );
 
     /* Backward pivoting */
     if ( ws->batch_size > 0 ) {
@@ -657,6 +713,7 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws,
             RUNTIME_perm_flushk( sequence, IPIV, k );
         }
     }
+    CHAMELEON_Desc_Flush( &(ws->Wu), sequence );
 
     /* Initialize IPIV with default values if needed */
     if ( (ws->alg == ChamGetrfNoPivPerColumn) ||
diff --git a/compute/zgetrf.c b/compute/zgetrf.c
index 9dabf735d3280d77f92ea30388a291a2db8cb861..9c3377bda312d3abe14a5d5dce92cbd26e547386 100644
--- a/compute/zgetrf.c
+++ b/compute/zgetrf.c
@@ -57,6 +57,7 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A )
 {
     CHAM_context_t             *chamctxt;
     struct chameleon_pzgetrf_s *ws;
+    int lookahead;
 
     chamctxt = chameleon_context_self();
     if ( chamctxt == NULL ) {
@@ -127,6 +128,12 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A )
                              A->mb * chameleon_desc_datadist_get_iparam(A, 0) * chameleon_desc_datadist_get_iparam(A, 1), A->n, 0, 0,
                              A->mb * chameleon_desc_datadist_get_iparam(A, 0) * chameleon_desc_datadist_get_iparam(A, 1), A->n, chameleon_desc_datadist_get_iparam(A, 0) * chameleon_desc_datadist_get_iparam(A, 1), 1,
                              NULL, NULL, NULL, A->get_rankof_init_arg );
+        lookahead = chamctxt->lookahead;
+        chameleon_desc_init( &(ws->Wl), CHAMELEON_MAT_ALLOC_TILE,
+                             ChamComplexDouble, A->mb, A->nb, (A->mb * A->nb),
+                             A->mt * A->mb, A->nb * chameleon_desc_datadist_get_iparam(A, 1) * lookahead, 0, 0,
+                             A->mt * A->mb, A->nb * chameleon_desc_datadist_get_iparam(A, 1) * lookahead, chameleon_desc_datadist_get_iparam(A, 0), chameleon_desc_datadist_get_iparam(A, 1),
+                             NULL, NULL, A->get_rankof_init, A->get_rankof_init_arg );
     }
 
     /* Set ib to 1 if per column algorithm */
@@ -195,6 +202,7 @@ CHAMELEON_zgetrf_WS_Free( void *user_ws )
          ( ws->alg == ChamGetrfPPivPerColumn  ) )
     {
         chameleon_desc_destroy( &(ws->Wu) );
+        chameleon_desc_destroy( &(ws->Wl) );
     }
     free( ws );
 }
@@ -394,6 +402,7 @@ CHAMELEON_zgetrf_Tile( CHAM_desc_t *A, CHAM_ipiv_t *IPIV )
     ws = CHAMELEON_zgetrf_WS_Alloc( A );
     CHAMELEON_zgetrf_Tile_Async( A, IPIV, ws, sequence, &request );
     CHAMELEON_Desc_Flush( A, sequence );
+    CHAMELEON_Ipiv_Flush( IPIV, sequence );
 
     chameleon_sequence_wait( chamctxt, sequence );
     CHAMELEON_zgetrf_WS_Free( ws );
diff --git a/control/compute_z.h b/control/compute_z.h
index df4b18d79010af05a0d5f482b7e9d9390442c29a..f1002522d6f430c4cbf58251c39c1604526f4dac 100644
--- a/control/compute_z.h
+++ b/control/compute_z.h
@@ -49,6 +49,7 @@ struct chameleon_pzgetrf_s {
     CHAM_desc_t    U;
     CHAM_desc_t    Up; /**< Workspace used for the panel factorization    */
     CHAM_desc_t    Wu; /**< Workspace used for the permutation and update */
+    CHAM_desc_t    Wl; /**< Workspace used for the update                 */
     int           *proc_involved;
     unsigned int   involved;
     int            np_involved;