diff --git a/runtime/starpu/codelets/codelet_zgelqt.c b/runtime/starpu/codelets/codelet_zgelqt.c
index d1cf8e4b6ca650ebca90412dad42b9b64e5fd174..ea61c5425ccb3291c047deae8cf378f13cd47553 100644
--- a/runtime/starpu/codelets/codelet_zgelqt.c
+++ b/runtime/starpu/codelets/codelet_zgelqt.c
@@ -169,6 +169,10 @@ magma_zgelqt_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
 
     int i, k, ib, lddwork, old_i, old_ib, rows, cols;
     double _Complex one=1.;
+    CUstream stream;
+
+    stream = starpu_cuda_get_local_stream();
+    cublasSetKernelStream( stream );
 
     if (m < 0) {
         return -1;
@@ -221,7 +225,7 @@ magma_zgelqt_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
 
             /* copy the lower diag tile into d_A */
             magma_zgemerge_gpu(MagmaRight, MagmaUnit, old_ib, old_ib,
-                               dd, ldd, da_ref(old_i, old_i), ldda);
+                               dd, ldd, da_ref(old_i, old_i), ldda, stream);
 
         }
 
@@ -280,7 +284,7 @@ magma_zgelqt_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
                 }
                 /* copy the upper diag tile into d_A */
                 magma_zgemerge_gpu(MagmaRight, MagmaUnit, old_ib, old_ib,
-                                   dd, ldd, da_ref(old_i, old_i), ldda);
+                                   dd, ldd, da_ref(old_i, old_i), ldda, stream);
             }
         }
 
diff --git a/runtime/starpu/codelets/codelet_zgeqrt.c b/runtime/starpu/codelets/codelet_zgeqrt.c
index 4171aa70e77fd6dd073fbe4d3795e875a8d74179..7bf52c2234f4de0979dcf4659cad575759bbcc04 100644
--- a/runtime/starpu/codelets/codelet_zgeqrt.c
+++ b/runtime/starpu/codelets/codelet_zgeqrt.c
@@ -158,11 +158,11 @@ magma_int_t
 magma_zgemerge_gpu(magma_side_t side, magma_diag_t diag,
                    magma_int_t M, magma_int_t N,
                    magmaDoubleComplex *A, magma_int_t LDA,
-                   magmaDoubleComplex *B, magma_int_t LDB)
+                   magmaDoubleComplex *B, magma_int_t LDB,
+                   CUstream stream)
 {
     int i, j;
     magmaDoubleComplex *cola, *colb;
-    CUstream stream;
     cublasHandle_t handle;
     cublasStatus_t stat;
 
@@ -172,7 +172,6 @@ magma_zgemerge_gpu(magma_side_t side, magma_diag_t diag,
         assert( stat == CUBLAS_STATUS_SUCCESS );
     }
 
-    stream = starpu_cuda_get_local_stream();
     stat = cublasSetStream(handle, stream);
     if (stat != CUBLAS_STATUS_SUCCESS) {
         printf ("cublasSetStream failed\n");
@@ -196,13 +195,19 @@ magma_zgemerge_gpu(magma_side_t side, magma_diag_t diag,
         for(i=0; i<N; i++){
             cola = A + i*LDA;
             colb = B + i*LDB;
-            cublasZcopy(handle, i+1, cola, 1, colb, 1);
+//            cublasZcopy(handle, i+1, cola, 1, colb, 1);
+            cudaMemcpyAsync(colb , cola,
+                            (i+1)*sizeof(cuDoubleComplex),
+                            cudaMemcpyDeviceToDevice, stream);
         }
     }else{
         for(i=0; i<N; i++){
             cola = A + i*LDA;
             colb = B + i*LDB;
-            cublasZcopy(handle, M-i, cola + i, 1, colb + i, 1);
+//            cublasZcopy(handle, M-i, cola + i, 1, colb + i, 1);
+            cudaMemcpyAsync(colb+i , cola+i,
+                            (M-i)*sizeof(cuDoubleComplex),
+                            cudaMemcpyDeviceToDevice, stream);
         }
     }
 
@@ -215,7 +220,8 @@ magma_int_t
 magma_zgemerge_gpu(magma_side_t side, magma_diag_t diag,
                    magma_int_t M, magma_int_t N,
                    magmaDoubleComplex *A, magma_int_t LDA,
-                   magmaDoubleComplex *B, magma_int_t LDB)
+                   magmaDoubleComplex *B, magma_int_t LDB,
+                   CUstream stream)
 {
     int i, j;
     magmaDoubleComplex *cola, *colb;
@@ -237,19 +243,19 @@ magma_zgemerge_gpu(magma_side_t side, magma_diag_t diag,
         for(i=0; i<N; i++){
             cola = A + i*LDA;
             colb = B + i*LDB;
-            //cublasZcopy(i+1, cola, 1, colb, 1);
-            cudaMemcpy(colb , cola,
-                       (i+1)*sizeof(cuDoubleComplex),
-                       cudaMemcpyDeviceToDevice );
+//            cublasZcopy(i+1, cola, 1, colb, 1);
+            cudaMemcpyAsync(colb , cola,
+                            (i+1)*sizeof(cuDoubleComplex),
+                            cudaMemcpyDeviceToDevice, stream);
         }
     }else{
         for(i=0; i<N; i++){
             cola = A + i*LDA;
             colb = B + i*LDB;
-            //cublasZcopy(M-i, cola + i, 1, colb + i, 1);
-            cudaMemcpy(colb+i , cola+i,
-                       (M-i)*sizeof(cuDoubleComplex),
-                       cudaMemcpyDeviceToDevice );
+//            cublasZcopy(M-i, cola + i, 1, colb + i, 1);
+            cudaMemcpyAsync(colb+i , cola+i,
+                            (M-i)*sizeof(cuDoubleComplex),
+                            cudaMemcpyDeviceToDevice, stream);
         }
     }
 
@@ -276,6 +282,10 @@ magma_zgeqrt_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
 
     int i, k, ib, lddwork, old_i, old_ib, rows, cols;
     double _Complex one=1.;
+    CUstream stream;
+
+    stream = starpu_cuda_get_local_stream();
+    cublasSetKernelStream( stream );
 //    int lwkopt = n * nb;
 //    hwork[0] = *((magmaDoubleComplex*) &lwkopt);
 
@@ -297,16 +307,15 @@ magma_zgeqrt_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
 
     /* lower parts of little T must be zero: memset to 0 for simplicity */
     memset(t_ref(0,0), 0, nb*nb*sizeof(magmaDoubleComplex));
-    cudaMemset(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex));
+    cudaMemsetAsync(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex), stream);
 
     /* copy first panel of A on the host */
 //	cublasGetMatrix(m, min(nb,n), sizeof(magmaDoubleComplex),
 //                    da_ref(0, 0), ldda,
 //                    v, ldv);
-    /* copy first panel of A on the host */
     cudaMemcpy( v, da_ref(0,0),
                 m*min(nb,n)*sizeof(magmaDoubleComplex),
-                cudaMemcpyDeviceToHost );
+                cudaMemcpyDeviceToHost);
 
     /* Use blocked code initially */
     for (i = 0; i < k; i += nb) {
@@ -324,7 +333,7 @@ magma_zgeqrt_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
             /* copy panel of A from device to host */
             cudaMemcpy( v, da_ref(0,i),
                         m*ib*sizeof(magmaDoubleComplex),
-                        cudaMemcpyDeviceToHost );
+                        cudaMemcpyDeviceToHost);
 
             /* Apply H' to A(i:m,i+2*ib:n) from the left */
             cols = n-old_i-2*old_ib;
@@ -338,7 +347,7 @@ magma_zgeqrt_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
 
             /* copy the upper diag tile into d_A */
             magma_zgemerge_gpu(MagmaLeft, MagmaUnit, old_ib, old_ib,
-                               dd, ldd, da_ref(old_i, old_i), ldda);
+                               dd, ldd, da_ref(old_i, old_i), ldda, stream);
 
         }
 
@@ -397,7 +406,7 @@ magma_zgeqrt_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
                 }
                 /* copy the upper diag tile into d_A */
                 magma_zgemerge_gpu(MagmaLeft, MagmaUnit, old_ib, old_ib,
-                                   dd, ldd, da_ref(old_i, old_i), ldda);
+                                   dd, ldd, da_ref(old_i, old_i), ldda, stream);
             }
         }