diff --git a/cudablas/compute/cuda_zgelqt.c b/cudablas/compute/cuda_zgelqt.c
index c000530185406e40af79d966cf1a357a11d70bc0..17f9168436a516c320f27aa9ded3a838c01f5ffe 100644
--- a/cudablas/compute/cuda_zgelqt.c
+++ b/cudablas/compute/cuda_zgelqt.c
@@ -142,6 +142,7 @@ int CUDA_zgelqt(
       magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
                         rows, cols, ib, da_ref(i,i), ldda, dt_ref(0,i),
                         lddt, da_ref(i+ib,i), ldda, dwork, lddwork);
+      cudaThreadSynchronize();
       old_i = i;
       old_ib = ib;
       if (i+nb >= k){
diff --git a/cudablas/compute/cuda_zgemerge.c b/cudablas/compute/cuda_zgemerge.c
index 2d5a5d53d021d3c27803745fe2ad8d4194c6a3c5..2e6cba0fb3c7afffa2bcf4807d2cc0ad1f33e083 100644
--- a/cudablas/compute/cuda_zgemerge.c
+++ b/cudablas/compute/cuda_zgemerge.c
@@ -115,7 +115,6 @@ int CUDA_zgemerge(
         for(i=0; i<N; i++){
             cola = A + i*LDA;
             colb = B + i*LDB;
-//            cublasZcopy(i+1, cola, 1, colb, 1);
             cudaMemcpyAsync(colb , cola,
                             (i+1)*sizeof(cuDoubleComplex),
                             cudaMemcpyDeviceToDevice, stream);
@@ -124,7 +123,6 @@ int CUDA_zgemerge(
         for(i=0; i<N; i++){
             cola = A + i*LDA;
             colb = B + i*LDB;
-//            cublasZcopy(M-i, cola + i, 1, colb + i, 1);
             cudaMemcpyAsync(colb+i , cola+i,
                             (M-i)*sizeof(cuDoubleComplex),
                             cudaMemcpyDeviceToDevice, stream);
diff --git a/cudablas/compute/cuda_ztslqt.c b/cudablas/compute/cuda_ztslqt.c
index 5fb1e4e92fc34846411a7b898f7c53948176a859..aeba03dce7755cb8a0e988e08a6169f959baff45 100644
--- a/cudablas/compute/cuda_ztslqt.c
+++ b/cudablas/compute/cuda_ztslqt.c
@@ -163,6 +163,7 @@ int CUDA_ztslqt(
                     dwork, lddwork,
                     dwork + nb * lddwork, nb,
                     stream );
+            cudaThreadSynchronize();
             old_i = i;
             old_ib = ib;
         }
diff --git a/cudablas/compute/cuda_ztsqrt.c b/cudablas/compute/cuda_ztsqrt.c
index 8750e5a6bfd4e50d1a509d29a7cd3dd5662f2be8..d75503fcc5df2f095378e2ae022693c9aef47256 100644
--- a/cudablas/compute/cuda_ztsqrt.c
+++ b/cudablas/compute/cuda_ztsqrt.c
@@ -185,6 +185,7 @@ int CUDA_ztsqrt(
                     dwork, ib,
                     dwork + ib * cols, rows,
                     stream );
+            cudaThreadSynchronize();
             old_i = i;
             old_ib = ib;
         }