diff --git a/cudablas/compute/cuda_zgelqt.c b/cudablas/compute/cuda_zgelqt.c index c000530185406e40af79d966cf1a357a11d70bc0..17f9168436a516c320f27aa9ded3a838c01f5ffe 100644 --- a/cudablas/compute/cuda_zgelqt.c +++ b/cudablas/compute/cuda_zgelqt.c @@ -142,6 +142,7 @@ int CUDA_zgelqt( magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise, rows, cols, ib, da_ref(i,i), ldda, dt_ref(0,i), lddt, da_ref(i+ib,i), ldda, dwork, lddwork); + cudaThreadSynchronize(); old_i = i; old_ib = ib; if (i+nb >= k){ diff --git a/cudablas/compute/cuda_zgemerge.c b/cudablas/compute/cuda_zgemerge.c index 2d5a5d53d021d3c27803745fe2ad8d4194c6a3c5..2e6cba0fb3c7afffa2bcf4807d2cc0ad1f33e083 100644 --- a/cudablas/compute/cuda_zgemerge.c +++ b/cudablas/compute/cuda_zgemerge.c @@ -115,7 +115,6 @@ int CUDA_zgemerge( for(i=0; i<N; i++){ cola = A + i*LDA; colb = B + i*LDB; -// cublasZcopy(i+1, cola, 1, colb, 1); cudaMemcpyAsync(colb , cola, (i+1)*sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice, stream); @@ -124,7 +123,6 @@ int CUDA_zgemerge( for(i=0; i<N; i++){ cola = A + i*LDA; colb = B + i*LDB; -// cublasZcopy(M-i, cola + i, 1, colb + i, 1); cudaMemcpyAsync(colb+i , cola+i, (M-i)*sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice, stream); diff --git a/cudablas/compute/cuda_ztslqt.c b/cudablas/compute/cuda_ztslqt.c index 5fb1e4e92fc34846411a7b898f7c53948176a859..aeba03dce7755cb8a0e988e08a6169f959baff45 100644 --- a/cudablas/compute/cuda_ztslqt.c +++ b/cudablas/compute/cuda_ztslqt.c @@ -163,6 +163,7 @@ int CUDA_ztslqt( dwork, lddwork, dwork + nb * lddwork, nb, stream ); + cudaThreadSynchronize(); old_i = i; old_ib = ib; } diff --git a/cudablas/compute/cuda_ztsqrt.c b/cudablas/compute/cuda_ztsqrt.c index 8750e5a6bfd4e50d1a509d29a7cd3dd5662f2be8..d75503fcc5df2f095378e2ae022693c9aef47256 100644 --- a/cudablas/compute/cuda_ztsqrt.c +++ b/cudablas/compute/cuda_ztsqrt.c @@ -185,6 +185,7 @@ int CUDA_ztsqrt( dwork, ib, dwork + ib * cols, rows, stream ); + cudaThreadSynchronize(); old_i = i; old_ib = ib; }