From ca9728b1f8b1f401c5240ab71900a6865f94eee8 Mon Sep 17 00:00:00 2001 From: Florent Pruvost <florent.pruvost@inria.fr> Date: Mon, 11 Apr 2016 17:49:09 +0000 Subject: [PATCH] add synchronization in CUDA gelqt, tslqt and tsqrt kernels to improve algorithm robustness --- cudablas/compute/cuda_zgelqt.c | 1 + cudablas/compute/cuda_zgemerge.c | 2 -- cudablas/compute/cuda_ztslqt.c | 1 + cudablas/compute/cuda_ztsqrt.c | 1 + 4 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cudablas/compute/cuda_zgelqt.c b/cudablas/compute/cuda_zgelqt.c index c00053018..17f916843 100644 --- a/cudablas/compute/cuda_zgelqt.c +++ b/cudablas/compute/cuda_zgelqt.c @@ -142,6 +142,7 @@ int CUDA_zgelqt( magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise, rows, cols, ib, da_ref(i,i), ldda, dt_ref(0,i), lddt, da_ref(i+ib,i), ldda, dwork, lddwork); + cudaThreadSynchronize(); old_i = i; old_ib = ib; if (i+nb >= k){ diff --git a/cudablas/compute/cuda_zgemerge.c b/cudablas/compute/cuda_zgemerge.c index 2d5a5d53d..2e6cba0fb 100644 --- a/cudablas/compute/cuda_zgemerge.c +++ b/cudablas/compute/cuda_zgemerge.c @@ -115,7 +115,6 @@ int CUDA_zgemerge( for(i=0; i<N; i++){ cola = A + i*LDA; colb = B + i*LDB; -// cublasZcopy(i+1, cola, 1, colb, 1); cudaMemcpyAsync(colb , cola, (i+1)*sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice, stream); @@ -124,7 +123,6 @@ int CUDA_zgemerge( for(i=0; i<N; i++){ cola = A + i*LDA; colb = B + i*LDB; -// cublasZcopy(M-i, cola + i, 1, colb + i, 1); cudaMemcpyAsync(colb+i , cola+i, (M-i)*sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice, stream); diff --git a/cudablas/compute/cuda_ztslqt.c b/cudablas/compute/cuda_ztslqt.c index 5fb1e4e92..aeba03dce 100644 --- a/cudablas/compute/cuda_ztslqt.c +++ b/cudablas/compute/cuda_ztslqt.c @@ -163,6 +163,7 @@ int CUDA_ztslqt( dwork, lddwork, dwork + nb * lddwork, nb, stream ); + cudaThreadSynchronize(); old_i = i; old_ib = ib; } diff --git a/cudablas/compute/cuda_ztsqrt.c b/cudablas/compute/cuda_ztsqrt.c index 
8750e5a6b..d75503fcc 100644 --- a/cudablas/compute/cuda_ztsqrt.c +++ b/cudablas/compute/cuda_ztsqrt.c @@ -185,6 +185,7 @@ int CUDA_ztsqrt( dwork, ib, dwork + ib * cols, rows, stream ); + cudaThreadSynchronize(); old_i = i; old_ib = ib; } -- GitLab