Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 2d5f122e authored by PRUVOST Florent's avatar PRUVOST Florent
Browse files

fix cuda_geqrt kernel, there are some stream synchonization issues

parent d61d602d
No related branches found
No related tags found
No related merge requests found
...@@ -43,11 +43,9 @@ int CUDA_zgeqrt( ...@@ -43,11 +43,9 @@ int CUDA_zgeqrt(
#define dt_ref(a_1,a_2) ( dt+(a_2)*(lddt) + (a_1)) #define dt_ref(a_1,a_2) ( dt+(a_2)*(lddt) + (a_1))
#define t_ref(a_1,a_2) ( t+(a_2)*(ldt) + (a_1)) #define t_ref(a_1,a_2) ( t+(a_2)*(ldt) + (a_1))
int i, k, ib, lddwork, old_i, old_ib, rows, cols; int i, k, ib, old_i, old_ib, rows, cols;
double _Complex one=1.; double _Complex one=1.;
int i1, i2;
// int lwkopt = n * nb;
// hwork[0] = *((magmaDoubleComplex*) &lwkopt);
if (m < 0) { if (m < 0) {
return -1; return -1;
...@@ -63,113 +61,110 @@ int CUDA_zgeqrt( ...@@ -63,113 +61,110 @@ int CUDA_zgeqrt(
return MAGMA_SUCCESS; return MAGMA_SUCCESS;
} }
lddwork= k;
/* lower parts of little T must be zero: memset to 0 for simplicity */ /* lower parts of little T must be zero: memset to 0 for simplicity */
memset(t_ref(0,0), 0, nb*nb*sizeof(magmaDoubleComplex)); memset(t_ref(0,0), 0, nb*nb*sizeof(magmaDoubleComplex));
cudaMemsetAsync(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex), stream); cudaMemsetAsync(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex), stream);
/* copy first panel of A on the host */ if ( (nb > 1) && (nb < k) ) {
// cublasGetMatrix(m, min(nb,n), sizeof(magmaDoubleComplex), /* Use blocked code initially */
// da_ref(0, 0), ldda, old_i = 0; old_ib = nb;
// v, ldv); for (i = 0; i < k-nb; i += nb) {
cudaMemcpy( v, da_ref(0,0),
m*min(nb,n)*sizeof(magmaDoubleComplex),
cudaMemcpyDeviceToHost);
/* Use blocked code initially */
for (i = 0; i < k; i += nb) {
ib = min(k-i, nb);
if (i+nb>=n) ib = min(n-i, nb);
rows = m-i;
if (i>0){ ib = min(k-i, nb);
rows = m -i;
magma_zgetmatrix_async( rows, ib,
da_ref(i,i), ldda,
v_ref(i,0), ldv, stream );
/* copy panel of A from device to host */ if (i>0){
// cublasGetMatrix(m, ib, sizeof(magmaDoubleComplex), /* Apply H' to A(i:m,i+2*ib:n) from the left */
// da_ref(0, i), ldda, cols = n-old_i-2*old_ib;
// v, ldv);
/* copy panel of A from device to host */
cudaMemcpy( v, da_ref(0,i),
m*ib*sizeof(magmaDoubleComplex),
cudaMemcpyDeviceToHost);
/* Apply H' to A(i:m,i+2*ib:n) from the left */
cols = n-old_i-2*old_ib;
if (cols > 0){
magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
m-old_i, cols, old_ib, m-old_i, cols, old_ib,
da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt, da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt,
da_ref(old_i, old_i+2*old_ib), ldda, da_ref(old_i, old_i+2*old_ib), ldda,
dwork, cols); dwork, cols);
}
/* copy the upper diag tile into d_A */ /* store the diagonal */
CUDA_zgemerge(MagmaLeft, MagmaUnit, old_ib, old_ib, magma_zsetmatrix_async( old_ib, old_ib,
dd, ldd, da_ref(old_i, old_i), ldda, stream); d, old_ib,
da_ref(old_i, old_i), ldda, stream );
} }
/* Form the triangular factor of the block reflector on the host
H = H(i) H(i+1) . . . H(i+ib-1) */
CORE_zgeqrt(rows, ib, ib,
(double _Complex*) v_ref(i,0), ldv,
(double _Complex*) t_ref(0,0), ldt,
(double _Complex*) tau+i,
(double _Complex*) hwork);
if ( i + ib < n ){ magma_queue_sync( stream );
/* put 0s in the upper triangular part of a panel (and 1s on the /* Form the triangular factor of the block reflector on the host
diagonal); copy the upper triangular in d */ H = H(i) H(i+1) . . . H(i+ib-1) */
CORE_zgeqrt(rows, ib, ib,
(double _Complex*) v_ref(i, 0), ldv,
(double _Complex*) t_ref(0, 0), ib,
(double _Complex*) tau+i,
(double _Complex*) hwork);
/* Put 0s in the upper triangular part of a panel (and 1s on the
diagonal); copy the upper triangular in d. */
CORE_zgesplit(MorseLeft, MorseUnit, min(rows,ib), ib, CORE_zgesplit(MorseLeft, MorseUnit, min(rows,ib), ib,
(double _Complex*) v_ref(i,0), ldv, (double _Complex*) v_ref(i, 0), ldv,
(double _Complex*) d, ldd); (double _Complex*) d, ib);
/* copy from host to device a tile diag */ /* send the custom panel to the GPU */
cublasSetMatrix( min(rows,ib), ib, sizeof(magmaDoubleComplex), magma_zsetmatrix( rows, ib,
d, ldd, dd, ldd ); v_ref(i, 0), ldv,
} da_ref(i, i), ldda );
/* Send the triangular factor T to the GPU */ if ( i + ib < n ){
cublasSetMatrix( ib, ib, sizeof(magmaDoubleComplex), /* Send the triangular factor T to the GPU */
t_ref(0,0), ldt, dt_ref(0,i), lddt ); magma_zsetmatrix( ib, ib,
t_ref(0, 0), ib,
/* A panel (with zeros in upper tri of its diag) is ready to be used dt_ref(0, i), lddt );
in input of zlarfb_gpu: we send the panel to the gpu */
cublasSetMatrix( rows, ib, sizeof(magmaDoubleComplex), if (i+nb < k-nb) {
v_ref(i,0), ldv, da_ref(i,i), ldda ); /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
if (i + ib < n) {
if (i+2*ib < n){
cols = ib;
}
else{
cols = n-i-ib;
}
/* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
rows, cols, ib, da_ref(i,i), ldda, dt_ref(0,i),
lddt, da_ref(i,i+ib), ldda, dwork, cols);
old_i = i;
old_ib = ib;
if (i+nb>=k){
/* Apply H' to A(i:m,i+2*ib:n) from the left */
cols = n-old_i-2*old_ib;
if (cols > 0){
magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
rows, cols, old_ib, rows, ib, ib,
da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt, da_ref(i,i), ldda, dt_ref(0,i), lddt,
da_ref(old_i, old_i+2*old_ib), ldda, da_ref(i,i+ib), ldda, dwork, ib);
dwork, cols);
} }
/* copy the upper diag tile into d_A */ else {
CUDA_zgemerge(MagmaLeft, MagmaUnit, old_ib, old_ib, cols = n-i-ib;
dd, ldd, da_ref(old_i, old_i), ldda, stream); magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
rows, cols, ib,
da_ref(i,i), ldda, dt_ref(0,i), lddt,
da_ref(i,i+ib), ldda, dwork, cols);
cudaThreadSynchronize();
/* Fix the diagonal block */
magma_zsetmatrix_async( ib, ib,
d, ib,
da_ref(i, i), ldda,
stream );
}
old_i = i;
old_ib = ib;
} }
} }
} else {
i = 0;
}
/* Use unblocked code to factor the last or only block. */
if (i < k) {
ib = n-i;
rows = m-i;
magma_zgetmatrix( rows, ib,
da_ref(i,i), ldda,
v_ref(i,0), ldv );
CORE_zgeqrt(rows, ib, ib,
(double _Complex*) v_ref(i, 0), ldv,
(double _Complex*) t_ref(0, 0), ib,
(double _Complex*) tau+i,
(double _Complex*) hwork);
/* send the last factorized panel to the GPU */
magma_zsetmatrix( rows, ib,
v_ref(i, 0), ldv,
da_ref(i, i), ldda );
/* Send the triangular factor T to the GPU */
magma_zsetmatrix( ib, ib,
t_ref(0, 0), ib,
dt_ref(0, i), lddt );
} }
#undef da_ref #undef da_ref
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment