Commit d3492f22 authored by Mathieu Faverge's avatar Mathieu Faverge

Fix columnwise/rowise issue to apply any unmqr on GPU

parent 14482041
...@@ -253,21 +253,42 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, ...@@ -253,21 +253,42 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans,
else { else {
workV = workC + M2 * K; workV = workC + M2 * K;
} }
ldV = M2;
if ( storev == ChamColumnwise ) {
/* ldV = M2;
* Backup V, and put 0 in the lower part
*/ /*
cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex), * Backup V, and put 0 in the lower part
V, LDV * sizeof(cuDoubleComplex), */
M2 * sizeof(cuDoubleComplex), K, cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex),
cudaMemcpyDeviceToDevice, stream ); V, LDV * sizeof(cuDoubleComplex),
M2 * sizeof(cuDoubleComplex), K,
for(j = 1; j < K; j++) { cudaMemcpyDeviceToDevice, stream );
cudaMemsetAsync( workV + (j-1) * ldV + M2 - L + j,
0., for(j = 1; j < K; j++) {
(L - j) * sizeof(cuDoubleComplex), cudaMemsetAsync( workV + (j-1) * ldV + M2 - L + j,
stream ); 0,
(L - j) * sizeof(cuDoubleComplex),
stream );
}
}
else {
ldV = K;
/*
* Backup V, and put 0 in the lower part
*/
cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex),
V, LDV * sizeof(cuDoubleComplex),
K * sizeof(cuDoubleComplex), M2,
cudaMemcpyDeviceToDevice, stream );
for(j = 1; j < K; j++) {
cudaMemsetAsync( workV + ldV * ( M2 - L + j ),
0,
j * sizeof(cuDoubleComplex),
stream );
}
} }
} }
...@@ -312,7 +333,7 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, ...@@ -312,7 +333,7 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans,
cublasZgemm( CUBLAS_HANDLE cublasZgemm( CUBLAS_HANDLE
chameleon_cublas_const(transA2), chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transA2), chameleon_cublas_const(ChamNoTrans),
M2, N2, K, M2, N2, K,
CUBLAS_SADDR(mzone), V /* M2 * K */, LDV, CUBLAS_SADDR(mzone), workV /* M2 * K */, ldV,
workW /* K * N2 */, ldW, workW /* K * N2 */, ldW,
CUBLAS_SADDR(zone), A2 /* M2 * N2 */, LDA2 ); CUBLAS_SADDR(zone), A2 /* M2 * N2 */, LDA2 );
...@@ -375,21 +396,42 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, ...@@ -375,21 +396,42 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans,
else { else {
workV = workC + K * N2; workV = workC + K * N2;
} }
ldV = K;
if ( storev == ChamColumnwise ) {
/* ldV = N2;
* Backup V, and put 0 in the upper part
*/ /*
cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex), * Backup V, and put 0 in the lower part
V, LDV * sizeof(cuDoubleComplex), */
K * sizeof(cuDoubleComplex), N2, cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex),
cudaMemcpyDeviceToDevice, stream ); V, LDV * sizeof(cuDoubleComplex),
N2 * sizeof(cuDoubleComplex), K,
for(j = 1; j < K; j++) { cudaMemcpyDeviceToDevice, stream );
cudaMemsetAsync( workV + ldV + N2 - L + j,
0., for(j = 1; j < K; j++) {
j * sizeof(cuDoubleComplex), cudaMemsetAsync( workV + (j-1) * ldV + N2 - L + j,
stream ); 0,
(L - j) * sizeof(cuDoubleComplex),
stream );
}
}
else {
ldV = K;
/*
* Backup V, and put 0 in the upper part
*/
cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex),
V, LDV * sizeof(cuDoubleComplex),
K * sizeof(cuDoubleComplex), N2,
cudaMemcpyDeviceToDevice, stream );
for(j = 1; j < K; j++) {
cudaMemsetAsync( workV + ldV * ( N2 - L + j ),
0,
j * sizeof(cuDoubleComplex),
stream );
}
} }
} }
...@@ -435,7 +477,7 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, ...@@ -435,7 +477,7 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans,
chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transA2), chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transA2),
M2, N2, K, M2, N2, K,
CUBLAS_SADDR(mzone), workW /* M2*K */, ldW, CUBLAS_SADDR(mzone), workW /* M2*K */, ldW,
V /* K *N2 */, LDV, workV /* K *N2 */, ldV,
CUBLAS_SADDR(zone), A2 /* M2*N2 */, LDA2); CUBLAS_SADDR(zone), A2 /* M2*N2 */, LDA2);
} else { } else {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment