diff --git a/cudablas/compute/cuda_zparfb.c b/cudablas/compute/cuda_zparfb.c index 5fc04a48ee7e0fdab1bd665bbeb2efaf09dd84e3..abd6f7fd8b8cca8dea9fe83fdc1bd745d9859d30 100644 --- a/cudablas/compute/cuda_zparfb.c +++ b/cudablas/compute/cuda_zparfb.c @@ -21,7 +21,6 @@ #include "cudablas.h" /** - ***************************************************************************** * * @ingroup CUDA_CHAMELEON_Complex64_t * @@ -137,15 +136,16 @@ * @retval <0 if -i, the i-th argument had an illegal value * @retval The recommended LWORK value, if LWORK == -1 on entry. */ +/* This kernel is never traced so return type on previous line for convert2eztrace.pl script */ int CUDA_zparfb( cham_side_t side, cham_trans_t trans, cham_dir_t direct, cham_store_t storev, int M1, int N1, int M2, int N2, int K, int L, - cuDoubleComplex *A1, int LDA1, - cuDoubleComplex *A2, int LDA2, - const cuDoubleComplex *V, int LDV, - const cuDoubleComplex *T, int LDT, - cuDoubleComplex *WORK, int LWORK, + cuDoubleComplex *A1, int LDA1, + cuDoubleComplex *A2, int LDA2, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *WORK, int LWORK, cublasHandle_t handle ) { #if defined(PRECISION_z) || defined(PRECISION_c) @@ -161,7 +161,7 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, cuDoubleComplex *workW, *workC, *workV; cublasStatus_t rc; cudaStream_t stream; - int j, ldW, ldC, ldV; + int i, j, ldW, ldC, ldV; cham_trans_t transW, transA2; int wssize = 0; int wrsize = 0; @@ -213,6 +213,7 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, return -20; } + /* Quick return */ if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) { return CHAMELEON_SUCCESS; } @@ -266,10 +267,11 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, M2 * sizeof(cuDoubleComplex), K, cudaMemcpyDeviceToDevice, stream ); - for(j = 1; j < K; j++) { - cudaMemsetAsync( workV + (j-1) * ldV + M2 - L + j, - 0, - (L - j) * sizeof(cuDoubleComplex), + j = 0; + i = M2 - L + 1; + for(; (i < M2) && (j < K); i++, j++ ) { + cudaMemsetAsync( workV + j * ldV + i, 0, + (M2 - i) * sizeof(cuDoubleComplex), stream ); } } @@ -277,7 +279,7 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, ldV = K; /* - * Backup V, and put 0 in the lower part + * Backup V, and put 0 in the upper part */ cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex), V, LDV * sizeof(cuDoubleComplex), @@ -285,8 +287,7 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, cudaMemcpyDeviceToDevice, stream ); for(j = 1; j < K; j++) { - cudaMemsetAsync( workV + ldV * ( M2 - L + j ), - 0, + cudaMemsetAsync( workV + ldV * ( M2 - L + j ), 0, j * sizeof(cuDoubleComplex), stream ); } @@ -414,10 +415,11 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, N2 * sizeof(cuDoubleComplex), K, cudaMemcpyDeviceToDevice, stream ); - for(j = 1; j < K; j++) { - cudaMemsetAsync( workV + (j-1) * ldV + N2 - L + j, - 0, - (L - j) * sizeof(cuDoubleComplex), + j = 0; + i = N2 - L + 1; + for(; (i < N2) && (j < K); i++, j++ ) { + cudaMemsetAsync( workV + j * ldV + i, 0, + (N2 - i) * sizeof(cuDoubleComplex), stream ); } } @@ -433,8 +435,7 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, cudaMemcpyDeviceToDevice, stream ); for(j = 1; j < K; j++) { - cudaMemsetAsync( workV + ldV * ( N2 - L + j ), - 0, + cudaMemsetAsync( workV + ldV * ( N2 - L + j ), 0, j * sizeof(cuDoubleComplex), stream ); } @@ -524,6 +525,5 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, return CHAMELEON_ERR_NOT_SUPPORTED; } - (void)L; return CHAMELEON_SUCCESS; }