diff --git a/compute/pztpgqrt.c b/compute/pztpgqrt.c index 973612e7ad05331f78e8e73744f23cc6caac7ceb..3bd8d9e7c8f686602db7b4233a557fbc44c30014 100644 --- a/compute/pztpgqrt.c +++ b/compute/pztpgqrt.c @@ -58,7 +58,7 @@ void morse_pztpgqrt( int L, int ib, minMT; /* Dimension of the first column */ - int maxm = Q2->m - L; + int maxm = chameleon_max( Q2->m - L, 1 ); int maxmt = (maxm % Q2->mb == 0) ? (maxm / Q2->mb) : (maxm / Q2->mb + 1); int maxmtk; diff --git a/compute/pztpqrt.c b/compute/pztpqrt.c index 25af2c85e1ed60ba4b9ddbc39a9bb3d167a7e6c7..1807857a266542ca2a902ea6511df151fcee439c 100644 --- a/compute/pztpqrt.c +++ b/compute/pztpqrt.c @@ -45,7 +45,7 @@ void morse_pztpqrt( int L, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T, int ib; /* Dimension of the first column */ - int maxm = B->m - L; + int maxm = chameleon_max( B->m - L, 1 ); int maxmt = (maxm % B->mb == 0) ? (maxm / B->mb) : (maxm / B->mb + 1); morse = morse_context_self(); diff --git a/coreblas/compute/core_zttmqr.c b/coreblas/compute/core_zttmqr.c index 95b8aa2df576d45ecbc8bb8895dc4327889e31c9..95760f3cd10868775a03ca2ac732f8888cf0cbef 100644 --- a/coreblas/compute/core_zttmqr.c +++ b/coreblas/compute/core_zttmqr.c @@ -3,9 +3,8 @@ * @copyright (c) 2009-2014 The University of Tennessee and The University * of Tennessee Research Foundation. * All rights reserved. - * @copyright (c) 2012-2014 Inria. All rights reserved. - * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. - * + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. **/ /** @@ -247,10 +246,10 @@ int CORE_zttmqr(MORSE_enum side, MORSE_enum trans, CORE_zparfb( side, trans, MorseForward, MorseColumnwise, mi1, ni1, mi2, ni2, kb, l, - &A1[LDA1*jc+ic], LDA1, + A1 + LDA1*jc+ic, LDA1, A2, LDA2, - &V[LDV*i], LDV, - &T[LDT*i], LDT, + V + LDV*i, LDV, + T + LDT*i, LDT, WORK, LDWORK); } return MORSE_SUCCESS; diff --git a/cudablas/compute/CMakeLists.txt b/cudablas/compute/CMakeLists.txt index 40686c92fb785f93f8b7ec6de0f545e4759ec770..06d25bd476f7d7f5286e40eefd583cee53d2de2a 100644 --- a/cudablas/compute/CMakeLists.txt +++ b/cudablas/compute/CMakeLists.txt @@ -43,6 +43,7 @@ set(ZSRC cuda_ztrsm.c cuda_ztsmlq.c cuda_ztsmqr.c + cuda_zttmqr.c cuda_zunmlqt.c cuda_zunmqrt.c ) diff --git a/cudablas/compute/cuda_zparfb.c b/cudablas/compute/cuda_zparfb.c index 83e0b393aedc1488a37d5a7609f75f52b204924f..edca0c8d7e13b14995ac2c74857cdfa5106e38ca 100644 --- a/cudablas/compute/cuda_zparfb.c +++ b/cudablas/compute/cuda_zparfb.c @@ -25,6 +25,130 @@ #include "cudablas/include/cudablas.h" #include "cudablas/include/cudablas_z.h" +/** + ***************************************************************************** + * + * @ingroup CUDA_MORSE_Complex64_t + * + * CUDA_zparfb applies a complex upper triangular block reflector H + * or its transpose H' to a complex rectangular matrix formed by + * coupling two tiles A1 and A2. Matrix V is: + * + * COLUMNWISE ROWWISE + * + * | K | | N2-L | L | + * __ _____________ __ __ _________________ __ + * | | | | | \ + * | | | | | \ L + * M2-L | | | K |_______________|_____\ __ + * | | | M2 | | + * __ |____| | | | K-L + * \ | | __ |______________________| __ + * L \ | | + * __ \|______| __ | N2 | + * + * | L | K-L | + * + ******************************************************************************* + * + * @param[in] side + * @arg MorseLeft : apply Q or Q**H from the Left; + * @arg MorseRight : apply Q or Q**H from the Right. + * + * @param[in] trans + * @arg MorseNoTrans : No transpose, apply Q; + * @arg MorseConjTrans : ConjTranspose, apply Q**H. + * + * @param[in] direct + * Indicates how H is formed from a product of elementary + * reflectors + * @arg MorseForward : H = H(1) H(2) . . . H(k) (Forward) + * @arg MorseBackward : H = H(k) . . . H(2) H(1) (Backward) + * + * @param[in] storev + * Indicates how the vectors which define the elementary + * reflectors are stored: + * @arg MorseColumnwise + * @arg MorseRowwise + * + * @param[in] M1 + * The number of columns of the tile A1. M1 >= 0. + * + * @param[in] N1 + * The number of rows of the tile A1. N1 >= 0. + * + * @param[in] M2 + * The number of columns of the tile A2. M2 >= 0. + * + * @param[in] N2 + * The number of rows of the tile A2. N2 >= 0. + * + * @param[in] K + * The order of the matrix T (= the number of elementary + * reflectors whose product defines the block reflector). + * + * @param[in] L + * The size of the triangular part of V + * + * @param[in,out] A1 + * On entry, the M1-by-N1 tile A1. + * On exit, A1 is overwritten by the application of Q. + * + * @param[in] LDA1 + * The leading dimension of the array A1. LDA1 >= max(1,N1). + * + * @param[in,out] A2 + * On entry, the M2-by-N2 tile A2. + * On exit, A2 is overwritten by the application of Q. + * + * @param[in] LDA2 + * The leading dimension of the tile A2. LDA2 >= max(1,N2). + * + * @param[in] V + * (LDV,K) if STOREV = 'C' + * (LDV,M2) if STOREV = 'R' and SIDE = 'L' + * (LDV,N2) if STOREV = 'R' and SIDE = 'R' + * Matrix V. + * + * @param[in] LDV + * The leading dimension of the array V. + * If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M2); + * if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N2); + * if STOREV = 'R', LDV >= K. + * + * @param[out] T + * The triangular K-by-K matrix T in the representation of the + * block reflector. + * T is upper triangular by block (economic storage); + * The rest of the array is not referenced. + * + * @param[in] LDT + * The leading dimension of the array T. LDT >= K. + * + * @param[in,out] WORK + * Workspace of dimension LDWORK-by-N1 if side == MorseLeft, LDWORK-by-K + * otherwise. + * + * @param[in] LDWORK + * The leading dimension of the array WORK: LDWORK >= K, if side == + * MorseLeft, LDWORK >= M1 otehrwise. + * + * @param[in,out] WORKC + * Optionnal additional workspace to replace the TRMM operation by a GEMM kernel. + * This workspace is of dimension LDWORK-by-K if side == MorseLeft, LDWORK-by-N2 + * otherwise. + * + * @param[in] LDWORKC + * The leading dimension of the array WORKC: LDWORKC >= M2, if side == + * MorseLeft, LDWORK >= K otehrwise. + * + ******************************************************************************* + * + * @return + * \retval MORSE_SUCCESS successful exit + * \retval <0 if -i, the i-th argument had an illegal value + * + ******************************************************************************/ int CUDA_zparfb(MORSE_enum side, MORSE_enum trans, MORSE_enum direct, MORSE_enum storev, @@ -83,6 +207,10 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans, if (K < 0) { return -9; } + if ( ((LDWORK < K ) && (side == MorseLeft )) || + ((LDWORK < M1) && (side == MorseRight)) ) { + return -20; + } /* Quick return */ if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) diff --git a/cudablas/compute/cuda_ztpmqrt.c b/cudablas/compute/cuda_ztpmqrt.c index b9f6afac4cabd9a66c017c29e16abad91e7a2b97..350dc3df8dd730314599cc023c9228066e8e58ec 100644 --- a/cudablas/compute/cuda_ztpmqrt.c +++ b/cudablas/compute/cuda_ztpmqrt.c @@ -48,14 +48,14 @@ CUDA_ztpmqrt( MORSE_enum side, MORSE_enum trans, n1 = N; ldwork = IB; ldworkc = M; - ws = K * n1; + ws = ldwork * n1; } else { m1 = M; n1 = K; ldwork = m1; ldworkc = IB; - ws = m1 * K; + ws = ldwork * IB; } /* TS case */ @@ -67,16 +67,14 @@ CUDA_ztpmqrt( MORSE_enum side, MORSE_enum trans, } /* TT case */ else if( L == M ) { - cudablas_error(-6, "TTMQRT not available on GPU yet\n" ); - return -6; - /* CUDA_zttmqr( side, trans, m1, n1, M, N, K, IB, */ - /* A, LDA, B, LDB, V, LDV, T, LDT, */ - /* WORK, ldwork ); */ + CUDA_zttmqr( side, trans, m1, n1, M, N, K, IB, + A, LDA, B, LDB, V, LDV, T, LDT, + WORK, ldwork, WORK + ws, ldworkc, + CUBLAS_STREAM_VALUE ); } else { - cudablas_error(-6, "TPMQRT not available on GPU yet\n" ); + cudablas_error(-6, "TPMQRT not available on GPU for general cases yet\n" ); return -6; - //LAPACKE_ztpmqrt_work( LAPACK_COL_MAJOR, M, N, K, L, IB, V, LDV, T, LDT, A, LDA, B, LDB, WORK ); } return MORSE_SUCCESS; diff --git a/cudablas/compute/cuda_zttmqr.c b/cudablas/compute/cuda_zttmqr.c new file mode 100644 index 0000000000000000000000000000000000000000..04f59bf8140801312f635e3d801efd4ff23cbcaf --- /dev/null +++ b/cudablas/compute/cuda_zttmqr.c @@ -0,0 +1,151 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + **/ + +/** + * + * @file cuda_zttmqr.c + * + * MORSE cudablas kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Florent Pruvost + * @author Mathieu Faverge + * @date 2015-09-16 + * @precisions normal z -> c d s + * + **/ +#include "cudablas/include/cudablas.h" +#include "cudablas/include/cudablas_z.h" + +int CUDA_zttmqr( + MORSE_enum side, MORSE_enum trans, + int M1, int N1, + int M2, int N2, + int K, int IB, + cuDoubleComplex *A1, int LDA1, + cuDoubleComplex *A2, int LDA2, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *WORK, int LDWORK, + cuDoubleComplex *WORKC, int LDWORKC, + CUBLAS_STREAM_PARAM) +{ + int i, i1, i3, l; + int NQ, NW; + int kb; + int ic = 0; + int jc = 0; + int mi1 = M1; + int mi2 = M2; + int ni1 = N1; + int ni2 = N2; + + /* Check input arguments */ + if ((side != MorseLeft) && (side != MorseRight)) { + return -1; + } + + /* NQ is the order of Q */ + if (side == MorseLeft) { + NQ = M2; + NW = IB; + } + else { + NQ = N2; + NW = M1; + } + + if ((trans != MorseNoTrans) && (trans != MorseConjTrans)) { + return -2; + } + if (M1 < 0) { + return -3; + } + if (N1 < 0) { + return -4; + } + if ( (M2 < 0) || + ( (M2 != M1) && (side == MorseRight) ) ){ + return -5; + } + if ( (N2 < 0) || + ( (N2 != N1) && (side == MorseLeft) ) ){ + return -6; + } + if ((K < 0) || + ( (side == MorseLeft) && (K > M1) ) || + ( (side == MorseRight) && (K > N1) ) ) { + return -7; + } + if (IB < 0) { + return -8; + } + if (LDA1 < chameleon_max(1,M1)){ + return -10; + } + if (LDA2 < chameleon_max(1,M2)){ + return -12; + } + if (LDV < chameleon_max(1,NQ)){ + return -14; + } + if (LDT < chameleon_max(1,IB)){ + return -16; + } + if (LDWORK < chameleon_max(1,NW)){ + return -18; + } + + /* Quick return */ + if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) + return MORSE_SUCCESS; + + if (((side == MorseLeft) && (trans != MorseNoTrans)) + || ((side == MorseRight) && (trans == MorseNoTrans))) { + i1 = 0; + i3 = IB; + } + else { + i1 = ((K-1) / IB)*IB; + i3 = -IB; + } + + for(i = i1; (i > -1) && (i < K); i += i3) { + kb = chameleon_min(IB, K-i); + + if (side == MorseLeft) { + mi1 = kb; + mi2 = chameleon_min(i+kb, M2); + l = chameleon_min(kb, chameleon_max(0, M2-i)); + ic = i; + } + else { + ni1 = kb; + ni2 = chameleon_min(i+kb, N2); + l = chameleon_min(kb, chameleon_max(0, N2-i)); + jc = i; + } + + /* + * Apply H or H' (NOTE: CORE_zparfb used to be CORE_zttrfb) + */ + CUDA_zparfb( + side, trans, MorseForward, MorseColumnwise, + mi1, ni1, mi2, ni2, kb, l, + A1 + LDA1*jc+ic, LDA1, + A2, LDA2, + V + LDV*i, LDV, + T + LDT*i, LDT, + WORK, LDWORK, + WORKC, LDWORKC, CUBLAS_STREAM_VALUE ); + } + return MORSE_SUCCESS; +} diff --git a/cudablas/include/cudablas_z.h b/cudablas/include/cudablas_z.h index e73912750cb83353c4f013520c5c0db7a55ba7dc..911d1ef93328392186f6063e66e9b21572596844 100644 --- a/cudablas/include/cudablas_z.h +++ b/cudablas/include/cudablas_z.h @@ -51,6 +51,7 @@ int CUDA_ztrmm( MORSE_enum side, MORSE_enum uplo, MORSE_enum transa, MORSE_enum int CUDA_ztrsm( MORSE_enum side, MORSE_enum uplo, MORSE_enum transa, MORSE_enum diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM); int CUDA_ztsmlq( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM); int CUDA_ztsmqr( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM); +int CUDA_zttmqr( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM); int CUDA_zunmlqt(MORSE_enum side, MORSE_enum trans, int M, int N, int K, int IB, const cuDoubleComplex *A, int LDA, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM ); int CUDA_zunmqrt(MORSE_enum side, MORSE_enum trans, int M, int N, int K, int IB, const cuDoubleComplex *A, int LDA, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM ); diff --git a/runtime/starpu/codelets/codelet_ztsmqr.c b/runtime/starpu/codelets/codelet_ztsmqr.c index 665628cf4f9283f0dea3bcbe67dd3ebda5b8f590..ed4f7dbe9303d36cef5385dd33cddb54b9583fd4 100644 --- a/runtime/starpu/codelets/codelet_ztsmqr.c +++ b/runtime/starpu/codelets/codelet_ztsmqr.c @@ -281,8 +281,8 @@ static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, &lda1, &lda2, &ldv, &ldt, &ldwork); - WC = W + ib * (side == MorseLeft ? n1 : m1); - ldworkc = (side == MorseLeft) ? m1 : ib; + WC = W + ib * (side == MorseLeft ? m1 : n1); + ldworkc = (side == MorseLeft) ? m2 : ib; stream = starpu_cuda_get_local_stream(); cublasSetKernelStream( stream ); @@ -299,7 +299,6 @@ static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg) #endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ - /* * Codelet definition */ diff --git a/runtime/starpu/codelets/codelet_zttmqr.c b/runtime/starpu/codelets/codelet_zttmqr.c index 3368c6da7ecfb8bd00a7d269d98e88f487159144..6f2f600a8c83f1b299df8a951bddcb4abe019a7e 100644 --- a/runtime/starpu/codelets/codelet_zttmqr.c +++ b/runtime/starpu/codelets/codelet_zttmqr.c @@ -239,9 +239,59 @@ static void cl_zttmqr_cpu_func(void *descr[], void *cl_arg) CORE_zttmqr(side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); } + +#if defined(CHAMELEON_USE_CUDA) +static void cl_zttmqr_cuda_func(void *descr[], void *cl_arg) +{ + MORSE_enum side; + MORSE_enum trans; + int m1; + int n1; + int m2; + int n2; + int k; + int ib; + cuDoubleComplex *A1; + int lda1; + cuDoubleComplex *A2; + int lda2; + cuDoubleComplex *V; + int ldv; + cuDoubleComplex *T; + int ldt; + cuDoubleComplex *W, *WC; + int ldwork; + int ldworkc; + CUstream stream; + + A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); + A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); + V = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); + T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); + W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */ + + starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, + &lda1, &lda2, &ldv, &ldt, &ldwork); + + WC = W + ib * (side == MorseLeft ? m1 : n1); + ldworkc = (side == MorseLeft) ? m2 : ib; + + stream = starpu_cuda_get_local_stream(); + cublasSetKernelStream( stream ); + + CUDA_zttmqr( + side, trans, m1, n1, m2, n2, k, ib, + A1, lda1, A2, lda2, V, ldv, T, ldt, + W, ldwork, WC, ldworkc, stream ); + +#ifndef STARPU_CUDA_ASYNC + cudaStreamSynchronize( stream ); +#endif +} +#endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS_CPU(zttmqr, 5, cl_zttmqr_cpu_func) +CODELETS(zttmqr, 5, cl_zttmqr_cpu_func, cl_zttmqr_cuda_func, STARPU_CUDA_ASYNC)