diff --git a/cudablas/compute/CMakeLists.txt b/cudablas/compute/CMakeLists.txt index 39a9d20be5bb4f73d6892417af809b2db0f9be90..20e012939889e6f16ea762c5f432c39c3e61f675 100644 --- a/cudablas/compute/CMakeLists.txt +++ b/cudablas/compute/CMakeLists.txt @@ -48,6 +48,13 @@ set(ZSRC cuda_zunmqrt.c ) +if( CHAMELEON_USE_CUBLAS_V2 ) + set(ZSRC + ${ZSRC} + cuda_zgeadd.c + ) +endif( CHAMELEON_USE_CUBLAS_V2 ) + if( CHAMELEON_USE_MAGMA ) set(ZSRC ${ZSRC} diff --git a/cudablas/compute/cuda_zgeadd.c b/cudablas/compute/cuda_zgeadd.c new file mode 100644 index 0000000000000000000000000000000000000000..4e728ad49c1955256ec7926e8fa3b219f61d7a58 --- /dev/null +++ b/cudablas/compute/cuda_zgeadd.c @@ -0,0 +1,102 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + **/ +/** + * + * @file cuda_zgeadd.c + * + * MORSE cudablas kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Mathieu Faverge + * @date 2015-09-17 + * @precisions normal z -> c d s + * + **/ +#include "cudablas/include/cudablas.h" +#include "cudablas/include/cudablas_z.h" + +#if !defined(CHAMELEON_USE_CUBLAS_V2) +#error "This file requires cublas api v2 support" +#endif + +/** + ****************************************************************************** + * + * @ingroup CUDA_MORSE_Complex64_t + * + * CUDA_zgeadd adds to matrices together as in PBLAS pzgeadd. + * + * B <- alpha * op(A) + beta * B, + * + * where op(X) = X, X', or conj(X') + * + ******************************************************************************* + * + * @param[in] trans + * Specifies whether the matrix A is non-transposed, transposed, or + * conjugate transposed + * = MorseNoTrans: op(A) = A + * = MorseTrans: op(A) = A' + * = MorseConjTrans: op(A) = conj(A') + * + * @param[in] M + * Number of rows of the matrices op(A) and B. + * + * @param[in] N + * Number of columns of the matrices op(A) and B. + * + * @param[in] alpha + * Scalar factor of A. + * + * @param[in] A + * Matrix of size LDA-by-N, if trans = MorseNoTrans, LDA-by-M + * otherwise. + * + * @param[in] LDA + * Leading dimension of the array A. LDA >= max(1,k), with k=M, if + * trans = MorseNoTrans, and k=N otherwise. + * + * @param[in] beta + * Scalar factor of B. + * + * @param[in,out] B + * Matrix of size LDB-by-N. + * On exit, B = alpha * op(A) + beta * B + * + * @param[in] LDB + * Leading dimension of the array B. LDB >= max(1,M) + * + ******************************************************************************* + * + * @return + * \retval MORSE_SUCCESS successful exit + * \retval <0 if -i, the i-th argument had an illegal value + * + ******************************************************************************/ +int CUDA_zgeadd(MORSE_enum trans, + int m, int n, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *beta, + cuDoubleComplex *B, int ldb, + CUBLAS_STREAM_PARAM) +{ + cublasZgeam(CUBLAS_HANDLE + morse_cublas_const(trans), morse_cublas_const(MorseNoTrans), + m, n, + CUBLAS_VALUE(alpha), A, lda, + CUBLAS_VALUE(beta), B, ldb, + B, ldb); + + assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); + + return MORSE_SUCCESS; +} diff --git a/cudablas/include/cudablas_z.h b/cudablas/include/cudablas_z.h index 911d1ef93328392186f6063e66e9b21572596844..7d9ccbbf7228ff7f8ba48929e345ee1bf74469d5 100644 --- a/cudablas/include/cudablas_z.h +++ b/cudablas/include/cudablas_z.h @@ -35,39 +35,40 @@ extern "C" { /** **************************************************************************** * Declarations of cuda kernels - alphabetical order **/ -int CUDA_zgemerge( MORSE_enum side, MORSE_enum diag, int M, int N, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, CUBLAS_STREAM_PARAM); -int CUDA_zgemm( MORSE_enum transa, MORSE_enum transb, int m, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM); -int CUDA_zhemm( MORSE_enum side, MORSE_enum uplo, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM); -int CUDA_zher2k( MORSE_enum uplo, MORSE_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, double *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM); +int CUDA_zgeadd( MORSE_enum trans, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *beta, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM ); +int CUDA_zgemerge( MORSE_enum side, MORSE_enum diag, int M, int N, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, CUBLAS_STREAM_PARAM ); +int CUDA_zgemm( MORSE_enum transa, MORSE_enum transb, int m, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); +int CUDA_zhemm( MORSE_enum side, MORSE_enum uplo, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); +int CUDA_zher2k( MORSE_enum uplo, MORSE_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, double *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); int CUDA_zherfb( MORSE_enum uplo, int n, int k, int ib, int nb, const cuDoubleComplex *A, int lda, const cuDoubleComplex *T, int ldt, cuDoubleComplex *C, int ldc, cuDoubleComplex *WORK, int ldwork, CUBLAS_STREAM_PARAM ); -int CUDA_zherk( MORSE_enum uplo, MORSE_enum trans, int n, int k, double *alpha, const cuDoubleComplex *A, int lda, double *beta, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM); +int CUDA_zherk( MORSE_enum uplo, MORSE_enum trans, int n, int k, double *alpha, const cuDoubleComplex *A, int lda, double *beta, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM ); int CUDA_zlarfb(MORSE_enum side, MORSE_enum trans, MORSE_enum direct, MORSE_enum storev, int M, int N, int K, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM ); int CUDA_zparfb(MORSE_enum side, MORSE_enum trans, MORSE_enum direct, MORSE_enum storev, int M1, int N1, int M2, int N2, int K, int L, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM ); -int CUDA_zsymm( MORSE_enum side, MORSE_enum uplo, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM); -int CUDA_zsyr2k( MORSE_enum uplo, MORSE_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM); -int CUDA_zsyrk( MORSE_enum uplo, MORSE_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM); +int CUDA_zsymm( MORSE_enum side, MORSE_enum uplo, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); +int CUDA_zsyr2k( MORSE_enum uplo, MORSE_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); +int CUDA_zsyrk( MORSE_enum uplo, MORSE_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); int CUDA_ztpmqrt( MORSE_enum side, MORSE_enum trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, CUBLAS_STREAM_PARAM ); -int CUDA_ztrmm( MORSE_enum side, MORSE_enum uplo, MORSE_enum transa, MORSE_enum diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM); -int CUDA_ztrsm( MORSE_enum side, MORSE_enum uplo, MORSE_enum transa, MORSE_enum diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM); -int CUDA_ztsmlq( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM); -int CUDA_ztsmqr( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM); -int CUDA_zttmqr( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM); +int CUDA_ztrmm( MORSE_enum side, MORSE_enum uplo, MORSE_enum transa, MORSE_enum diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM ); +int CUDA_ztrsm( MORSE_enum side, MORSE_enum uplo, MORSE_enum transa, MORSE_enum diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM ); +int CUDA_ztsmlq( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM ); +int CUDA_ztsmqr( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM ); +int CUDA_zttmqr( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM ); int CUDA_zunmlqt(MORSE_enum side, MORSE_enum trans, int M, int N, int K, int IB, const cuDoubleComplex *A, int LDA, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM ); int CUDA_zunmqrt(MORSE_enum side, MORSE_enum trans, int M, int N, int K, int IB, const cuDoubleComplex *A, int LDA, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM ); #if defined(CHAMELEON_USE_MAGMA) -int CUDA_zgelqt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *v, magma_int_t ldv, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM); -int CUDA_zgeqrt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *v, magma_int_t ldv, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM); -int CUDA_zgessm( char storev, magma_int_t m, magma_int_t n, magma_int_t k, magma_int_t ib, magma_int_t *ipiv, cuDoubleComplex *dL1, magma_int_t lddl1, cuDoubleComplex *dL, magma_int_t lddl, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info); -int CUDA_zgetrf_incpiv( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, magma_int_t *ipiv, cuDoubleComplex *dwork, magma_int_t lddwork, magma_int_t *info); -int CUDA_zgetrf_nopiv( magma_int_t m, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info); -int CUDA_zlauum( char uplo, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info); -int CUDA_zpotrf( magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info); -int CUDA_zssssm( magma_storev_t storev, magma_int_t m1, magma_int_t n1, magma_int_t m2, magma_int_t n2, magma_int_t k, magma_int_t ib, magmaDoubleComplex *dA1, magma_int_t ldda1, magmaDoubleComplex *dA2, magma_int_t ldda2, magmaDoubleComplex *dL1, magma_int_t lddl1, magmaDoubleComplex *dL2, magma_int_t lddl2, magma_int_t *IPIV, magma_int_t *info); -int CUDA_ztrtri( magma_uplo_t uplo, magma_diag_t diag, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info); -int CUDA_ztslqt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da1, magma_int_t ldda1, magmaDoubleComplex *da2, magma_int_t ldda2, magmaDoubleComplex *a2, magma_int_t lda2, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM); -int CUDA_ztsqrt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da1, magma_int_t ldda1, magmaDoubleComplex *da2, magma_int_t ldda2, magmaDoubleComplex *a2, magma_int_t lda2, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM); -int CUDA_ztstrf( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, magma_int_t nb, cuDoubleComplex *hU, magma_int_t ldhu, cuDoubleComplex *dU, magma_int_t lddu, cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, magma_int_t *ipiv, cuDoubleComplex *hwork, magma_int_t ldhwork, cuDoubleComplex *dwork, magma_int_t lddwork, magma_int_t *info); +int CUDA_zgelqt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *v, magma_int_t ldv, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM ); +int CUDA_zgeqrt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *v, magma_int_t ldv, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM ); +int CUDA_zgessm( char storev, magma_int_t m, magma_int_t n, magma_int_t k, magma_int_t ib, magma_int_t *ipiv, cuDoubleComplex *dL1, magma_int_t lddl1, cuDoubleComplex *dL, magma_int_t lddl, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info ); +int CUDA_zgetrf_incpiv( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, magma_int_t *ipiv, cuDoubleComplex *dwork, magma_int_t lddwork, magma_int_t *info ); +int CUDA_zgetrf_nopiv( magma_int_t m, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info ); +int CUDA_zlauum( char uplo, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info ); +int CUDA_zpotrf( magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info ); +int CUDA_zssssm( magma_storev_t storev, magma_int_t m1, magma_int_t n1, magma_int_t m2, magma_int_t n2, magma_int_t k, magma_int_t ib, magmaDoubleComplex *dA1, magma_int_t ldda1, magmaDoubleComplex *dA2, magma_int_t ldda2, magmaDoubleComplex *dL1, magma_int_t lddl1, magmaDoubleComplex *dL2, magma_int_t lddl2, magma_int_t *IPIV, magma_int_t *info ); +int CUDA_ztrtri( magma_uplo_t uplo, magma_diag_t diag, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info ); +int CUDA_ztslqt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da1, magma_int_t ldda1, magmaDoubleComplex *da2, magma_int_t ldda2, magmaDoubleComplex *a2, magma_int_t lda2, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM ); +int CUDA_ztsqrt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da1, magma_int_t ldda1, magmaDoubleComplex *da2, magma_int_t ldda2, magmaDoubleComplex *a2, magma_int_t lda2, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM ); +int CUDA_ztstrf( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, magma_int_t nb, cuDoubleComplex *hU, magma_int_t ldhu, cuDoubleComplex *dU, magma_int_t lddu, cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, magma_int_t *ipiv, cuDoubleComplex *hwork, magma_int_t ldhwork, cuDoubleComplex *dwork, magma_int_t lddwork, magma_int_t *info ); #endif #ifdef __cplusplus diff --git a/runtime/starpu/codelets/codelet_zgeadd.c b/runtime/starpu/codelets/codelet_zgeadd.c index edfd5461b71160c28c5361926ff63dd8d15e8d1c..a65b6daf5345bf539002703dea82ff61d9244905 100644 --- a/runtime/starpu/codelets/codelet_zgeadd.c +++ b/runtime/starpu/codelets/codelet_zgeadd.c @@ -137,9 +137,47 @@ static void cl_zgeadd_cpu_func(void *descr[], void *cl_arg) CORE_zgeadd(trans, M, N, alpha, A, LDA, beta, B, LDB); return; } + +#ifdef CHAMELEON_USE_CUBLAS_V2 +static void cl_zgeadd_cuda_func(void *descr[], void *cl_arg) +{ + MORSE_enum trans; + int M; + int N; + cuDoubleComplex alpha; + const cuDoubleComplex *A; + int lda; + cuDoubleComplex beta; + cuDoubleComplex *B; + int ldb; + + A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); + B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); + starpu_codelet_unpack_args(cl_arg, &trans, &M, &N, &alpha, &lda, &beta, &ldb); + + RUNTIME_getStream( stream ); + + CUDA_zgeadd( + trans, + M, N, + &alpha, A, lda, + &beta, B, ldb, + stream); + +#ifndef STARPU_CUDA_ASYNC + cudaStreamSynchronize( stream ); +#endif + + return; +} +#endif /* defined(CHAMELEON_USE_CUBLAS_V2) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ +#if defined(CHAMELEON_USE_CUBLAS_V2) +CODELETS(zgeadd, 2, cl_zgeadd_cpu_func, cl_zgeadd_cuda_func, STARPU_CUDA_ASYNC) +#else CODELETS_CPU(zgeadd, 2, cl_zgeadd_cpu_func) +#endif