From 7034ee8ffd5f91163f1c90c5f8535ae79f362c27 Mon Sep 17 00:00:00 2001 From: Mathieu Faverge <mathieu.faverge@inria.fr> Date: Mon, 10 Apr 2017 23:27:28 +0200 Subject: [PATCH] Add geadd GPU kernel --- cudablas/compute/CMakeLists.txt | 7 ++++ cudablas/compute/cuda_zgeadd.c | 23 +++++------ cudablas/include/cudablas_z.h | 51 ++++++++++++------------ runtime/starpu/codelets/codelet_zgeadd.c | 38 ++++++++++++++++++ 4 files changed, 82 insertions(+), 37 deletions(-) diff --git a/cudablas/compute/CMakeLists.txt b/cudablas/compute/CMakeLists.txt index 39a9d20be..20e012939 100644 --- a/cudablas/compute/CMakeLists.txt +++ b/cudablas/compute/CMakeLists.txt @@ -48,6 +48,13 @@ set(ZSRC cuda_zunmqrt.c ) +if( CHAMELEON_USE_CUBLAS_V2 ) + set(ZSRC + ${ZSRC} + cuda_zgeadd.c + ) +endif( CHAMELEON_USE_CUBLAS_V2 ) + if( CHAMELEON_USE_MAGMA ) set(ZSRC ${ZSRC} diff --git a/cudablas/compute/cuda_zgeadd.c b/cudablas/compute/cuda_zgeadd.c index 7707f09e1..43b27291f 100644 --- a/cudablas/compute/cuda_zgeadd.c +++ b/cudablas/compute/cuda_zgeadd.c @@ -29,21 +29,20 @@ #error "This file requires cublas api v2 support" #endif -int CUDA_zgeadd(MORSE_enum transa, MORSE_enum transb, - int m, int n, int k, - cuDoubleComplex *alpha, - const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, - cuDoubleComplex *beta, - cuDoubleComplex *C, int ldc, - CUBLAS_STREAM_PARAM) +int CUDA_zgeadd(MORSE_enum trans, + int m, int n, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *beta, + cuDoubleComplex *B, int ldb, + CUBLAS_STREAM_PARAM) { cublasZgeam(CUBLAS_HANDLE - morse_cublas_const(transa), morse_cublas_const(transb), - m, n, k, + morse_cublas_const(trans), morse_cublas_const(MorseNoTrans), + m, n, CUBLAS_VALUE(alpha), A, lda, - B, ldb, - CUBLAS_VALUE(beta), C, ldc); + CUBLAS_VALUE(beta), B, ldb, + B, ldb); assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); diff --git a/cudablas/include/cudablas_z.h b/cudablas/include/cudablas_z.h index 911d1ef93..7d9ccbbf7 100644 --- a/cudablas/include/cudablas_z.h +++ b/cudablas/include/cudablas_z.h @@ -35,39 +35,40 @@ extern "C" { /** **************************************************************************** * Declarations of cuda kernels - alphabetical order **/ -int CUDA_zgemerge( MORSE_enum side, MORSE_enum diag, int M, int N, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, CUBLAS_STREAM_PARAM); -int CUDA_zgemm( MORSE_enum transa, MORSE_enum transb, int m, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM); -int CUDA_zhemm( MORSE_enum side, MORSE_enum uplo, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM); -int CUDA_zher2k( MORSE_enum uplo, MORSE_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, double *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM); +int CUDA_zgeadd( MORSE_enum trans, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *beta, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM ); +int CUDA_zgemerge( MORSE_enum side, MORSE_enum diag, int M, int N, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, CUBLAS_STREAM_PARAM ); +int CUDA_zgemm( MORSE_enum transa, MORSE_enum transb, int m, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); +int CUDA_zhemm( MORSE_enum side, MORSE_enum uplo, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); +int CUDA_zher2k( MORSE_enum uplo, MORSE_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, double *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); int CUDA_zherfb( MORSE_enum uplo, int n, int k, int ib, int nb, const cuDoubleComplex *A, int lda, const cuDoubleComplex *T, int ldt, cuDoubleComplex *C, int ldc, cuDoubleComplex *WORK, int ldwork, CUBLAS_STREAM_PARAM ); -int CUDA_zherk( MORSE_enum uplo, MORSE_enum trans, int n, int k, double *alpha, const cuDoubleComplex *A, int lda, double *beta, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM); +int CUDA_zherk( MORSE_enum uplo, MORSE_enum trans, int n, int k, double *alpha, const cuDoubleComplex *A, int lda, double *beta, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM ); int CUDA_zlarfb(MORSE_enum side, MORSE_enum trans, MORSE_enum direct, MORSE_enum storev, int M, int N, int K, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM ); int CUDA_zparfb(MORSE_enum side, MORSE_enum trans, MORSE_enum direct, MORSE_enum storev, int M1, int N1, int M2, int N2, int K, int L, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM ); -int CUDA_zsymm( MORSE_enum side, MORSE_enum uplo, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM); -int CUDA_zsyr2k( MORSE_enum uplo, MORSE_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM); -int CUDA_zsyrk( MORSE_enum uplo, MORSE_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM); +int CUDA_zsymm( MORSE_enum side, MORSE_enum uplo, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); +int CUDA_zsyr2k( MORSE_enum uplo, MORSE_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); +int CUDA_zsyrk( MORSE_enum uplo, MORSE_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); int CUDA_ztpmqrt( MORSE_enum side, MORSE_enum trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, CUBLAS_STREAM_PARAM ); -int CUDA_ztrmm( MORSE_enum side, MORSE_enum uplo, MORSE_enum transa, MORSE_enum diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM); -int CUDA_ztrsm( MORSE_enum side, MORSE_enum uplo, MORSE_enum transa, MORSE_enum diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM); -int CUDA_ztsmlq( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM); -int CUDA_ztsmqr( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM); -int CUDA_zttmqr( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM); +int CUDA_ztrmm( MORSE_enum side, MORSE_enum uplo, MORSE_enum transa, MORSE_enum diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM ); +int CUDA_ztrsm( MORSE_enum side, MORSE_enum uplo, MORSE_enum transa, MORSE_enum diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM ); +int CUDA_ztsmlq( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM ); +int CUDA_ztsmqr( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM ); +int CUDA_zttmqr( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM ); int CUDA_zunmlqt(MORSE_enum side, MORSE_enum trans, int M, int N, int K, int IB, const cuDoubleComplex *A, int LDA, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM ); int CUDA_zunmqrt(MORSE_enum side, MORSE_enum trans, int M, int N, int K, int IB, const cuDoubleComplex *A, int LDA, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM ); #if defined(CHAMELEON_USE_MAGMA) -int CUDA_zgelqt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *v, magma_int_t ldv, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM); -int CUDA_zgeqrt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *v, magma_int_t ldv, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM); -int CUDA_zgessm( char storev, magma_int_t m, magma_int_t n, magma_int_t k, magma_int_t ib, magma_int_t *ipiv, cuDoubleComplex *dL1, magma_int_t lddl1, cuDoubleComplex *dL, magma_int_t lddl, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info); -int CUDA_zgetrf_incpiv( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, magma_int_t *ipiv, cuDoubleComplex *dwork, magma_int_t lddwork, magma_int_t *info); -int CUDA_zgetrf_nopiv( magma_int_t m, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info); -int CUDA_zlauum( char uplo, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info); -int CUDA_zpotrf( magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info); -int CUDA_zssssm( magma_storev_t storev, magma_int_t m1, magma_int_t n1, magma_int_t m2, magma_int_t n2, magma_int_t k, magma_int_t ib, magmaDoubleComplex *dA1, magma_int_t ldda1, magmaDoubleComplex *dA2, magma_int_t ldda2, magmaDoubleComplex *dL1, magma_int_t lddl1, magmaDoubleComplex *dL2, magma_int_t lddl2, magma_int_t *IPIV, magma_int_t *info); -int CUDA_ztrtri( magma_uplo_t uplo, magma_diag_t diag, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info); -int CUDA_ztslqt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da1, magma_int_t ldda1, magmaDoubleComplex *da2, magma_int_t ldda2, magmaDoubleComplex *a2, magma_int_t lda2, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM); -int CUDA_ztsqrt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da1, magma_int_t ldda1, magmaDoubleComplex *da2, magma_int_t ldda2, magmaDoubleComplex *a2, magma_int_t lda2, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM); -int CUDA_ztstrf( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, magma_int_t nb, cuDoubleComplex *hU, magma_int_t ldhu, cuDoubleComplex *dU, magma_int_t lddu, cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, magma_int_t *ipiv, cuDoubleComplex *hwork, magma_int_t ldhwork, cuDoubleComplex *dwork, magma_int_t lddwork, magma_int_t *info); +int CUDA_zgelqt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *v, magma_int_t ldv, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM ); +int CUDA_zgeqrt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *v, magma_int_t ldv, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM ); +int CUDA_zgessm( char storev, magma_int_t m, magma_int_t n, magma_int_t k, magma_int_t ib, magma_int_t *ipiv, cuDoubleComplex *dL1, magma_int_t lddl1, cuDoubleComplex *dL, magma_int_t lddl, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info ); +int CUDA_zgetrf_incpiv( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, magma_int_t *ipiv, cuDoubleComplex *dwork, magma_int_t lddwork, magma_int_t *info ); +int CUDA_zgetrf_nopiv( magma_int_t m, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info ); +int CUDA_zlauum( char uplo, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info ); +int CUDA_zpotrf( magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info ); +int CUDA_zssssm( magma_storev_t storev, magma_int_t m1, magma_int_t n1, magma_int_t m2, magma_int_t n2, magma_int_t k, magma_int_t ib, magmaDoubleComplex *dA1, magma_int_t ldda1, magmaDoubleComplex *dA2, magma_int_t ldda2, magmaDoubleComplex *dL1, magma_int_t lddl1, magmaDoubleComplex *dL2, magma_int_t lddl2, magma_int_t *IPIV, magma_int_t *info ); +int CUDA_ztrtri( magma_uplo_t uplo, magma_diag_t diag, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info ); +int CUDA_ztslqt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da1, magma_int_t ldda1, magmaDoubleComplex *da2, magma_int_t ldda2, magmaDoubleComplex *a2, magma_int_t lda2, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM ); +int CUDA_ztsqrt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da1, magma_int_t ldda1, magmaDoubleComplex *da2, magma_int_t ldda2, magmaDoubleComplex *a2, magma_int_t lda2, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM ); +int CUDA_ztstrf( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, magma_int_t nb, cuDoubleComplex *hU, magma_int_t ldhu, cuDoubleComplex *dU, magma_int_t lddu, cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, magma_int_t *ipiv, cuDoubleComplex *hwork, magma_int_t ldhwork, cuDoubleComplex *dwork, magma_int_t lddwork, magma_int_t *info ); #endif #ifdef __cplusplus diff --git a/runtime/starpu/codelets/codelet_zgeadd.c b/runtime/starpu/codelets/codelet_zgeadd.c index edfd5461b..a65b6daf5 100644 --- a/runtime/starpu/codelets/codelet_zgeadd.c +++ b/runtime/starpu/codelets/codelet_zgeadd.c @@ -137,9 +137,47 @@ static void cl_zgeadd_cpu_func(void *descr[], void *cl_arg) CORE_zgeadd(trans, M, N, alpha, A, LDA, beta, B, LDB); return; } + +#ifdef CHAMELEON_USE_CUBLAS_V2 +static void cl_zgeadd_cuda_func(void *descr[], void *cl_arg) +{ + MORSE_enum trans; + int M; + int N; + cuDoubleComplex alpha; + const cuDoubleComplex *A; + int lda; + cuDoubleComplex beta; + cuDoubleComplex *B; + int ldb; + + A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); + B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); + starpu_codelet_unpack_args(cl_arg, &trans, &M, &N, &alpha, &lda, &beta, &ldb); + + RUNTIME_getStream( stream ); + + CUDA_zgeadd( + trans, + M, N, + &alpha, A, lda, + &beta, B, ldb, + stream); + +#ifndef STARPU_CUDA_ASYNC + cudaStreamSynchronize( stream ); +#endif + + return; +} +#endif /* defined(CHAMELEON_USE_CUBLAS_V2) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ +#if defined(CHAMELEON_USE_CUBLAS_V2) +CODELETS(zgeadd, 2, cl_zgeadd_cpu_func, cl_zgeadd_cuda_func, STARPU_CUDA_ASYNC) +#else CODELETS_CPU(zgeadd, 2, cl_zgeadd_cpu_func) +#endif -- GitLab