From 7034ee8ffd5f91163f1c90c5f8535ae79f362c27 Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Mon, 10 Apr 2017 23:27:28 +0200
Subject: [PATCH] Add geadd GPU kernel

---
 cudablas/compute/CMakeLists.txt          |  7 ++++
 cudablas/compute/cuda_zgeadd.c           | 23 +++++------
 cudablas/include/cudablas_z.h            | 51 ++++++++++++------------
 runtime/starpu/codelets/codelet_zgeadd.c | 38 ++++++++++++++++++
 4 files changed, 82 insertions(+), 37 deletions(-)

diff --git a/cudablas/compute/CMakeLists.txt b/cudablas/compute/CMakeLists.txt
index 39a9d20be..20e012939 100644
--- a/cudablas/compute/CMakeLists.txt
+++ b/cudablas/compute/CMakeLists.txt
@@ -48,6 +48,13 @@ set(ZSRC
     cuda_zunmqrt.c
     )
 
+if( CHAMELEON_USE_CUBLAS_V2 )
+  set(ZSRC
+    ${ZSRC}
+    cuda_zgeadd.c
+    )
+endif( CHAMELEON_USE_CUBLAS_V2 )
+
 if( CHAMELEON_USE_MAGMA )
   set(ZSRC
     ${ZSRC}
diff --git a/cudablas/compute/cuda_zgeadd.c b/cudablas/compute/cuda_zgeadd.c
index 7707f09e1..43b27291f 100644
--- a/cudablas/compute/cuda_zgeadd.c
+++ b/cudablas/compute/cuda_zgeadd.c
@@ -29,21 +29,20 @@
 #error "This file requires cublas api v2 support"
 #endif
 
-int CUDA_zgeadd(MORSE_enum transa, MORSE_enum transb,
-               int m, int n, int k,
-               cuDoubleComplex *alpha,
-               const cuDoubleComplex *A, int lda,
-               const cuDoubleComplex *B, int ldb,
-               cuDoubleComplex *beta,
-               cuDoubleComplex *C, int ldc,
-               CUBLAS_STREAM_PARAM)
+int CUDA_zgeadd(MORSE_enum trans,
+                int m, int n,
+                const cuDoubleComplex *alpha,
+                const cuDoubleComplex *A, int lda,
+                const cuDoubleComplex *beta,
+                cuDoubleComplex *B, int ldb,
+                CUBLAS_STREAM_PARAM)
 {
     cublasZgeam(CUBLAS_HANDLE
-                morse_cublas_const(transa), morse_cublas_const(transb),
-                m, n, k,
+                morse_cublas_const(trans), morse_cublas_const(MorseNoTrans),
+                m, n,
                 CUBLAS_VALUE(alpha), A, lda,
-                                     B, ldb,
-                CUBLAS_VALUE(beta),  C, ldc);
+                CUBLAS_VALUE(beta),  B, ldb,
+                B, ldb);
 
     assert( CUBLAS_STATUS_SUCCESS == cublasGetError() );
 
diff --git a/cudablas/include/cudablas_z.h b/cudablas/include/cudablas_z.h
index 911d1ef93..7d9ccbbf7 100644
--- a/cudablas/include/cudablas_z.h
+++ b/cudablas/include/cudablas_z.h
@@ -35,39 +35,40 @@ extern "C" {
 /** ****************************************************************************
  *  Declarations of cuda kernels - alphabetical order
  **/
-int CUDA_zgemerge( MORSE_enum side, MORSE_enum diag, int M, int N, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, CUBLAS_STREAM_PARAM);
-int CUDA_zgemm(  MORSE_enum transa, MORSE_enum transb, int m, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM);
-int CUDA_zhemm(  MORSE_enum side, MORSE_enum uplo, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM);
-int CUDA_zher2k( MORSE_enum uplo, MORSE_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, double *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM);
+int CUDA_zgeadd( MORSE_enum trans, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *beta, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM );
+int CUDA_zgemerge( MORSE_enum side, MORSE_enum diag, int M, int N, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, CUBLAS_STREAM_PARAM );
+int CUDA_zgemm(  MORSE_enum transa, MORSE_enum transb, int m, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM );
+int CUDA_zhemm(  MORSE_enum side, MORSE_enum uplo, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM );
+int CUDA_zher2k( MORSE_enum uplo, MORSE_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, double *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM );
 int CUDA_zherfb( MORSE_enum uplo, int n, int k, int ib, int nb, const cuDoubleComplex *A, int lda, const cuDoubleComplex *T, int ldt, cuDoubleComplex *C, int ldc, cuDoubleComplex *WORK, int ldwork, CUBLAS_STREAM_PARAM );
-int CUDA_zherk(  MORSE_enum uplo, MORSE_enum trans, int n, int k, double *alpha, const cuDoubleComplex *A, int lda, double *beta, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM);
+int CUDA_zherk(  MORSE_enum uplo, MORSE_enum trans, int n, int k, double *alpha, const cuDoubleComplex *A, int lda, double *beta, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM );
 int CUDA_zlarfb(MORSE_enum side, MORSE_enum trans, MORSE_enum direct, MORSE_enum storev, int M, int N, int K, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM );
 int CUDA_zparfb(MORSE_enum side, MORSE_enum trans, MORSE_enum direct, MORSE_enum storev, int M1, int N1, int M2, int N2, int K, int L, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM );
-int CUDA_zsymm(  MORSE_enum side, MORSE_enum uplo, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM);
-int CUDA_zsyr2k( MORSE_enum uplo, MORSE_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM);
-int CUDA_zsyrk(  MORSE_enum uplo, MORSE_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM);
+int CUDA_zsymm(  MORSE_enum side, MORSE_enum uplo, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM );
+int CUDA_zsyr2k( MORSE_enum uplo, MORSE_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM );
+int CUDA_zsyrk(  MORSE_enum uplo, MORSE_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM );
 int CUDA_ztpmqrt( MORSE_enum side, MORSE_enum trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, CUBLAS_STREAM_PARAM );
-int CUDA_ztrmm(  MORSE_enum side, MORSE_enum uplo, MORSE_enum transa, MORSE_enum diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM);
-int CUDA_ztrsm(  MORSE_enum side, MORSE_enum uplo, MORSE_enum transa, MORSE_enum diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM);
-int CUDA_ztsmlq( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM);
-int CUDA_ztsmqr( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM);
-int CUDA_zttmqr( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM);
+int CUDA_ztrmm(  MORSE_enum side, MORSE_enum uplo, MORSE_enum transa, MORSE_enum diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM );
+int CUDA_ztrsm(  MORSE_enum side, MORSE_enum uplo, MORSE_enum transa, MORSE_enum diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM );
+int CUDA_ztsmlq( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM );
+int CUDA_ztsmqr( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM );
+int CUDA_zttmqr( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM );
 int CUDA_zunmlqt(MORSE_enum side, MORSE_enum trans, int M, int N, int K, int IB, const cuDoubleComplex *A,    int LDA, const cuDoubleComplex *T,    int LDT, cuDoubleComplex *C,    int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM );
 int CUDA_zunmqrt(MORSE_enum side, MORSE_enum trans, int M, int N, int K, int IB, const cuDoubleComplex *A,    int LDA, const cuDoubleComplex *T,    int LDT, cuDoubleComplex *C,    int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM );
 
 #if defined(CHAMELEON_USE_MAGMA)
-int CUDA_zgelqt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *v, magma_int_t ldv, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM);
-int CUDA_zgeqrt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *v, magma_int_t ldv, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM);
-int CUDA_zgessm( char storev, magma_int_t m, magma_int_t n, magma_int_t k, magma_int_t ib, magma_int_t *ipiv, cuDoubleComplex *dL1, magma_int_t lddl1, cuDoubleComplex *dL, magma_int_t lddl, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info);
-int CUDA_zgetrf_incpiv( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, magma_int_t *ipiv, cuDoubleComplex *dwork, magma_int_t lddwork, magma_int_t *info);
-int CUDA_zgetrf_nopiv( magma_int_t m, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info);
-int CUDA_zlauum( char uplo, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info);
-int CUDA_zpotrf( magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info);
-int CUDA_zssssm( magma_storev_t storev, magma_int_t m1, magma_int_t n1, magma_int_t m2, magma_int_t n2, magma_int_t k, magma_int_t ib, magmaDoubleComplex *dA1, magma_int_t ldda1, magmaDoubleComplex *dA2, magma_int_t ldda2, magmaDoubleComplex *dL1, magma_int_t lddl1, magmaDoubleComplex *dL2, magma_int_t lddl2, magma_int_t *IPIV, magma_int_t *info);
-int CUDA_ztrtri( magma_uplo_t uplo, magma_diag_t diag, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info);
-int CUDA_ztslqt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da1, magma_int_t ldda1, magmaDoubleComplex *da2, magma_int_t ldda2, magmaDoubleComplex *a2, magma_int_t lda2, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM);
-int CUDA_ztsqrt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da1, magma_int_t ldda1, magmaDoubleComplex *da2, magma_int_t ldda2, magmaDoubleComplex *a2, magma_int_t lda2, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM);
-int CUDA_ztstrf( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, magma_int_t nb, cuDoubleComplex *hU, magma_int_t ldhu, cuDoubleComplex *dU, magma_int_t lddu, cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, magma_int_t *ipiv, cuDoubleComplex *hwork, magma_int_t ldhwork, cuDoubleComplex *dwork, magma_int_t lddwork, magma_int_t *info);
+int CUDA_zgelqt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *v, magma_int_t ldv, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM );
+int CUDA_zgeqrt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *v, magma_int_t ldv, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM );
+int CUDA_zgessm( char storev, magma_int_t m, magma_int_t n, magma_int_t k, magma_int_t ib, magma_int_t *ipiv, cuDoubleComplex *dL1, magma_int_t lddl1, cuDoubleComplex *dL, magma_int_t lddl, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info );
+int CUDA_zgetrf_incpiv( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, magma_int_t *ipiv, cuDoubleComplex *dwork, magma_int_t lddwork, magma_int_t *info );
+int CUDA_zgetrf_nopiv( magma_int_t m, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info );
+int CUDA_zlauum( char uplo, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info );
+int CUDA_zpotrf( magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info );
+int CUDA_zssssm( magma_storev_t storev, magma_int_t m1, magma_int_t n1, magma_int_t m2, magma_int_t n2, magma_int_t k, magma_int_t ib, magmaDoubleComplex *dA1, magma_int_t ldda1, magmaDoubleComplex *dA2, magma_int_t ldda2, magmaDoubleComplex *dL1, magma_int_t lddl1, magmaDoubleComplex *dL2, magma_int_t lddl2, magma_int_t *IPIV, magma_int_t *info );
+int CUDA_ztrtri( magma_uplo_t uplo, magma_diag_t diag, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info );
+int CUDA_ztslqt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da1, magma_int_t ldda1, magmaDoubleComplex *da2, magma_int_t ldda2, magmaDoubleComplex *a2, magma_int_t lda2, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM );
+int CUDA_ztsqrt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da1, magma_int_t ldda1, magmaDoubleComplex *da2, magma_int_t ldda2, magmaDoubleComplex *a2, magma_int_t lda2, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM );
+int CUDA_ztstrf( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, magma_int_t nb, cuDoubleComplex *hU, magma_int_t ldhu, cuDoubleComplex *dU, magma_int_t lddu, cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, magma_int_t *ipiv, cuDoubleComplex *hwork, magma_int_t ldhwork, cuDoubleComplex *dwork, magma_int_t lddwork, magma_int_t *info );
 #endif
 
 #ifdef __cplusplus
diff --git a/runtime/starpu/codelets/codelet_zgeadd.c b/runtime/starpu/codelets/codelet_zgeadd.c
index edfd5461b..a65b6daf5 100644
--- a/runtime/starpu/codelets/codelet_zgeadd.c
+++ b/runtime/starpu/codelets/codelet_zgeadd.c
@@ -137,9 +137,47 @@ static void cl_zgeadd_cpu_func(void *descr[], void *cl_arg)
     CORE_zgeadd(trans, M, N, alpha, A, LDA, beta, B, LDB);
     return;
 }
+
+#ifdef CHAMELEON_USE_CUBLAS_V2
+static void cl_zgeadd_cuda_func(void *descr[], void *cl_arg)
+{
+    MORSE_enum trans;
+    int M;
+    int N;
+    cuDoubleComplex alpha;
+    const cuDoubleComplex *A;
+    int lda;
+    cuDoubleComplex beta;
+    cuDoubleComplex *B;
+    int ldb;
+
+    A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
+    B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
+    starpu_codelet_unpack_args(cl_arg, &trans, &M, &N, &alpha, &lda, &beta, &ldb);
+
+    RUNTIME_getStream( stream );
+
+    CUDA_zgeadd(
+        trans,
+        M, N,
+        &alpha, A, lda,
+        &beta,  B, ldb,
+        stream);
+
+#ifndef STARPU_CUDA_ASYNC
+    cudaStreamSynchronize( stream );
+#endif
+
+    return;
+}
+#endif /* defined(CHAMELEON_USE_CUBLAS_V2) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
+#if defined(CHAMELEON_USE_CUBLAS_V2)
+CODELETS(zgeadd, 2, cl_zgeadd_cpu_func, cl_zgeadd_cuda_func, STARPU_CUDA_ASYNC)
+#else
 CODELETS_CPU(zgeadd, 2, cl_zgeadd_cpu_func)
+#endif
-- 
GitLab