From 3283225b1bb24dee2757594ccb1abb701a42638a Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Tue, 6 Oct 2020 11:48:40 +0200
Subject: [PATCH] Add BLAS 2 xgemv kernel

---
 coreblas/compute/CMakeLists.txt             |   1 +
 coreblas/compute/core_zgemv.c               |  98 +++++++++++++++
 coreblas/compute/core_ztile.c               |  13 ++
 coreblas/include/coreblas/coreblas_z.h      |   4 +
 coreblas/include/coreblas/coreblas_ztile.h  |   1 +
 include/chameleon/tasks_z.h                 |   5 +
 runtime/CMakeLists.txt                      |   4 +
 runtime/openmp/codelets/codelet_zgemv.c     |  37 ++++++
 runtime/parsec/codelets/codelet_zgemv.c     |  76 +++++++++++
 runtime/quark/codelets/codelet_zgemv.c      |  66 ++++++++++
 runtime/starpu/codelets/codelet_zcallback.c |   1 +
 runtime/starpu/codelets/codelet_zgemv.c     | 133 ++++++++++++++++++++
 runtime/starpu/include/runtime_codelet_z.h  |   5 +
 13 files changed, 444 insertions(+)
 create mode 100644 coreblas/compute/core_zgemv.c
 create mode 100644 runtime/openmp/codelets/codelet_zgemv.c
 create mode 100644 runtime/parsec/codelets/codelet_zgemv.c
 create mode 100644 runtime/quark/codelets/codelet_zgemv.c
 create mode 100644 runtime/starpu/codelets/codelet_zgemv.c

diff --git a/coreblas/compute/CMakeLists.txt b/coreblas/compute/CMakeLists.txt
index dd70cd7c7..f0d4976c2 100644
--- a/coreblas/compute/CMakeLists.txt
+++ b/coreblas/compute/CMakeLists.txt
@@ -37,6 +37,7 @@ set(ZSRC
     core_zlascal.c
     core_zlatm1.c
     core_zgelqt.c
+    core_zgemv.c
     core_zgemm.c
     core_zgeqrt.c
     core_zgesplit.c
diff --git a/coreblas/compute/core_zgemv.c b/coreblas/compute/core_zgemv.c
new file mode 100644
index 000000000..8c25d3dba
--- /dev/null
+++ b/coreblas/compute/core_zgemv.c
@@ -0,0 +1,98 @@
+/**
+ *
+ * @file core_zgemv.c
+ *
+ * @copyright 2009-2014 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon core_zgemv CPU kernel
+ *
+ * @version 1.0.0
+ * @author Mathieu Faverge
+ * @date 2020-10-06
+ * @precisions normal z -> c d s
+ *
+ */
+#include "coreblas.h"
+
+/**
+ *******************************************************************************
+ *
+ * @ingroup CORE_CHAMELEON_Complex64_t
+ *
+ *  Performs one of the matrix-vector operations
+ *
+ *    \f[ y = \alpha [op( A ) \times x] + \beta y \f],
+ *
+ *  where op( A ) is one of:
+ *    \f[ op( A ) = A,   \f]
+ *    \f[ op( A ) = A^T, \f]
+ *    \f[ op( A ) = A^H, \f]
+ *
+ *  alpha and beta are scalars, op(A) an m-by-n matrix, and x and y are two vectors.
+ *
+ *******************************************************************************
+ *
+ * @param[in] trans
+ *          - ChamNoTrans:   A is not transposed,
+ *          - ChamTrans:     A is transposed,
+ *          - ChamConjTrans: A is conjugate transposed.
+ *
+ * @param[in] M
+ *          The number of rows of the matrix A. M >= 0.
+ *
+ * @param[in] N
+ *          The number of columns of the matrix A. N >= 0.
+ *
+ * @param[in] alpha
+ *          The scalar alpha.
+ *
+ * @param[in] A
+ *          An lda-by-n matrix, where only the m-by-n leading entries are references.
+ *
+ * @param[in] LDA
+ *          The leading dimension of the array A. LDA >= max(1,M).
+ *
+ * @param[in] x
+ *          X is COMPLEX*16 array, dimension at least
+ *          ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
+ *          and at least
+ *          ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
+ *          Before entry, the incremented array X must contain the
+ *          vector x.
+ *
+ * @param[in] incX
+ *          On entry, INCX specifies the increment for the elements of
+ *          X. INCX must not be zero.
+ *
+ * @param[in] beta
+ *          The scalar beta.
+ *
+ * @param[in] y
+ *          Y is COMPLEX*16 array, dimension at least
+ *          ( 1 + ( n - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
+ *          and at least
+ *          ( 1 + ( m - 1 )*abs( INCY ) ) otherwise.
+ *          Before entry, the incremented array Y must contain the vector y with
+ *          beta != 0. On exit, Y is overwritten by the updated vector.
+ *
+ * @param[in] incY
+ *          On entry, INCY specifies the increment for the elements of
+ *          Y. INCY must not be zero.
+ *
+ */
+void CORE_zgemv( cham_trans_t trans, int M, int N,
+                 CHAMELEON_Complex64_t alpha, const CHAMELEON_Complex64_t *A, int LDA,
+                                              const CHAMELEON_Complex64_t *x, int incX,
+                 CHAMELEON_Complex64_t beta,        CHAMELEON_Complex64_t *y, int incY )
+{
+    cblas_zgemv(
+        CblasColMajor, (CBLAS_TRANSPOSE)trans, M, N,
+        CBLAS_SADDR(alpha), A, LDA,
+                            x, incX,
+        CBLAS_SADDR(beta),  y, incY );
+}
diff --git a/coreblas/compute/core_ztile.c b/coreblas/compute/core_ztile.c
index 377c47253..f27cebfd2 100644
--- a/coreblas/compute/core_ztile.c
+++ b/coreblas/compute/core_ztile.c
@@ -83,6 +83,19 @@ TCORE_zgelqt( int                    M,
     return CORE_zgelqt( M, N, IB, A->mat, A->ld, T->mat, T->ld, TAU, WORK );
 }
 
+void
+TCORE_zgemv( cham_trans_t trans, int M, int N,
+             CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A,
+                                          const CHAM_tile_t *x, int incX,
+             CHAMELEON_Complex64_t beta,        CHAM_tile_t *y, int incY )
+{
+    assert( A->format & CHAMELEON_TILE_FULLRANK );
+    assert( x->format & CHAMELEON_TILE_FULLRANK );
+    assert( y->format & CHAMELEON_TILE_FULLRANK );
+    CORE_zgemv(
+        trans, M, N, alpha, A->mat, A->ld, x->mat, incX, beta, y->mat, incY );
+}
+
 void
 TCORE_zgemm( cham_trans_t          transA,
              cham_trans_t          transB,
diff --git a/coreblas/include/coreblas/coreblas_z.h b/coreblas/include/coreblas/coreblas_z.h
index 806337af2..9f317d4e5 100644
--- a/coreblas/include/coreblas/coreblas_z.h
+++ b/coreblas/include/coreblas/coreblas_z.h
@@ -55,6 +55,10 @@ void CORE_zgemm(cham_trans_t transA, cham_trans_t transB,
                 CHAMELEON_Complex64_t alpha, const CHAMELEON_Complex64_t *A, int LDA,
                                           const CHAMELEON_Complex64_t *B, int LDB,
                 CHAMELEON_Complex64_t beta,        CHAMELEON_Complex64_t *C, int LDC);
+void CORE_zgemv(cham_trans_t trans, int M, int N,
+                CHAMELEON_Complex64_t alpha, const CHAMELEON_Complex64_t *A, int LDA,
+                                             const CHAMELEON_Complex64_t *x, int incX,
+                CHAMELEON_Complex64_t beta,        CHAMELEON_Complex64_t *y, int incY);
 int  CORE_zgeqrt(int M, int N, int IB,
                  CHAMELEON_Complex64_t *A, int LDA,
                  CHAMELEON_Complex64_t *T, int LDT,
diff --git a/coreblas/include/coreblas/coreblas_ztile.h b/coreblas/include/coreblas/coreblas_ztile.h
index ccb6b83b2..c754393a0 100644
--- a/coreblas/include/coreblas/coreblas_ztile.h
+++ b/coreblas/include/coreblas/coreblas_ztile.h
@@ -23,6 +23,7 @@ void TCORE_dzasum( cham_store_t storev, cham_uplo_t uplo, int M, int N, const CH
 int  TCORE_zaxpy( int M, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, int incA, CHAM_tile_t *B, int incB );
 int  TCORE_zgeadd( cham_trans_t trans, int M, int N, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, CHAMELEON_Complex64_t beta, CHAM_tile_t *B );
 int  TCORE_zgelqt( int M, int N, int IB, CHAM_tile_t *A, CHAM_tile_t *T, CHAMELEON_Complex64_t *TAU, CHAMELEON_Complex64_t *WORK );
+void TCORE_zgemv( cham_trans_t trans, int M, int N, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, const CHAM_tile_t *x, int incx, CHAMELEON_Complex64_t beta, CHAM_tile_t *y, int incy );
 void TCORE_zgemm( cham_trans_t transA, cham_trans_t transB, int M, int N, int K, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, const CHAM_tile_t *B, CHAMELEON_Complex64_t beta, CHAM_tile_t *C );
 int  TCORE_zgeqrt( int M, int N, int IB, CHAM_tile_t *A, CHAM_tile_t *T, CHAMELEON_Complex64_t *TAU, CHAMELEON_Complex64_t *WORK );
 int  TCORE_zgessm( int M, int N, int K, int IB, const int *IPIV, const CHAM_tile_t *L, CHAM_tile_t *A );
diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h
index 3cb7c5bc2..a78d30041 100644
--- a/include/chameleon/tasks_z.h
+++ b/include/chameleon/tasks_z.h
@@ -53,6 +53,11 @@ void INSERT_TASK_zgelqt( const RUNTIME_option_t *options,
                          int m, int n, int ib, int nb,
                          const CHAM_desc_t *A, int Am, int An,
                          const CHAM_desc_t *T, int Tm, int Tn );
+void INSERT_TASK_zgemv( const RUNTIME_option_t *options,
+                        cham_trans_t trans, int m, int n,
+                        CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                     const CHAM_desc_t *X, int Xm, int Xn, int incX,
+                        CHAMELEON_Complex64_t beta,  const CHAM_desc_t *Y, int Ym, int Yn, int incY );
 void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
                         cham_trans_t transA, cham_trans_t transB,
                         int m, int n, int k, int nb,
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index 76a0bce72..1caa32fef 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -35,6 +35,10 @@ set(CODELETS_ZSRC
     ##################
     codelets/codelet_zaxpy.c
     ##################
+    # BLAS 2
+    ##################
+    codelets/codelet_zgemv.c
+    ##################
     # BLAS 3
     ##################
     codelets/codelet_zgemm.c
diff --git a/runtime/openmp/codelets/codelet_zgemv.c b/runtime/openmp/codelets/codelet_zgemv.c
new file mode 100644
index 000000000..c3d70b0b4
--- /dev/null
+++ b/runtime/openmp/codelets/codelet_zgemv.c
@@ -0,0 +1,37 @@
+/**
+ *
+ * @file openmp/codelet_zgemv.c
+ *
+ * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zgemv OpenMP codelet
+ *
+ * @version 1.0.0
+ * @author Mathieu Faverge
+ * @date 2020-10-06
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_openmp.h"
+#include "chameleon/tasks_z.h"
+#include "coreblas/coreblas_ztile.h"
+
+void
+INSERT_TASK_zgemv( const RUNTIME_option_t *options,
+                   cham_trans_t trans, int m, int n,
+                   CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                const CHAM_desc_t *X, int Xm, int Xn, int incX,
+                   CHAMELEON_Complex64_t beta,  const CHAM_desc_t *Y, int Ym, int Yn, int incY )
+{
+    CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
+    CHAM_tile_t *tileX = X->get_blktile( X, Xm, Xn );
+    CHAM_tile_t *tileY = Y->get_blktile( Y, Ym, Yn );
+
+#pragma omp task firstprivate( trans, m, n, alpha, tileA, tileX, incX, beta, tileY, incY ) depend( in:tileA[0], tileX[0] ) depend( inout:tileY[0] )
+    TCORE_zgemv( trans,  m, n,
+                 alpha, tileA, tileX, incX,
+                 beta,  tileY, incY );
+}
diff --git a/runtime/parsec/codelets/codelet_zgemv.c b/runtime/parsec/codelets/codelet_zgemv.c
new file mode 100644
index 000000000..0579743d5
--- /dev/null
+++ b/runtime/parsec/codelets/codelet_zgemv.c
@@ -0,0 +1,76 @@
+/**
+ *
+ * @file parsec/codelet_zgemv.c
+ *
+ * @copyright 2009-2015 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zgemv PaRSEC codelet
+ *
+ * @version 1.0.0
+ * @author Mathieu Faverge
+ * @date 2020-10-06
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_parsec.h"
+#include "chameleon/tasks_z.h"
+#include "coreblas/coreblas_z.h"
+
+static inline int
+CORE_zgemv_parsec( parsec_execution_stream_t *context,
+                   parsec_task_t             *this_task )
+{
+    cham_trans_t trans;
+    int m;
+    int n;
+    CHAMELEON_Complex64_t alpha;
+    CHAMELEON_Complex64_t *A;
+    int lda;
+    CHAMELEON_Complex64_t *X;
+    int incX;
+    CHAMELEON_Complex64_t beta;
+    CHAMELEON_Complex64_t *Y;
+    int incY;
+
+    parsec_dtd_unpack_args(
+        this_task, &trans, &m, &n, &alpha, &A, &lda, &X, &incX, &beta, &Y, &incY );
+
+    CORE_zgemv( trans, m, n,
+                alpha, A, lda,
+                       X, incX,
+                beta,  Y, incY );
+
+    (void)context;
+    return PARSEC_HOOK_RETURN_DONE;
+}
+
+void
+INSERT_TASK_zgemv( const RUNTIME_option_t *options,
+                   cham_trans_t trans, int m, int n,
+                   CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                const CHAM_desc_t *X, int Xm, int Xn, int incX,
+                   CHAMELEON_Complex64_t beta,  const CHAM_desc_t *Y, int Ym, int Yn, int incY )
+{
+    parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
+    CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
+
+    parsec_dtd_taskpool_insert_task(
+        PARSEC_dtd_taskpool, CORE_zgemv_parsec, options->priority, "zgemv",
+        sizeof(cham_trans_t),          &trans,       VALUE,
+        sizeof(int),                   &m,           VALUE,
+        sizeof(int),                   &n,           VALUE,
+        sizeof(CHAMELEON_Complex64_t), &alpha,       VALUE,
+        PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT,
+        sizeof(int),                   &(tileA->ld), VALUE,
+        PASSED_BY_REF, RTBLKADDR( X, CHAMELEON_Complex64_t, Xm, Xn ), chameleon_parsec_get_arena_index( X ) | INPUT,
+        sizeof(int),                   &incX,        VALUE,
+        sizeof(CHAMELEON_Complex64_t), &beta,        VALUE,
+        PASSED_BY_REF, RTBLKADDR( Y, CHAMELEON_Complex64_t, Ym, Yn ), chameleon_parsec_get_arena_index( Y ) | INOUT | AFFINITY,
+        sizeof(int),                   &incY,        VALUE,
+        PARSEC_DTD_ARG_END );
+}
diff --git a/runtime/quark/codelets/codelet_zgemv.c b/runtime/quark/codelets/codelet_zgemv.c
new file mode 100644
index 000000000..4dcde9bf5
--- /dev/null
+++ b/runtime/quark/codelets/codelet_zgemv.c
@@ -0,0 +1,66 @@
+/**
+ *
+ * @file quark/codelet_zgemv.c
+ *
+ * @copyright 2009-2014 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zgemv Quark codelet
+ *
+ * @version 1.0.0
+ * @author Mathieu Faverge
+ * @date 2020-10-06
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_quark.h"
+#include "chameleon/tasks_z.h"
+#include "coreblas/coreblas_ztile.h"
+
+void CORE_zgemv_quark(Quark *quark)
+{
+    cham_trans_t trans;
+    int m;
+    int n;
+    CHAMELEON_Complex64_t alpha;
+    CHAM_tile_t *tileA;
+    CHAM_tile_t *tileX;
+    int incX;
+    CHAMELEON_Complex64_t beta;
+    CHAM_tile_t *tileY;
+    int incY;
+
+    quark_unpack_args_10( quark, trans, m, n, alpha, tileA, tileX, incX, beta, tileY, incY );
+    TCORE_zgemv( trans, m, n,
+                 alpha, tileA, tileX, incX,
+                 beta,  tileY, incY );
+}
+
+void INSERT_TASK_zgemv( const RUNTIME_option_t *options,
+                        cham_trans_t trans, int m, int n,
+                        CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                     const CHAM_desc_t *X, int Xm, int Xn, int incX,
+                        CHAMELEON_Complex64_t beta,  const CHAM_desc_t *Y, int Ym, int Yn, int incY )
+{
+    quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    int accessY = ( beta == 0. ) ? OUTPUT : INOUT;
+
+    /* DAG_CORE_GEMV; */
+    QUARK_Insert_Task(
+        opt->quark, CORE_zgemv_quark, (Quark_Task_Flags*)opt,
+        sizeof(cham_trans_t),          &trans, VALUE,
+        sizeof(int),                   &m,     VALUE,
+        sizeof(int),                   &n,     VALUE,
+        sizeof(CHAMELEON_Complex64_t), &alpha, VALUE,
+        sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT,
+        sizeof(void*), RTBLKADDR(X, CHAMELEON_Complex64_t, Xm, Xn), INPUT,
+        sizeof(int),                   &incX,  VALUE,
+        sizeof(CHAMELEON_Complex64_t), &beta,  VALUE,
+        sizeof(void*), RTBLKADDR(Y, CHAMELEON_Complex64_t, Ym, Yn), accessY,
+        sizeof(int),                   &incY,  VALUE,
+        0);
+}
diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c
index ef24f40dc..cb381cfc1 100644
--- a/runtime/starpu/codelets/codelet_zcallback.c
+++ b/runtime/starpu/codelets/codelet_zcallback.c
@@ -30,6 +30,7 @@ CHAMELEON_CL_CB(zaxpy,         cti_handle_get_m(task->handles[0]), cti_handle_ge
 CHAMELEON_CL_CB(zgeadd,        cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0,                                      M*N)
 CHAMELEON_CL_CB(zlascal,       cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0,                                      M*N)
 CHAMELEON_CL_CB(zgelqt,        cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0,                                      (4./3.)*M*N*K)
+CHAMELEON_CL_CB(zgemv,         cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0,                                      2. *M*N  )
 CHAMELEON_CL_CB(zgemm,         cti_handle_get_m(task->handles[2]), cti_handle_get_n(task->handles[2]), cti_handle_get_n(task->handles[0]),     2. *M*N*K) /* If A^t, computation is wrong */
 CHAMELEON_CL_CB(zgeqrt,        cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0,                                      (4./3.)*M*M*N)
 CHAMELEON_CL_CB(zgessm,        cti_handle_get_m(task->handles[2]), cti_handle_get_m(task->handles[2]), cti_handle_get_m(task->handles[2]),     2. *M*N*K)
diff --git a/runtime/starpu/codelets/codelet_zgemv.c b/runtime/starpu/codelets/codelet_zgemv.c
new file mode 100644
index 000000000..62b887247
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_zgemv.c
@@ -0,0 +1,133 @@
+/**
+ *
+ * @file starpu/codelet_zgemv.c
+ *
+ * @copyright 2009-2014 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zgemv StarPU codelet
+ *
+ * @version 1.0.0
+ * @author Mathieu Faverge
+ * @date 2020-10-06
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_starpu.h"
+#include "runtime_codelet_z.h"
+
+#if !defined(CHAMELEON_SIMULATION)
+static void cl_zgemv_cpu_func(void *descr[], void *cl_arg)
+{
+    cham_trans_t trans;
+    int m;
+    int n;
+    CHAMELEON_Complex64_t alpha;
+    CHAM_tile_t *tileA;
+    CHAM_tile_t *tileX;
+    int incX;
+    CHAMELEON_Complex64_t beta;
+    CHAM_tile_t *tileY;
+    int incY;
+
+    tileA = cti_interface_get(descr[0]);
+    tileX = cti_interface_get(descr[1]);
+    tileY = cti_interface_get(descr[2]);
+
+    starpu_codelet_unpack_args(cl_arg, &trans, &m, &n, &alpha, &incX, &beta, &incY );
+    TCORE_zgemv( trans, m, n,
+                 alpha, tileA, tileX, incX,
+                 beta,  tileY, incY );
+}
+
+#if defined(CHAMELEON_USE_CUDA) & 0
+static void cl_zgemv_cuda_func(void *descr[], void *cl_arg)
+{
+    cham_trans_t transA;
+    cham_trans_t transB;
+    int m;
+    int n;
+    int k;
+    cuDoubleComplex alpha;
+    CHAM_tile_t *tileA;
+    CHAM_tile_t *tileB;
+    cuDoubleComplex beta;
+    CHAM_tile_t *tileC;
+
+    tileA = cti_interface_get(descr[0]);
+    tileB = cti_interface_get(descr[1]);
+    tileC = cti_interface_get(descr[2]);
+
+    starpu_codelet_unpack_args(cl_arg, &transA, &transB, &m, &n, &k, &alpha, &beta);
+
+    RUNTIME_getStream( stream );
+
+    CUDA_zgemv(
+        transA, transB,
+        m, n, k,
+        &alpha, tileA->mat, tileA->ld,
+                tileB->mat, tileB->ld,
+        &beta,  tileC->mat, tileC->ld,
+        stream);
+
+#ifndef STARPU_CUDA_ASYNC
+    cudaStreamSynchronize( stream );
+#endif
+
+    return;
+}
+#endif /* defined(CHAMELEON_USE_CUDA) */
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
+/*
+ * Codelet definition
+ */
+CODELETS_CPU(zgemv, cl_zgemv_cpu_func)
+
+/**
+ *
+ * @ingroup INSERT_TASK_Complex64_t
+ *
+ */
+void INSERT_TASK_zgemv( const RUNTIME_option_t *options,
+                        cham_trans_t trans, int m, int n,
+                        CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                     const CHAM_desc_t *X, int Xm, int Xn, int incX,
+                        CHAMELEON_Complex64_t beta,  const CHAM_desc_t *Y, int Ym, int Yn, int incY )
+{
+    struct starpu_codelet *codelet = &cl_zgemv;
+    void (*callback)(void*) = options->profiling ? cl_zgemv_callback : NULL;
+    starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt);
+    int workerid = (schedopt == NULL) ? -1 : schedopt->workerid;
+    int accessY = ( beta == 0. ) ? STARPU_W : STARPU_RW;
+
+    CHAMELEON_BEGIN_ACCESS_DECLARATION;
+    CHAMELEON_ACCESS_R(A, Am, An);
+    CHAMELEON_ACCESS_R(X, Xm, Xn);
+    CHAMELEON_ACCESS_RW(Y, Ym, Yn);
+    CHAMELEON_END_ACCESS_DECLARATION;
+
+    starpu_insert_task(
+        starpu_mpi_codelet(codelet),
+        STARPU_VALUE,    &trans,             sizeof(cham_trans_t),
+        STARPU_VALUE,    &m,                 sizeof(int),
+        STARPU_VALUE,    &n,                 sizeof(int),
+        STARPU_VALUE,    &alpha,             sizeof(CHAMELEON_Complex64_t),
+        STARPU_R,         RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
+        STARPU_R,         RTBLKADDR(X, CHAMELEON_Complex64_t, Xm, Xn),
+        STARPU_VALUE,    &incX,              sizeof(int),
+        STARPU_VALUE,    &beta,              sizeof(CHAMELEON_Complex64_t),
+        accessY,          RTBLKADDR(Y, CHAMELEON_Complex64_t, Ym, Yn),
+        STARPU_VALUE,    &incY,              sizeof(int),
+        STARPU_PRIORITY,  options->priority,
+        STARPU_CALLBACK,  callback,
+        STARPU_EXECUTE_ON_WORKER, workerid,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+        STARPU_NAME, "zgemv",
+#endif
+        0);
+}
diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h
index 95bc484a4..e5ed6704c 100644
--- a/runtime/starpu/include/runtime_codelet_z.h
+++ b/runtime/starpu/include/runtime_codelet_z.h
@@ -39,6 +39,11 @@
  */
 CODELETS_HEADER(zaxpy);
 
+/*
+ * BLAS 2 functions
+ */
+CODELETS_HEADER(zgemv);
+
 /*
  * BLAS 3 functions
  */
-- 
GitLab