From c7306d21e8af71af79d9d4c847ca78e4041ce677 Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Mon, 29 Jun 2020 18:26:27 +0200
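Subject: [PATCH] Reduce the spectrum of the data accesses with Quark

Quark task insertion now avoids useless tasks and data accesses when the
scalars are trivial:
  - alpha == 0: zaxpy inserts no task; zgeadd, zgemm, zhemm, zsymm, zherk,
    zher2k, zsyrk, zsyr2k and ztradd only insert a zlascal on the output
    tile; ztrmm inserts a zlaset instead.
  - zlascal inserts a zlaset when alpha == 0 and nothing when alpha == 1.
  - beta == 0: the output tile is declared OUTPUT instead of INOUT.

Also remove duplicated documentation blocks and the obsolete
"automatically generated" header comments from the Quark and StarPU
codelets.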
---
 runtime/quark/codelets/codelet_zaxpy.c    |  4 ++
 runtime/quark/codelets/codelet_zgeadd.c   | 62 +++------------------
 runtime/quark/codelets/codelet_zgemm.c    | 16 ++++--
 runtime/quark/codelets/codelet_zhe2ge.c   | 16 ++----
 runtime/quark/codelets/codelet_zhemm.c    | 10 +++-
 runtime/quark/codelets/codelet_zher2k.c   | 22 +++++---
 runtime/quark/codelets/codelet_zherk.c    |  9 ++-
 runtime/quark/codelets/codelet_zlascal.c  | 10 +++-
 runtime/quark/codelets/codelet_zsymm.c    |  9 ++-
 runtime/quark/codelets/codelet_zsyr2k.c   | 11 +++-
 runtime/quark/codelets/codelet_zsyrk.c    |  9 ++-
 runtime/quark/codelets/codelet_ztradd.c   | 68 +++--------------------
 runtime/quark/codelets/codelet_ztrmm.c    | 15 +++--
 runtime/starpu/codelets/codelet_zgeadd.c  | 55 ------------------
 runtime/starpu/codelets/codelet_zgemm.c   |  2 -
 runtime/starpu/codelets/codelet_zhe2ge.c  |  5 --
 runtime/starpu/codelets/codelet_zhemm.c   |  2 -
 runtime/starpu/codelets/codelet_zher2k.c  |  2 -
 runtime/starpu/codelets/codelet_zherk.c   |  7 ---
 runtime/starpu/codelets/codelet_zlascal.c |  2 -
 runtime/starpu/codelets/codelet_zsymm.c   |  2 -
 runtime/starpu/codelets/codelet_zsyr2k.c  |  2 -
 runtime/starpu/codelets/codelet_zsyrk.c   |  2 -
 runtime/starpu/codelets/codelet_ztradd.c  | 61 --------------------
 runtime/starpu/codelets/codelet_ztrmm.c   |  2 -
 25 files changed, 110 insertions(+), 295 deletions(-)

diff --git a/runtime/quark/codelets/codelet_zaxpy.c b/runtime/quark/codelets/codelet_zaxpy.c
index 04cc7fad9..bb3357b8c 100644
--- a/runtime/quark/codelets/codelet_zaxpy.c
+++ b/runtime/quark/codelets/codelet_zaxpy.c
@@ -39,6 +39,10 @@ void INSERT_TASK_zaxpy(const RUNTIME_option_t *options,
                       const CHAM_desc_t *A, int Am, int An, int incA,
                       const CHAM_desc_t *B, int Bm, int Bn, int incB)
 {
+    if ( alpha == 0. ) {
+        return;
+    }
+
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
     DAG_CORE_AXPY;
     QUARK_Insert_Task(opt->quark, CORE_zaxpy_quark, (Quark_Task_Flags*)opt,
diff --git a/runtime/quark/codelets/codelet_zgeadd.c b/runtime/quark/codelets/codelet_zgeadd.c
index d95e44381..6fa2cfb2b 100644
--- a/runtime/quark/codelets/codelet_zgeadd.c
+++ b/runtime/quark/codelets/codelet_zgeadd.c
@@ -38,65 +38,19 @@ void CORE_zgeadd_quark(Quark *quark)
     return;
 }
 
-/**
- ******************************************************************************
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- * @brief Adds two general matrices together as in PBLAS pzgeadd.
- *
- *       B <- alpha * op(A)  + beta * B,
- *
- * where op(X) = X, X', or conj(X')
- *
- *******************************************************************************
- *
- * @param[in] trans
- *          Specifies whether the matrix A is non-transposed, transposed, or
- *          conjugate transposed
- *          = ChamNoTrans:   op(A) = A
- *          = ChamTrans:     op(A) = A'
- *          = ChamConjTrans: op(A) = conj(A')
- *
- * @param[in] M
- *          Number of rows of the matrices op(A) and B.
- *
- * @param[in] N
- *          Number of columns of the matrices op(A) and B.
- *
- * @param[in] alpha
- *          Scalar factor of A.
- *
- * @param[in] A
- *          Matrix of size LDA-by-N, if trans = ChamNoTrans, LDA-by-M
- *          otherwise.
- *
- * @param[in] LDA
- *          Leading dimension of the array A. LDA >= max(1,k), with k=M, if
- *          trans = ChamNoTrans, and k=N otherwise.
- *
- * @param[in] beta
- *          Scalar factor of B.
- *
- * @param[in,out] B
- *          Matrix of size LDB-by-N.
- *          On exit, B = alpha * op(A) + beta * B
- *
- * @param[in] LDB
- *          Leading dimension of the array B. LDB >= max(1,M)
- *
- *******************************************************************************
- *
- * @retval CHAMELEON_SUCCESS successful exit
- * @retval <0 if -i, the i-th argument had an illegal value
- *
- */
 void INSERT_TASK_zgeadd( const RUNTIME_option_t *options,
                          cham_trans_t trans, int m, int n, int nb,
                          CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
                          CHAMELEON_Complex64_t beta,  const CHAM_desc_t *B, int Bm, int Bn )
 {
+    if ( alpha == 0. ) { /* only the beta scaling of B remains; A is not accessed */
+        INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, beta, B, Bm, Bn );
+        return; /* plain return: this function is void */
+    }
+
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    int accessB = ( beta == 0. ) ? OUTPUT : INOUT;
+
     DAG_CORE_GEADD;
     QUARK_Insert_Task(opt->quark, CORE_zgeadd_quark, (Quark_Task_Flags*)opt,
         sizeof(int),                 &trans, VALUE,
@@ -105,7 +59,7 @@ void INSERT_TASK_zgeadd( const RUNTIME_option_t *options,
         sizeof(CHAMELEON_Complex64_t),         &alpha, VALUE,
         sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),             INPUT,
         sizeof(CHAMELEON_Complex64_t),         &beta,   VALUE,
-        sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),             INOUT,
+        sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),             accessB,
         0);
 
     (void)nb;
diff --git a/runtime/quark/codelets/codelet_zgemm.c b/runtime/quark/codelets/codelet_zgemm.c
index 6def09b52..9b5137663 100644
--- a/runtime/quark/codelets/codelet_zgemm.c
+++ b/runtime/quark/codelets/codelet_zgemm.c
@@ -41,8 +41,7 @@ void CORE_zgemm_quark(Quark *quark)
     quark_unpack_args_10(quark, transA, transB, m, n, k, alpha, tileA, tileB, beta, tileC);
     TCORE_zgemm( transA, transB,
                  m, n, k,
-                 alpha, tileA,
-                        tileB,
+                 alpha, tileA, tileB,
                  beta,  tileC );
 }
 
@@ -50,10 +49,17 @@ void INSERT_TASK_zgemm(const RUNTIME_option_t *options,
                       cham_trans_t transA, cham_trans_t transB,
                       int m, int n, int k, int nb,
                       CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
-                      const CHAM_desc_t *B, int Bm, int Bn,
-                      CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn)
+                                                   const CHAM_desc_t *B, int Bm, int Bn,
+                      CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn)
 {
+    if ( alpha == 0. ) { /* only the beta scaling of C remains; A and B are not accessed */
+        INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, beta, C, Cm, Cn );
+        return; /* plain return: this function is void */
+    }
+
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    int accessC = ( beta == 0. ) ? OUTPUT : INOUT;
+
     DAG_CORE_GEMM;
     QUARK_Insert_Task(opt->quark, CORE_zgemm_quark, (Quark_Task_Flags*)opt,
                       sizeof(int),                &transA,    VALUE,
@@ -65,6 +71,6 @@ void INSERT_TASK_zgemm(const RUNTIME_option_t *options,
                       sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),                 INPUT,
                       sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),                 INPUT,
                       sizeof(CHAMELEON_Complex64_t),         &beta,      VALUE,
-                      sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),                 INOUT,
+                      sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),                 accessC,
                       0);
 }
diff --git a/runtime/quark/codelets/codelet_zhe2ge.c b/runtime/quark/codelets/codelet_zhe2ge.c
index 7b4a42566..e8aefce45 100644
--- a/runtime/quark/codelets/codelet_zhe2ge.c
+++ b/runtime/quark/codelets/codelet_zhe2ge.c
@@ -21,11 +21,6 @@
 #include "chameleon/tasks_z.h"
 #include "coreblas/coreblas_ztile.h"
 
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- */
 static inline void CORE_zhe2ge_quark(Quark *quark)
 {
     cham_uplo_t uplo;
@@ -38,12 +33,11 @@ static inline void CORE_zhe2ge_quark(Quark *quark)
     TCORE_zhe2ge(uplo, M, N, tileA, tileB);
 }
 
-
-void INSERT_TASK_zhe2ge(const RUNTIME_option_t *options,
-                       cham_uplo_t uplo,
-                       int m, int n, int mb,
-                       const CHAM_desc_t *A, int Am, int An,
-                       const CHAM_desc_t *B, int Bm, int Bn)
+void INSERT_TASK_zhe2ge( const RUNTIME_option_t *options,
+                         cham_uplo_t uplo,
+                         int m, int n, int mb,
+                         const CHAM_desc_t *A, int Am, int An,
+                         const CHAM_desc_t *B, int Bm, int Bn )
 {
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
     DAG_CORE_LACPY;
diff --git a/runtime/quark/codelets/codelet_zhemm.c b/runtime/quark/codelets/codelet_zhemm.c
index 5ab641222..c55fa6901 100644
--- a/runtime/quark/codelets/codelet_zhemm.c
+++ b/runtime/quark/codelets/codelet_zhemm.c
@@ -52,7 +52,14 @@ void INSERT_TASK_zhemm(const RUNTIME_option_t *options,
                       const CHAM_desc_t *B, int Bm, int Bn,
                       CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn)
 {
+    if ( alpha == 0. ) { /* only the beta scaling of C remains; A and B are not accessed */
+        INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, beta, C, Cm, Cn );
+        return; /* plain return: this function is void */
+    }
+
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    int accessC = ( beta == 0. ) ? OUTPUT : INOUT;
+
     DAG_CORE_HEMM;
     QUARK_Insert_Task(opt->quark, CORE_zhemm_quark, (Quark_Task_Flags*)opt,
         sizeof(int),                &side,    VALUE,
@@ -63,7 +70,6 @@ void INSERT_TASK_zhemm(const RUNTIME_option_t *options,
         sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),               INPUT,
         sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),               INPUT,
         sizeof(CHAMELEON_Complex64_t),         &beta,    VALUE,
-        sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),               INOUT,
+        sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),               accessC,
         0);
 }
-
diff --git a/runtime/quark/codelets/codelet_zher2k.c b/runtime/quark/codelets/codelet_zher2k.c
index bd6437c53..05b46cd1f 100644
--- a/runtime/quark/codelets/codelet_zher2k.c
+++ b/runtime/quark/codelets/codelet_zher2k.c
@@ -42,14 +42,22 @@ void CORE_zher2k_quark(Quark *quark)
                 n, k, alpha, tileA, tileB, beta, tileC);
 }
 
-void INSERT_TASK_zher2k(const RUNTIME_option_t *options,
-                       cham_uplo_t uplo, cham_trans_t trans,
-                       int n, int k, int nb,
-                       CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
-                       const CHAM_desc_t *B, int Bm, int Bn,
-                       double beta, const CHAM_desc_t *C, int Cm, int Cn)
+void
+INSERT_TASK_zher2k( const RUNTIME_option_t *options,
+                    cham_uplo_t uplo, cham_trans_t trans,
+                    int n, int k, int nb,
+                    CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                 const CHAM_desc_t *B, int Bm, int Bn,
+                    double beta,                 const CHAM_desc_t *C, int Cm, int Cn )
 {
+    if ( alpha == 0. ) { /* only the beta scaling of the uplo part of C remains */
+        INSERT_TASK_zlascal( options, uplo, n, n, nb, beta, C, Cm, Cn );
+        return; /* plain return: this function is void */
+    }
+
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    int accessC = ( beta == 0. ) ? OUTPUT : INOUT;
+
     DAG_CORE_HER2K;
     QUARK_Insert_Task(opt->quark, CORE_zher2k_quark, (Quark_Task_Flags*)opt,
         sizeof(int),                &uplo,      VALUE,
@@ -60,6 +68,6 @@ void INSERT_TASK_zher2k(const RUNTIME_option_t *options,
         sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),                 INPUT,
         sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),                 INPUT,
         sizeof(double),                     &beta,      VALUE,
-        sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),                 INOUT,
+        sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),                 accessC,
         0);
 }
diff --git a/runtime/quark/codelets/codelet_zherk.c b/runtime/quark/codelets/codelet_zherk.c
index 3d47a8e59..7d11dfb52 100644
--- a/runtime/quark/codelets/codelet_zherk.c
+++ b/runtime/quark/codelets/codelet_zherk.c
@@ -49,7 +49,14 @@ void INSERT_TASK_zherk(const RUNTIME_option_t *options,
                       double alpha, const CHAM_desc_t *A, int Am, int An,
                       double beta, const CHAM_desc_t *C, int Cm, int Cn)
 {
+    if ( alpha == 0. ) { /* only the beta scaling of the uplo part of C remains */
+        INSERT_TASK_zlascal( options, uplo, n, n, nb, beta, C, Cm, Cn );
+        return; /* plain return: this function is void */
+    }
+
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    int accessC = ( beta == 0. ) ? OUTPUT : INOUT;
+
     DAG_CORE_HERK;
     QUARK_Insert_Task(opt->quark, CORE_zherk_quark, (Quark_Task_Flags*)opt,
         sizeof(int),                &uplo,      VALUE,
@@ -59,6 +66,6 @@ void INSERT_TASK_zherk(const RUNTIME_option_t *options,
         sizeof(double),                     &alpha,     VALUE,
         sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),                 INPUT,
         sizeof(double),                     &beta,      VALUE,
-        sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),                 INOUT,
+        sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),                 accessC,
         0);
 }
diff --git a/runtime/quark/codelets/codelet_zlascal.c b/runtime/quark/codelets/codelet_zlascal.c
index 716c85c6b..67cdcb149 100644
--- a/runtime/quark/codelets/codelet_zlascal.c
+++ b/runtime/quark/codelets/codelet_zlascal.c
@@ -43,6 +43,14 @@ void INSERT_TASK_zlascal(const RUNTIME_option_t *options,
                         CHAMELEON_Complex64_t alpha,
                         const CHAM_desc_t *A, int Am, int An)
 {
+    if ( alpha == 0. ) { /* delegate to zlaset, passing alpha (== 0) for both scalars */
+        INSERT_TASK_zlaset( options, uplo, m, n, alpha, alpha, A, Am, An );
+        return; /* plain return: this function is void */
+    }
+    else if ( alpha == 1. ) {
+        return;
+    }
+
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
     DAG_CORE_LASCAL;
     QUARK_Insert_Task(opt->quark, CORE_zlascal_quark, (Quark_Task_Flags*)opt,
@@ -53,5 +61,3 @@ void INSERT_TASK_zlascal(const RUNTIME_option_t *options,
         sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT,
         0);
 }
-
-
diff --git a/runtime/quark/codelets/codelet_zsymm.c b/runtime/quark/codelets/codelet_zsymm.c
index 6bccc1dee..71658b68c 100644
--- a/runtime/quark/codelets/codelet_zsymm.c
+++ b/runtime/quark/codelets/codelet_zsymm.c
@@ -52,7 +52,14 @@ void INSERT_TASK_zsymm(const RUNTIME_option_t *options,
                       const CHAM_desc_t *B, int Bm, int Bn,
                       CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn)
 {
+    if ( alpha == 0. ) { /* only the beta scaling of C remains; A and B are not accessed */
+        INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, beta, C, Cm, Cn );
+        return; /* plain return: this function is void */
+    }
+
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    int accessC = ( beta == 0. ) ? OUTPUT : INOUT;
+
     DAG_CORE_SYMM;
     QUARK_Insert_Task(opt->quark, CORE_zsymm_quark, (Quark_Task_Flags*)opt,
         sizeof(int),                &side,    VALUE,
@@ -63,6 +70,6 @@ void INSERT_TASK_zsymm(const RUNTIME_option_t *options,
         sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),               INPUT,
         sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),               INPUT,
         sizeof(CHAMELEON_Complex64_t),         &beta,    VALUE,
-        sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),               INOUT,
+        sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),               accessC,
         0);
 }
diff --git a/runtime/quark/codelets/codelet_zsyr2k.c b/runtime/quark/codelets/codelet_zsyr2k.c
index 0e41e44fa..d172bc359 100644
--- a/runtime/quark/codelets/codelet_zsyr2k.c
+++ b/runtime/quark/codelets/codelet_zsyr2k.c
@@ -39,7 +39,7 @@ void CORE_zsyr2k_quark(Quark *quark)
 
     quark_unpack_args_9(quark, uplo, trans, n, k, alpha, tileA, tileB, beta, tileC);
     TCORE_zsyr2k(uplo, trans,
-                n, k, alpha, tileA, tileB, beta, tileC);
+                 n, k, alpha, tileA, tileB, beta, tileC);
 }
 
 void INSERT_TASK_zsyr2k(const RUNTIME_option_t *options,
@@ -49,7 +49,14 @@ void INSERT_TASK_zsyr2k(const RUNTIME_option_t *options,
                        const CHAM_desc_t *B, int Bm, int Bn,
                        CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn)
 {
+    if ( alpha == 0. ) { /* only the beta scaling of the uplo part of C remains */
+        INSERT_TASK_zlascal( options, uplo, n, n, nb, beta, C, Cm, Cn );
+        return; /* plain return: this function is void */
+    }
+
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    int accessC = ( beta == 0. ) ? OUTPUT : INOUT;
+
     DAG_CORE_SYR2K;
     QUARK_Insert_Task(opt->quark, CORE_zsyr2k_quark, (Quark_Task_Flags*)opt,
         sizeof(int),                &uplo,      VALUE,
@@ -60,6 +67,6 @@ void INSERT_TASK_zsyr2k(const RUNTIME_option_t *options,
         sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),                 INPUT,
         sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),                 INPUT,
         sizeof(CHAMELEON_Complex64_t),         &beta,      VALUE,
-        sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),                 INOUT,
+        sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),                 accessC,
         0);
 }
diff --git a/runtime/quark/codelets/codelet_zsyrk.c b/runtime/quark/codelets/codelet_zsyrk.c
index d8c272f50..b58c022d3 100644
--- a/runtime/quark/codelets/codelet_zsyrk.c
+++ b/runtime/quark/codelets/codelet_zsyrk.c
@@ -49,7 +49,14 @@ void INSERT_TASK_zsyrk(const RUNTIME_option_t *options,
                       CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
                       CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn)
 {
+    if ( alpha == 0. ) { /* only the beta scaling of the uplo part of C remains */
+        INSERT_TASK_zlascal( options, uplo, n, n, nb, beta, C, Cm, Cn );
+        return; /* plain return: this function is void */
+    }
+
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    int accessC = ( beta == 0. ) ? OUTPUT : INOUT;
+
     DAG_CORE_SYRK;
     QUARK_Insert_Task(opt->quark, CORE_zsyrk_quark, (Quark_Task_Flags*)opt,
         sizeof(int),                &uplo,      VALUE,
@@ -59,6 +66,6 @@ void INSERT_TASK_zsyrk(const RUNTIME_option_t *options,
         sizeof(CHAMELEON_Complex64_t),         &alpha,     VALUE,
         sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),                 INPUT,
         sizeof(CHAMELEON_Complex64_t),         &beta,      VALUE,
-        sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),                 INOUT,
+        sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),                 accessC,
         0);
 }
diff --git a/runtime/quark/codelets/codelet_ztradd.c b/runtime/quark/codelets/codelet_ztradd.c
index f3a9e0d24..6532b1600 100644
--- a/runtime/quark/codelets/codelet_ztradd.c
+++ b/runtime/quark/codelets/codelet_ztradd.c
@@ -37,71 +37,19 @@ void CORE_ztradd_quark(Quark *quark)
     return;
 }
 
-/**
- ******************************************************************************
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- * @brief Adds two trapezoidal matrices together as in PBLAS pzgeadd.
- *
- *       B <- alpha * op(A)  + beta * B,
- *
- * where op(X) = X, X', or conj(X')
- *
- *******************************************************************************
- *
- * @param[in] uplo
- *          Specifies the shape of A and B matrices:
- *          = ChamUpperLower: A and B are general matrices.
- *          = ChamUpper: op(A) and B are upper trapezoidal matrices.
- *          = ChamLower: op(A) and B are lower trapezoidal matrices.
- *
- * @param[in] trans
- *          Specifies whether the matrix A is non-transposed, transposed, or
- *          conjugate transposed
- *          = ChamNoTrans:   op(A) = A
- *          = ChamTrans:     op(A) = A'
- *          = ChamConjTrans: op(A) = conj(A')
- *
- * @param[in] M
- *          Number of rows of the matrices op(A) and B.
- *
- * @param[in] N
- *          Number of columns of the matrices op(A) and B.
- *
- * @param[in] alpha
- *          Scalar factor of A.
- *
- * @param[in] A
- *          Matrix of size LDA-by-N, if trans = ChamNoTrans, LDA-by-M
- *          otherwise.
- *
- * @param[in] LDA
- *          Leading dimension of the array A. LDA >= max(1,k), with k=M, if
- *          trans = ChamNoTrans, and k=N otherwise.
- *
- * @param[in] beta
- *          Scalar factor of B.
- *
- * @param[in,out] B
- *          Matrix of size LDB-by-N.
- *          On exit, B = alpha * op(A) + beta * B
- *
- * @param[in] LDB
- *          Leading dimension of the array B. LDB >= max(1,M)
- *
- *******************************************************************************
- *
- * @retval CHAMELEON_SUCCESS successful exit
- * @retval <0 if -i, the i-th argument had an illegal value
- *
- */
 void INSERT_TASK_ztradd( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, cham_trans_t trans, int m, int n, int nb,
                          CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
                          CHAMELEON_Complex64_t beta,  const CHAM_desc_t *B, int Bm, int Bn )
 {
+    if ( alpha == 0. ) { /* only the beta scaling of the uplo part of B remains */
+        INSERT_TASK_zlascal( options, uplo, m, n, nb, beta, B, Bm, Bn );
+        return; /* plain return: this function is void */
+    }
+
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    int accessB = ( beta == 0. ) ? OUTPUT : INOUT;
+
     DAG_CORE_GEADD;
     QUARK_Insert_Task(opt->quark, CORE_ztradd_quark, (Quark_Task_Flags*)opt,
         sizeof(int),                 &uplo,  VALUE,
@@ -111,7 +59,7 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options,
         sizeof(CHAMELEON_Complex64_t),         &alpha, VALUE,
         sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),             INPUT,
         sizeof(CHAMELEON_Complex64_t),         &beta,   VALUE,
-        sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),             INOUT,
+        sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),             accessB,
         0);
 
     (void)nb;
diff --git a/runtime/quark/codelets/codelet_ztrmm.c b/runtime/quark/codelets/codelet_ztrmm.c
index 56d6afada..df18b77be 100644
--- a/runtime/quark/codelets/codelet_ztrmm.c
+++ b/runtime/quark/codelets/codelet_ztrmm.c
@@ -45,12 +45,17 @@ void CORE_ztrmm_quark(Quark *quark)
         tileB);
 }
 
-void INSERT_TASK_ztrmm(const RUNTIME_option_t *options,
-                      cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag,
-                      int m, int n, int nb,
-                      CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
-                      const CHAM_desc_t *B, int Bm, int Bn)
+void INSERT_TASK_ztrmm( const RUNTIME_option_t *options,
+                        cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag,
+                        int m, int n, int nb,
+                        CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                        const CHAM_desc_t *B, int Bm, int Bn )
 {
+    if ( alpha == 0. ) { /* delegate to zlaset on B with alpha (== 0); A is not accessed */
+        INSERT_TASK_zlaset( options, ChamUpperLower, m, n, alpha, alpha, B, Bm, Bn );
+        return; /* plain return: this function is void */
+    }
+
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
     DAG_CORE_TRMM;
     QUARK_Insert_Task(opt->quark, CORE_ztrmm_quark, (Quark_Task_Flags*)opt,
diff --git a/runtime/starpu/codelets/codelet_zgeadd.c b/runtime/starpu/codelets/codelet_zgeadd.c
index bd027eff0..65c82231e 100644
--- a/runtime/starpu/codelets/codelet_zgeadd.c
+++ b/runtime/starpu/codelets/codelet_zgeadd.c
@@ -12,8 +12,6 @@
  * @brief Chameleon zgeadd StarPU codelet
  *
  * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 0.9.2
  * @author Mathieu Faverge
  * @author Emmanuel Agullo
  * @author Cedric Castagnede
@@ -86,59 +84,6 @@ CODELETS(zgeadd, cl_zgeadd_cpu_func, cl_zgeadd_cuda_func, STARPU_CUDA_ASYNC)
 CODELETS_CPU(zgeadd, cl_zgeadd_cpu_func)
 #endif
 
-/**
- ******************************************************************************
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- * @brief Adds two general matrices together as in PBLAS pzgeadd.
- *
- *       B <- alpha * op(A)  + beta * B,
- *
- * where op(X) = X, X', or conj(X')
- *
- *******************************************************************************
- *
- * @param[in] trans
- *          Specifies whether the matrix A is non-transposed, transposed, or
- *          conjugate transposed
- *          = ChamNoTrans:   op(A) = A
- *          = ChamTrans:     op(A) = A'
- *          = ChamConjTrans: op(A) = conj(A')
- *
- * @param[in] M
- *          Number of rows of the matrices op(A) and B.
- *
- * @param[in] N
- *          Number of columns of the matrices op(A) and B.
- *
- * @param[in] alpha
- *          Scalar factor of A.
- *
- * @param[in] A
- *          Matrix of size ldA-by-N, if trans = ChamNoTrans, ldA-by-M
- *          otherwise.
- *
- * @param[in] ldA
- *          Leading dimension of the array A. ldA >= max(1,k), with k=M, if
- *          trans = ChamNoTrans, and k=N otherwise.
- *
- * @param[in] beta
- *          Scalar factor of B.
- *
- * @param[in,out] B
- *          Matrix of size ldB-by-N.
- *          On exit, B = alpha * op(A) + beta * B
- *
- * @param[in] ldB
- *          Leading dimension of the array B. ldB >= max(1,M)
- *
- *******************************************************************************
- *
- * @retval CHAMELEON_SUCCESS successful exit
- * @retval <0 if -i, the i-th argument had an illegal value
- *
- */
 void INSERT_TASK_zgeadd( const RUNTIME_option_t *options,
                          cham_trans_t trans, int m, int n, int nb,
                          CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c
index 54e6256b5..42bd6609a 100644
--- a/runtime/starpu/codelets/codelet_zgemm.c
+++ b/runtime/starpu/codelets/codelet_zgemm.c
@@ -12,8 +12,6 @@
  * @brief Chameleon zgemm StarPU codelet
  *
  * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 0.9.2
  * @author Hatem Ltaief
  * @author Jakub Kurzak
  * @author Mathieu Faverge
diff --git a/runtime/starpu/codelets/codelet_zhe2ge.c b/runtime/starpu/codelets/codelet_zhe2ge.c
index fe1f9eb29..203544170 100644
--- a/runtime/starpu/codelets/codelet_zhe2ge.c
+++ b/runtime/starpu/codelets/codelet_zhe2ge.c
@@ -44,11 +44,6 @@ static void cl_zhe2ge_cpu_func(void *descr[], void *cl_arg)
  */
 CODELETS_CPU(zhe2ge, cl_zhe2ge_cpu_func)
 
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- */
 void INSERT_TASK_zhe2ge( const RUNTIME_option_t *options,
                          cham_uplo_t uplo,
                          int m, int n, int mb,
diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c
index 5c90271ec..f4963cacf 100644
--- a/runtime/starpu/codelets/codelet_zhemm.c
+++ b/runtime/starpu/codelets/codelet_zhemm.c
@@ -12,8 +12,6 @@
  * @brief Chameleon zhemm StarPU codelet
  *
  * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 0.9.2
  * @author Hatem Ltaief
  * @author Jakub Kurzak
  * @author Mathieu Faverge
diff --git a/runtime/starpu/codelets/codelet_zher2k.c b/runtime/starpu/codelets/codelet_zher2k.c
index 0e93a35c9..e652db505 100644
--- a/runtime/starpu/codelets/codelet_zher2k.c
+++ b/runtime/starpu/codelets/codelet_zher2k.c
@@ -12,8 +12,6 @@
  * @brief Chameleon zher2k StarPU codelet
  *
  * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 0.9.2
  * @author Hatem Ltaief
  * @author Jakub Kurzak
  * @author Mathieu Faverge
diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c
index 915cc9b77..ec0f985b5 100644
--- a/runtime/starpu/codelets/codelet_zherk.c
+++ b/runtime/starpu/codelets/codelet_zherk.c
@@ -12,8 +12,6 @@
  * @brief Chameleon zherk StarPU codelet
  *
  * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 0.9.2
  * @author Hatem Ltaief
  * @author Jakub Kurzak
  * @author Mathieu Faverge
@@ -88,11 +86,6 @@ static void cl_zherk_cuda_func(void *descr[], void *cl_arg)
  */
 CODELETS(zherk, cl_zherk_cpu_func, cl_zherk_cuda_func, STARPU_CUDA_ASYNC)
 
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- */
 void INSERT_TASK_zherk(const RUNTIME_option_t *options,
                       cham_uplo_t uplo, cham_trans_t trans,
                       int n, int k, int nb,
diff --git a/runtime/starpu/codelets/codelet_zlascal.c b/runtime/starpu/codelets/codelet_zlascal.c
index d1bfc3fd3..0142c39ec 100644
--- a/runtime/starpu/codelets/codelet_zlascal.c
+++ b/runtime/starpu/codelets/codelet_zlascal.c
@@ -12,8 +12,6 @@
  * @brief Chameleon zlascal StarPU codelet
  *
  * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 0.9.2
  * @author Dalal Sukkari
  * @author Lucas Barros de Assis
  * @date 2020-03-03
diff --git a/runtime/starpu/codelets/codelet_zsymm.c b/runtime/starpu/codelets/codelet_zsymm.c
index 40ed44bcb..b87b3bef6 100644
--- a/runtime/starpu/codelets/codelet_zsymm.c
+++ b/runtime/starpu/codelets/codelet_zsymm.c
@@ -12,8 +12,6 @@
  * @brief Chameleon zsymm StarPU codelet
  *
  * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 0.9.2
  * @author Hatem Ltaief
  * @author Jakub Kurzak
  * @author Mathieu Faverge
diff --git a/runtime/starpu/codelets/codelet_zsyr2k.c b/runtime/starpu/codelets/codelet_zsyr2k.c
index 51f013036..822094558 100644
--- a/runtime/starpu/codelets/codelet_zsyr2k.c
+++ b/runtime/starpu/codelets/codelet_zsyr2k.c
@@ -12,8 +12,6 @@
  * @brief Chameleon zsyr2k StarPU codelet
  *
  * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 0.9.2
  * @author Hatem Ltaief
  * @author Jakub Kurzak
  * @author Mathieu Faverge
diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c
index 83c51f599..9795d0a98 100644
--- a/runtime/starpu/codelets/codelet_zsyrk.c
+++ b/runtime/starpu/codelets/codelet_zsyrk.c
@@ -12,8 +12,6 @@
  * @brief Chameleon zsyrk StarPU codelet
  *
  * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 0.9.2
  * @author Hatem Ltaief
  * @author Jakub Kurzak
  * @author Mathieu Faverge
diff --git a/runtime/starpu/codelets/codelet_ztradd.c b/runtime/starpu/codelets/codelet_ztradd.c
index ac3dc8bfa..fbd6a0f8e 100644
--- a/runtime/starpu/codelets/codelet_ztradd.c
+++ b/runtime/starpu/codelets/codelet_ztradd.c
@@ -12,8 +12,6 @@
  * @brief Chameleon ztradd StarPU codelet
  *
  * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 0.9.2
  * @author Mathieu Faverge
  * @author Lucas Barros de Assis
  * @date 2020-03-03
@@ -48,65 +46,6 @@ static void cl_ztradd_cpu_func(void *descr[], void *cl_arg)
  */
 CODELETS_CPU(ztradd, cl_ztradd_cpu_func)
 
-/**
- ******************************************************************************
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- * @brief Adds two trapezoidal matrices together as in PBLAS pzgeadd.
- *
- *       B <- alpha * op(A)  + beta * B,
- *
- * where op(X) = X, X', or conj(X')
- *
- *******************************************************************************
- *
- * @param[in] uplo
- *          Specifies the shape of A and B matrices:
- *          = ChamUpperLower: A and B are general matrices.
- *          = ChamUpper: op(A) and B are upper trapezoidal matrices.
- *          = ChamLower: op(A) and B are lower trapezoidal matrices.
- *
- * @param[in] trans
- *          Specifies whether the matrix A is non-transposed, transposed, or
- *          conjugate transposed
- *          = ChamNoTrans:   op(A) = A
- *          = ChamTrans:     op(A) = A'
- *          = ChamConjTrans: op(A) = conj(A')
- *
- * @param[in] M
- *          Number of rows of the matrices op(A) and B.
- *
- * @param[in] N
- *          Number of columns of the matrices op(A) and B.
- *
- * @param[in] alpha
- *          Scalar factor of A.
- *
- * @param[in] A
- *          Matrix of size ldA-by-N, if trans = ChamNoTrans, ldA-by-M
- *          otherwise.
- *
- * @param[in] ldA
- *          Leading dimension of the array A. ldA >= max(1,k), with k=M, if
- *          trans = ChamNoTrans, and k=N otherwise.
- *
- * @param[in] beta
- *          Scalar factor of B.
- *
- * @param[in,out] B
- *          Matrix of size ldB-by-N.
- *          On exit, B = alpha * op(A) + beta * B
- *
- * @param[in] ldB
- *          Leading dimension of the array B. ldB >= max(1,M)
- *
- *******************************************************************************
- *
- *          @retval CHAMELEON_SUCCESS successful exit
- *          @retval <0 if -i, the i-th argument had an illegal value
- *
- */
 void INSERT_TASK_ztradd( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, cham_trans_t trans, int m, int n, int nb,
                          CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c
index d1404ba96..e820a6d6b 100644
--- a/runtime/starpu/codelets/codelet_ztrmm.c
+++ b/runtime/starpu/codelets/codelet_ztrmm.c
@@ -12,8 +12,6 @@
  * @brief Chameleon ztrmm StarPU codelet
  *
  * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 0.9.2
  * @author Julien Langou
  * @author Henricus Bouwmeester
  * @author Mathieu Faverge
-- 
GitLab