From c7306d21e8af71af79d9d4c847ca78e4041ce677 Mon Sep 17 00:00:00 2001 From: Mathieu Faverge <mathieu.faverge@inria.fr> Date: Mon, 29 Jun 2020 18:26:27 +0200 Subject: [PATCH] Reduce the spectrum of the data accesses with Quark --- runtime/quark/codelets/codelet_zaxpy.c | 4 ++ runtime/quark/codelets/codelet_zgeadd.c | 62 +++------------------ runtime/quark/codelets/codelet_zgemm.c | 16 ++++-- runtime/quark/codelets/codelet_zhe2ge.c | 16 ++---- runtime/quark/codelets/codelet_zhemm.c | 10 +++- runtime/quark/codelets/codelet_zher2k.c | 22 +++++--- runtime/quark/codelets/codelet_zherk.c | 9 ++- runtime/quark/codelets/codelet_zlascal.c | 10 +++- runtime/quark/codelets/codelet_zsymm.c | 9 ++- runtime/quark/codelets/codelet_zsyr2k.c | 11 +++- runtime/quark/codelets/codelet_zsyrk.c | 9 ++- runtime/quark/codelets/codelet_ztradd.c | 68 +++-------------------- runtime/quark/codelets/codelet_ztrmm.c | 15 +++-- runtime/starpu/codelets/codelet_zgeadd.c | 55 ------------------ runtime/starpu/codelets/codelet_zgemm.c | 2 - runtime/starpu/codelets/codelet_zhe2ge.c | 5 -- runtime/starpu/codelets/codelet_zhemm.c | 2 - runtime/starpu/codelets/codelet_zher2k.c | 2 - runtime/starpu/codelets/codelet_zherk.c | 7 --- runtime/starpu/codelets/codelet_zlascal.c | 2 - runtime/starpu/codelets/codelet_zsymm.c | 2 - runtime/starpu/codelets/codelet_zsyr2k.c | 2 - runtime/starpu/codelets/codelet_zsyrk.c | 2 - runtime/starpu/codelets/codelet_ztradd.c | 61 -------------------- runtime/starpu/codelets/codelet_ztrmm.c | 2 - 25 files changed, 110 insertions(+), 295 deletions(-) diff --git a/runtime/quark/codelets/codelet_zaxpy.c b/runtime/quark/codelets/codelet_zaxpy.c index 04cc7fad9..bb3357b8c 100644 --- a/runtime/quark/codelets/codelet_zaxpy.c +++ b/runtime/quark/codelets/codelet_zaxpy.c @@ -39,6 +39,10 @@ void INSERT_TASK_zaxpy(const RUNTIME_option_t *options, const CHAM_desc_t *A, int Am, int An, int incA, const CHAM_desc_t *B, int Bm, int Bn, int incB) { + if ( alpha == 0. ) { + return; + } + quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_AXPY; QUARK_Insert_Task(opt->quark, CORE_zaxpy_quark, (Quark_Task_Flags*)opt, diff --git a/runtime/quark/codelets/codelet_zgeadd.c b/runtime/quark/codelets/codelet_zgeadd.c index d95e44381..6fa2cfb2b 100644 --- a/runtime/quark/codelets/codelet_zgeadd.c +++ b/runtime/quark/codelets/codelet_zgeadd.c @@ -38,65 +38,19 @@ void CORE_zgeadd_quark(Quark *quark) return; } -/** - ****************************************************************************** - * - * @ingroup INSERT_TASK_Complex64_t - * - * @brief Adds two general matrices together as in PBLAS pzgeadd. - * - * B <- alpha * op(A) + beta * B, - * - * where op(X) = X, X', or conj(X') - * - ******************************************************************************* - * - * @param[in] trans - * Specifies whether the matrix A is non-transposed, transposed, or - * conjugate transposed - * = ChamNoTrans: op(A) = A - * = ChamTrans: op(A) = A' - * = ChamConjTrans: op(A) = conj(A') - * - * @param[in] M - * Number of rows of the matrices op(A) and B. - * - * @param[in] N - * Number of columns of the matrices op(A) and B. - * - * @param[in] alpha - * Scalar factor of A. - * - * @param[in] A - * Matrix of size LDA-by-N, if trans = ChamNoTrans, LDA-by-M - * otherwise. - * - * @param[in] LDA - * Leading dimension of the array A. LDA >= max(1,k), with k=M, if - * trans = ChamNoTrans, and k=N otherwise. - * - * @param[in] beta - * Scalar factor of B. - * - * @param[in,out] B - * Matrix of size LDB-by-N. - * On exit, B = alpha * op(A) + beta * B - * - * @param[in] LDB - * Leading dimension of the array B. LDB >= max(1,M) - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if -i, the i-th argument had an illegal value - * - */ void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, cham_trans_t trans, int m, int n, int nb, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn ) { + if ( alpha == 0. ) { + return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, + beta, B, Bm, Bn ); + } + quark_option_t *opt = (quark_option_t*)(options->schedopt); + int accessB = ( beta == 0. ) ? OUTPUT : INOUT; + DAG_CORE_GEADD; QUARK_Insert_Task(opt->quark, CORE_zgeadd_quark, (Quark_Task_Flags*)opt, sizeof(int), &trans, VALUE, @@ -105,7 +59,7 @@ void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, - sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INOUT, + sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), accessB, 0); (void)nb; diff --git a/runtime/quark/codelets/codelet_zgemm.c b/runtime/quark/codelets/codelet_zgemm.c index 6def09b52..9b5137663 100644 --- a/runtime/quark/codelets/codelet_zgemm.c +++ b/runtime/quark/codelets/codelet_zgemm.c @@ -41,8 +41,7 @@ void CORE_zgemm_quark(Quark *quark) quark_unpack_args_10(quark, transA, transB, m, n, k, alpha, tileA, tileB, beta, tileC); TCORE_zgemm( transA, transB, m, n, k, - alpha, tileA, - tileB, + alpha, tileA, tileB, beta, tileC ); } @@ -50,10 +49,17 @@ void INSERT_TASK_zgemm(const RUNTIME_option_t *options, cham_trans_t transA, cham_trans_t transB, int m, int n, int k, int nb, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) { + if ( alpha == 0. ) { + return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, + beta, C, Cm, Cn ); + } + quark_option_t *opt = (quark_option_t*)(options->schedopt); + int accessC = ( beta == 0. ) ? OUTPUT : INOUT; + DAG_CORE_GEMM; QUARK_Insert_Task(opt->quark, CORE_zgemm_quark, (Quark_Task_Flags*)opt, sizeof(int), &transA, VALUE, @@ -65,6 +71,6 @@ void INSERT_TASK_zgemm(const RUNTIME_option_t *options, sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INPUT, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, - sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, + sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), accessC, 0); } diff --git a/runtime/quark/codelets/codelet_zhe2ge.c b/runtime/quark/codelets/codelet_zhe2ge.c index 7b4a42566..e8aefce45 100644 --- a/runtime/quark/codelets/codelet_zhe2ge.c +++ b/runtime/quark/codelets/codelet_zhe2ge.c @@ -21,11 +21,6 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_ztile.h" -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ static inline void CORE_zhe2ge_quark(Quark *quark) { cham_uplo_t uplo; @@ -38,12 +33,11 @@ static inline void CORE_zhe2ge_quark(Quark *quark) TCORE_zhe2ge(uplo, M, N, tileA, tileB); } - -void INSERT_TASK_zhe2ge(const RUNTIME_option_t *options, - cham_uplo_t uplo, - int m, int n, int mb, - const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn) +void INSERT_TASK_zhe2ge( const RUNTIME_option_t *options, + cham_uplo_t uplo, + int m, int n, int mb, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_LACPY; diff --git a/runtime/quark/codelets/codelet_zhemm.c b/runtime/quark/codelets/codelet_zhemm.c index 5ab641222..c55fa6901 100644 --- a/runtime/quark/codelets/codelet_zhemm.c +++ b/runtime/quark/codelets/codelet_zhemm.c @@ -52,7 +52,14 @@ void INSERT_TASK_zhemm(const RUNTIME_option_t *options, const CHAM_desc_t *B, int Bm, int Bn, CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) { + if ( alpha == 0. ) { + return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, + beta, C, Cm, Cn ); + } + quark_option_t *opt = (quark_option_t*)(options->schedopt); + int accessC = ( beta == 0. ) ? OUTPUT : INOUT; + DAG_CORE_HEMM; QUARK_Insert_Task(opt->quark, CORE_zhemm_quark, (Quark_Task_Flags*)opt, sizeof(int), &side, VALUE, @@ -63,7 +70,6 @@ void INSERT_TASK_zhemm(const RUNTIME_option_t *options, sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INPUT, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, - sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, + sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), accessC, 0); } - diff --git a/runtime/quark/codelets/codelet_zher2k.c b/runtime/quark/codelets/codelet_zher2k.c index bd6437c53..05b46cd1f 100644 --- a/runtime/quark/codelets/codelet_zher2k.c +++ b/runtime/quark/codelets/codelet_zher2k.c @@ -42,14 +42,22 @@ void CORE_zher2k_quark(Quark *quark) n, k, alpha, tileA, tileB, beta, tileC); } -void INSERT_TASK_zher2k(const RUNTIME_option_t *options, - cham_uplo_t uplo, cham_trans_t trans, - int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn, - double beta, const CHAM_desc_t *C, int Cm, int Cn) +void +INSERT_TASK_zher2k( const RUNTIME_option_t *options, + cham_uplo_t uplo, cham_trans_t trans, + int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + double beta, const CHAM_desc_t *C, int Cm, int Cn ) { + if ( alpha == 0. ) { + return INSERT_TASK_zlascal( options, uplo, n, n, nb, + beta, C, Cm, Cn ); + } + quark_option_t *opt = (quark_option_t*)(options->schedopt); + int accessC = ( beta == 0. ) ? OUTPUT : INOUT; + DAG_CORE_HER2K; QUARK_Insert_Task(opt->quark, CORE_zher2k_quark, (Quark_Task_Flags*)opt, sizeof(int), &uplo, VALUE, @@ -60,6 +68,6 @@ void INSERT_TASK_zher2k(const RUNTIME_option_t *options, sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INPUT, sizeof(double), &beta, VALUE, - sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, + sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), accessC, 0); } diff --git a/runtime/quark/codelets/codelet_zherk.c b/runtime/quark/codelets/codelet_zherk.c index 3d47a8e59..7d11dfb52 100644 --- a/runtime/quark/codelets/codelet_zherk.c +++ b/runtime/quark/codelets/codelet_zherk.c @@ -49,7 +49,14 @@ void INSERT_TASK_zherk(const RUNTIME_option_t *options, double alpha, const CHAM_desc_t *A, int Am, int An, double beta, const CHAM_desc_t *C, int Cm, int Cn) { + if ( alpha == 0. ) { + return INSERT_TASK_zlascal( options, uplo, n, n, nb, + beta, C, Cm, Cn ); + } + quark_option_t *opt = (quark_option_t*)(options->schedopt); + int accessC = ( beta == 0. ) ? OUTPUT : INOUT; + DAG_CORE_HERK; QUARK_Insert_Task(opt->quark, CORE_zherk_quark, (Quark_Task_Flags*)opt, sizeof(int), &uplo, VALUE, @@ -59,6 +66,6 @@ void INSERT_TASK_zherk(const RUNTIME_option_t *options, sizeof(double), &alpha, VALUE, sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(double), &beta, VALUE, - sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, + sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), accessC, 0); } diff --git a/runtime/quark/codelets/codelet_zlascal.c b/runtime/quark/codelets/codelet_zlascal.c index 716c85c6b..67cdcb149 100644 --- a/runtime/quark/codelets/codelet_zlascal.c +++ b/runtime/quark/codelets/codelet_zlascal.c @@ -43,6 +43,14 @@ void INSERT_TASK_zlascal(const RUNTIME_option_t *options, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An) { + if ( alpha == 0. ) { + return INSERT_TASK_zlaset( options, uplo, m, n, + alpha, alpha, A, Am, An ); + } + else if ( alpha == 1. ) { + return; + } + quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_LASCAL; QUARK_Insert_Task(opt->quark, CORE_zlascal_quark, (Quark_Task_Flags*)opt, @@ -53,5 +61,3 @@ void INSERT_TASK_zlascal(const RUNTIME_option_t *options, sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT, 0); } - - diff --git a/runtime/quark/codelets/codelet_zsymm.c b/runtime/quark/codelets/codelet_zsymm.c index 6bccc1dee..71658b68c 100644 --- a/runtime/quark/codelets/codelet_zsymm.c +++ b/runtime/quark/codelets/codelet_zsymm.c @@ -52,7 +52,14 @@ void INSERT_TASK_zsymm(const RUNTIME_option_t *options, const CHAM_desc_t *B, int Bm, int Bn, CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) { + if ( alpha == 0. ) { + return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, + beta, C, Cm, Cn ); + } + quark_option_t *opt = (quark_option_t*)(options->schedopt); + int accessC = ( beta == 0. ) ? OUTPUT : INOUT; + DAG_CORE_SYMM; QUARK_Insert_Task(opt->quark, CORE_zsymm_quark, (Quark_Task_Flags*)opt, sizeof(int), &side, VALUE, @@ -63,6 +70,6 @@ void INSERT_TASK_zsymm(const RUNTIME_option_t *options, sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INPUT, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, - sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, + sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), accessC, 0); } diff --git a/runtime/quark/codelets/codelet_zsyr2k.c b/runtime/quark/codelets/codelet_zsyr2k.c index 0e41e44fa..d172bc359 100644 --- a/runtime/quark/codelets/codelet_zsyr2k.c +++ b/runtime/quark/codelets/codelet_zsyr2k.c @@ -39,7 +39,7 @@ void CORE_zsyr2k_quark(Quark *quark) quark_unpack_args_9(quark, uplo, trans, n, k, alpha, tileA, tileB, beta, tileC); TCORE_zsyr2k(uplo, trans, - n, k, alpha, tileA, tileB, beta, tileC); + n, k, alpha, tileA, tileB, beta, tileC); } void INSERT_TASK_zsyr2k(const RUNTIME_option_t *options, @@ -49,7 +49,14 @@ void INSERT_TASK_zsyr2k(const RUNTIME_option_t *options, const CHAM_desc_t *B, int Bm, int Bn, CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) { + if ( alpha == 0. ) { + return INSERT_TASK_zlascal( options, uplo, n, n, nb, + beta, C, Cm, Cn ); + } + quark_option_t *opt = (quark_option_t*)(options->schedopt); + int accessC = ( beta == 0. ) ? OUTPUT : INOUT; + DAG_CORE_SYR2K; QUARK_Insert_Task(opt->quark, CORE_zsyr2k_quark, (Quark_Task_Flags*)opt, sizeof(int), &uplo, VALUE, @@ -60,6 +67,6 @@ void INSERT_TASK_zsyr2k(const RUNTIME_option_t *options, sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INPUT, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, - sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, + sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), accessC, 0); } diff --git a/runtime/quark/codelets/codelet_zsyrk.c b/runtime/quark/codelets/codelet_zsyrk.c index d8c272f50..b58c022d3 100644 --- a/runtime/quark/codelets/codelet_zsyrk.c +++ b/runtime/quark/codelets/codelet_zsyrk.c @@ -49,7 +49,14 @@ void INSERT_TASK_zsyrk(const RUNTIME_option_t *options, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) { + if ( alpha == 0. ) { + return INSERT_TASK_zlascal( options, uplo, n, n, nb, + beta, C, Cm, Cn ); + } + quark_option_t *opt = (quark_option_t*)(options->schedopt); + int accessC = ( beta == 0. ) ? OUTPUT : INOUT; + DAG_CORE_SYRK; QUARK_Insert_Task(opt->quark, CORE_zsyrk_quark, (Quark_Task_Flags*)opt, sizeof(int), &uplo, VALUE, @@ -59,6 +66,6 @@ void INSERT_TASK_zsyrk(const RUNTIME_option_t *options, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, - sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, + sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), accessC, 0); } diff --git a/runtime/quark/codelets/codelet_ztradd.c b/runtime/quark/codelets/codelet_ztradd.c index f3a9e0d24..6532b1600 100644 --- a/runtime/quark/codelets/codelet_ztradd.c +++ b/runtime/quark/codelets/codelet_ztradd.c @@ -37,71 +37,19 @@ void CORE_ztradd_quark(Quark *quark) return; } -/** - ****************************************************************************** - * - * @ingroup INSERT_TASK_Complex64_t - * - * @brief Adds two trapezoidal matrices together as in PBLAS pzgeadd. - * - * B <- alpha * op(A) + beta * B, - * - * where op(X) = X, X', or conj(X') - * - ******************************************************************************* - * - * @param[in] uplo - * Specifies the shape of A and B matrices: - * = ChamUpperLower: A and B are general matrices. - * = ChamUpper: op(A) and B are upper trapezoidal matrices. - * = ChamLower: op(A) and B are lower trapezoidal matrices. - * - * @param[in] trans - * Specifies whether the matrix A is non-transposed, transposed, or - * conjugate transposed - * = ChamNoTrans: op(A) = A - * = ChamTrans: op(A) = A' - * = ChamConjTrans: op(A) = conj(A') - * - * @param[in] M - * Number of rows of the matrices op(A) and B. - * - * @param[in] N - * Number of columns of the matrices op(A) and B. - * - * @param[in] alpha - * Scalar factor of A. - * - * @param[in] A - * Matrix of size LDA-by-N, if trans = ChamNoTrans, LDA-by-M - * otherwise. - * - * @param[in] LDA - * Leading dimension of the array A. LDA >= max(1,k), with k=M, if - * trans = ChamNoTrans, and k=N otherwise. - * - * @param[in] beta - * Scalar factor of B. - * - * @param[in,out] B - * Matrix of size LDB-by-N. - * On exit, B = alpha * op(A) + beta * B - * - * @param[in] LDB - * Leading dimension of the array B. LDB >= max(1,M) - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if -i, the i-th argument had an illegal value - * - */ void INSERT_TASK_ztradd( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int m, int n, int nb, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn ) { + if ( alpha == 0. ) { + return INSERT_TASK_zlascal( options, uplo, m, n, nb, + beta, B, Bm, Bn ); + } + quark_option_t *opt = (quark_option_t*)(options->schedopt); + int accessB = ( beta == 0. ) ? OUTPUT : INOUT; + DAG_CORE_GEADD; QUARK_Insert_Task(opt->quark, CORE_ztradd_quark, (Quark_Task_Flags*)opt, sizeof(int), &uplo, VALUE, @@ -111,7 +59,7 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, - sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INOUT, + sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), accessB, 0); (void)nb; diff --git a/runtime/quark/codelets/codelet_ztrmm.c b/runtime/quark/codelets/codelet_ztrmm.c index 56d6afada..df18b77be 100644 --- a/runtime/quark/codelets/codelet_ztrmm.c +++ b/runtime/quark/codelets/codelet_ztrmm.c @@ -45,12 +45,17 @@ void CORE_ztrmm_quark(Quark *quark) tileB); } -void INSERT_TASK_ztrmm(const RUNTIME_option_t *options, - cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, - int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn) +void INSERT_TASK_ztrmm( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { + if ( alpha == 0. ) { + return INSERT_TASK_zlaset( options, ChamUpperLower, m, n, + alpha, alpha, B, Bm, Bn ); + } + quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_TRMM; QUARK_Insert_Task(opt->quark, CORE_ztrmm_quark, (Quark_Task_Flags*)opt, diff --git a/runtime/starpu/codelets/codelet_zgeadd.c b/runtime/starpu/codelets/codelet_zgeadd.c index bd027eff0..65c82231e 100644 --- a/runtime/starpu/codelets/codelet_zgeadd.c +++ b/runtime/starpu/codelets/codelet_zgeadd.c @@ -12,8 +12,6 @@ * @brief Chameleon zgeadd StarPU codelet * * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede @@ -86,59 +84,6 @@ CODELETS(zgeadd, cl_zgeadd_cpu_func, cl_zgeadd_cuda_func, STARPU_CUDA_ASYNC) CODELETS_CPU(zgeadd, cl_zgeadd_cpu_func) #endif -/** - ****************************************************************************** - * - * @ingroup INSERT_TASK_Complex64_t - * - * @brief Adds two general matrices together as in PBLAS pzgeadd. - * - * B <- alpha * op(A) + beta * B, - * - * where op(X) = X, X', or conj(X') - * - ******************************************************************************* - * - * @param[in] trans - * Specifies whether the matrix A is non-transposed, transposed, or - * conjugate transposed - * = ChamNoTrans: op(A) = A - * = ChamTrans: op(A) = A' - * = ChamConjTrans: op(A) = conj(A') - * - * @param[in] M - * Number of rows of the matrices op(A) and B. - * - * @param[in] N - * Number of columns of the matrices op(A) and B. - * - * @param[in] alpha - * Scalar factor of A. - * - * @param[in] A - * Matrix of size ldA-by-N, if trans = ChamNoTrans, ldA-by-M - * otherwise. - * - * @param[in] ldA - * Leading dimension of the array A. ldA >= max(1,k), with k=M, if - * trans = ChamNoTrans, and k=N otherwise. - * - * @param[in] beta - * Scalar factor of B. - * - * @param[in,out] B - * Matrix of size ldB-by-N. - * On exit, B = alpha * op(A) + beta * B - * - * @param[in] ldB - * Leading dimension of the array B. ldB >= max(1,M) - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if -i, the i-th argument had an illegal value - * - */ void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, cham_trans_t trans, int m, int n, int nb, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c index 54e6256b5..42bd6609a 100644 --- a/runtime/starpu/codelets/codelet_zgemm.c +++ b/runtime/starpu/codelets/codelet_zgemm.c @@ -12,8 +12,6 @@ * @brief Chameleon zgemm StarPU codelet * * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge diff --git a/runtime/starpu/codelets/codelet_zhe2ge.c b/runtime/starpu/codelets/codelet_zhe2ge.c index fe1f9eb29..203544170 100644 --- a/runtime/starpu/codelets/codelet_zhe2ge.c +++ b/runtime/starpu/codelets/codelet_zhe2ge.c @@ -44,11 +44,6 @@ static void cl_zhe2ge_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU(zhe2ge, cl_zhe2ge_cpu_func) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ void INSERT_TASK_zhe2ge( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int mb, diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c index 5c90271ec..f4963cacf 100644 --- a/runtime/starpu/codelets/codelet_zhemm.c +++ b/runtime/starpu/codelets/codelet_zhemm.c @@ -12,8 +12,6 @@ * @brief Chameleon zhemm StarPU codelet * * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge diff --git a/runtime/starpu/codelets/codelet_zher2k.c b/runtime/starpu/codelets/codelet_zher2k.c index 0e93a35c9..e652db505 100644 --- a/runtime/starpu/codelets/codelet_zher2k.c +++ b/runtime/starpu/codelets/codelet_zher2k.c @@ -12,8 +12,6 @@ * @brief Chameleon zher2k StarPU codelet * * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c index 915cc9b77..ec0f985b5 100644 --- a/runtime/starpu/codelets/codelet_zherk.c +++ b/runtime/starpu/codelets/codelet_zherk.c @@ -12,8 +12,6 @@ * @brief Chameleon zherk StarPU codelet * * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge @@ -88,11 +86,6 @@ static void cl_zherk_cuda_func(void *descr[], void *cl_arg) */ CODELETS(zherk, cl_zherk_cpu_func, cl_zherk_cuda_func, STARPU_CUDA_ASYNC) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ void INSERT_TASK_zherk(const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, diff --git a/runtime/starpu/codelets/codelet_zlascal.c b/runtime/starpu/codelets/codelet_zlascal.c index d1bfc3fd3..0142c39ec 100644 --- a/runtime/starpu/codelets/codelet_zlascal.c +++ b/runtime/starpu/codelets/codelet_zlascal.c @@ -12,8 +12,6 @@ * @brief Chameleon zlascal StarPU codelet * * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Dalal Sukkari * @author Lucas Barros de Assis * @date 2020-03-03 diff --git a/runtime/starpu/codelets/codelet_zsymm.c b/runtime/starpu/codelets/codelet_zsymm.c index 40ed44bcb..b87b3bef6 100644 --- a/runtime/starpu/codelets/codelet_zsymm.c +++ b/runtime/starpu/codelets/codelet_zsymm.c @@ -12,8 +12,6 @@ * @brief Chameleon zsymm StarPU codelet * * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge diff --git a/runtime/starpu/codelets/codelet_zsyr2k.c b/runtime/starpu/codelets/codelet_zsyr2k.c index 51f013036..822094558 100644 --- a/runtime/starpu/codelets/codelet_zsyr2k.c +++ b/runtime/starpu/codelets/codelet_zsyr2k.c @@ -12,8 +12,6 @@ * @brief Chameleon zsyr2k StarPU codelet * * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c index 83c51f599..9795d0a98 100644 --- a/runtime/starpu/codelets/codelet_zsyrk.c +++ b/runtime/starpu/codelets/codelet_zsyrk.c @@ -12,8 +12,6 @@ * @brief Chameleon zsyrk StarPU codelet * * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge diff --git a/runtime/starpu/codelets/codelet_ztradd.c b/runtime/starpu/codelets/codelet_ztradd.c index ac3dc8bfa..fbd6a0f8e 100644 --- a/runtime/starpu/codelets/codelet_ztradd.c +++ b/runtime/starpu/codelets/codelet_ztradd.c @@ -12,8 +12,6 @@ * @brief Chameleon ztradd StarPU codelet * * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @author Lucas Barros de Assis * @date 2020-03-03 @@ -48,65 +46,6 @@ static void cl_ztradd_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU(ztradd, cl_ztradd_cpu_func) -/** - ****************************************************************************** - * - * @ingroup INSERT_TASK_Complex64_t - * - * @brief Adds two trapezoidal matrices together as in PBLAS pzgeadd. - * - * B <- alpha * op(A) + beta * B, - * - * where op(X) = X, X', or conj(X') - * - ******************************************************************************* - * - * @param[in] uplo - * Specifies the shape of A and B matrices: - * = ChamUpperLower: A and B are general matrices. - * = ChamUpper: op(A) and B are upper trapezoidal matrices. - * = ChamLower: op(A) and B are lower trapezoidal matrices. - * - * @param[in] trans - * Specifies whether the matrix A is non-transposed, transposed, or - * conjugate transposed - * = ChamNoTrans: op(A) = A - * = ChamTrans: op(A) = A' - * = ChamConjTrans: op(A) = conj(A') - * - * @param[in] M - * Number of rows of the matrices op(A) and B. - * - * @param[in] N - * Number of columns of the matrices op(A) and B. - * - * @param[in] alpha - * Scalar factor of A. - * - * @param[in] A - * Matrix of size ldA-by-N, if trans = ChamNoTrans, ldA-by-M - * otherwise. - * - * @param[in] ldA - * Leading dimension of the array A. ldA >= max(1,k), with k=M, if - * trans = ChamNoTrans, and k=N otherwise. - * - * @param[in] beta - * Scalar factor of B. - * - * @param[in,out] B - * Matrix of size ldB-by-N. - * On exit, B = alpha * op(A) + beta * B - * - * @param[in] ldB - * Leading dimension of the array B. ldB >= max(1,M) - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if -i, the i-th argument had an illegal value - * - */ void INSERT_TASK_ztradd( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int m, int n, int nb, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c index d1404ba96..e820a6d6b 100644 --- a/runtime/starpu/codelets/codelet_ztrmm.c +++ b/runtime/starpu/codelets/codelet_ztrmm.c @@ -12,8 +12,6 @@ * @brief Chameleon ztrmm StarPU codelet * * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Julien Langou * @author Henricus Bouwmeester * @author Mathieu Faverge -- GitLab