From 22869cafe224b2ea75a63cc543f36d292bc388e9 Mon Sep 17 00:00:00 2001 From: Mathieu Faverge <mathieu.faverge@inria.fr> Date: Fri, 16 Dec 2016 00:06:47 +0100 Subject: [PATCH] Add codelets in all three runtimes --- coreblas/compute/core_ztpmqrt.c | 2 +- runtime/CMakeLists.txt | 78 +++++++++ runtime/parsec/CMakeLists.txt | 72 +------- runtime/parsec/codelets/codelet_ztpmqrt.c | 102 +++++++++++ runtime/parsec/codelets/codelet_ztpqrt.c | 85 ++++++++++ runtime/quark/CMakeLists.txt | 72 +------- runtime/quark/codelets/codelet_ztpmqrt.c | 84 +++++++++ runtime/quark/codelets/codelet_ztpqrt.c | 72 ++++++++ runtime/starpu/CMakeLists.txt | 72 +------- runtime/starpu/codelets/codelet_zcallback.c | 4 +- runtime/starpu/codelets/codelet_ztpmqrt.c | 159 ++++++++++++++++++ runtime/starpu/codelets/codelet_ztpqrt.c | 99 +++++++++++ .../starpu/include/runtime_codelet_profile.h | 2 +- runtime/starpu/include/runtime_codelet_z.h | 2 + runtime/starpu/include/runtime_codelets.h | 2 +- runtime/starpu/include/runtime_workspace.h | 8 +- 16 files changed, 694 insertions(+), 221 deletions(-) create mode 100644 runtime/parsec/codelets/codelet_ztpmqrt.c create mode 100644 runtime/parsec/codelets/codelet_ztpqrt.c create mode 100644 runtime/quark/codelets/codelet_ztpmqrt.c create mode 100644 runtime/quark/codelets/codelet_ztpqrt.c create mode 100644 runtime/starpu/codelets/codelet_ztpmqrt.c create mode 100644 runtime/starpu/codelets/codelet_ztpqrt.c diff --git a/coreblas/compute/core_ztpmqrt.c b/coreblas/compute/core_ztpmqrt.c index ee44b8cf8..2241b5d39 100644 --- a/coreblas/compute/core_ztpmqrt.c +++ b/coreblas/compute/core_ztpmqrt.c @@ -98,7 +98,7 @@ * * @param[out] WORK * Workspace array of size LDWORK-by-NB. - * LDWORK = N if side =MorseLeft, or M if side = MorseRight. + * LDWORK = N if side = MorseLeft, or M if side = MorseRight. * ******************************************************************************* * diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index b0d76eeba..0224e3644 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -26,6 +26,84 @@ # ### +# List of codelets required by all runtimes +# ----------------------------------------- +set(CODELETS_ZSRC + codelets/codelet_ztile_zero.c + codelets/codelet_zasum.c + ################## + # BLAS 1 + ################## + codelets/codelet_zaxpy.c + ################## + # BLAS 3 + ################## + codelets/codelet_zgemm.c + codelets/codelet_zhemm.c + codelets/codelet_zher2k.c + codelets/codelet_zherk.c + codelets/codelet_zsymm.c + codelets/codelet_zsyr2k.c + codelets/codelet_zsyrk.c + codelets/codelet_ztrmm.c + codelets/codelet_ztrsm.c + ################## + # LAPACK + ################## + codelets/codelet_zgeadd.c + codelets/codelet_zlascal.c + codelets/codelet_zgelqt.c + codelets/codelet_zgeqrt.c + codelets/codelet_zgessm.c + codelets/codelet_zgessq.c + codelets/codelet_zgetrf.c + codelets/codelet_zgetrf_incpiv.c + codelets/codelet_zgetrf_nopiv.c + codelets/codelet_zhe2ge.c + codelets/codelet_zherfb.c + codelets/codelet_zhessq.c + codelets/codelet_zlacpy.c + codelets/codelet_zlange.c + codelets/codelet_zlanhe.c + codelets/codelet_zlansy.c + codelets/codelet_zlantr.c + codelets/codelet_zlaset2.c + codelets/codelet_zlaset.c + codelets/codelet_zlatro.c + codelets/codelet_zlauum.c + codelets/codelet_zplghe.c + codelets/codelet_zplgsy.c + codelets/codelet_zplrnt.c + codelets/codelet_zplssq.c + codelets/codelet_zpotrf.c + codelets/codelet_zssssm.c + codelets/codelet_zsyssq.c + codelets/codelet_zsytrf_nopiv.c + codelets/codelet_ztpqrt.c + codelets/codelet_ztpmqrt.c + codelets/codelet_ztradd.c + codelets/codelet_ztrasm.c + codelets/codelet_ztrssq.c + codelets/codelet_ztrtri.c + codelets/codelet_ztslqt.c + codelets/codelet_ztsmlq.c + codelets/codelet_ztsmqr.c + codelets/codelet_ztsmlq_hetra1.c + codelets/codelet_ztsmqr_hetra1.c + codelets/codelet_ztsqrt.c + codelets/codelet_ztstrf.c + codelets/codelet_zttlqt.c + codelets/codelet_zttmlq.c + codelets/codelet_zttmqr.c + codelets/codelet_zttqrt.c + codelets/codelet_zunmlq.c + codelets/codelet_zunmqr.c + ################## + # BUILD + ################## + codelets/codelet_zbuild.c + ) + # Check for the subdirectories # ---------------------------- if( CHAMELEON_SCHED_QUARK ) diff --git a/runtime/parsec/CMakeLists.txt b/runtime/parsec/CMakeLists.txt index a19890afe..872c19c48 100644 --- a/runtime/parsec/CMakeLists.txt +++ b/runtime/parsec/CMakeLists.txt @@ -88,77 +88,7 @@ set(RUNTIME_COMMON # ------------------------------------------------------ set(RUNTIME_SRCS_GENERATED "") set(ZSRC - codelets/codelet_ztile_zero.c - codelets/codelet_zasum.c - ################## - # BLAS 1 - ################## - codelets/codelet_zaxpy.c - ################## - # BLAS 3 - ################## - codelets/codelet_zgemm.c - codelets/codelet_zhemm.c - codelets/codelet_zher2k.c - codelets/codelet_zherk.c - codelets/codelet_zsymm.c - codelets/codelet_zsyr2k.c - codelets/codelet_zsyrk.c - codelets/codelet_ztrmm.c - codelets/codelet_ztrsm.c - ################## - # LAPACK - ################## - codelets/codelet_zgeadd.c - codelets/codelet_zlascal.c - codelets/codelet_zgelqt.c - codelets/codelet_zgeqrt.c - codelets/codelet_zgessm.c - codelets/codelet_zgessq.c - codelets/codelet_zgetrf.c - codelets/codelet_zgetrf_incpiv.c - codelets/codelet_zgetrf_nopiv.c - codelets/codelet_zhe2ge.c - codelets/codelet_zherfb.c - codelets/codelet_zhessq.c - codelets/codelet_zlacpy.c - codelets/codelet_zlange.c - codelets/codelet_zlanhe.c - codelets/codelet_zlansy.c - codelets/codelet_zlantr.c - codelets/codelet_zlaset2.c - codelets/codelet_zlaset.c - codelets/codelet_zlatro.c - codelets/codelet_zlauum.c - codelets/codelet_zplghe.c - codelets/codelet_zplgsy.c - codelets/codelet_zplrnt.c - codelets/codelet_zplssq.c - codelets/codelet_zpotrf.c - codelets/codelet_zssssm.c - codelets/codelet_zsyssq.c - codelets/codelet_zsytrf_nopiv.c - codelets/codelet_ztradd.c - codelets/codelet_ztrasm.c - codelets/codelet_ztrssq.c - codelets/codelet_ztrtri.c - codelets/codelet_ztslqt.c - codelets/codelet_ztsmlq.c - codelets/codelet_ztsmqr.c - codelets/codelet_ztsmlq_hetra1.c - codelets/codelet_ztsmqr_hetra1.c - codelets/codelet_ztsqrt.c - codelets/codelet_ztstrf.c - codelets/codelet_zttlqt.c - codelets/codelet_zttmlq.c - codelets/codelet_zttmqr.c - codelets/codelet_zttqrt.c - codelets/codelet_zunmlq.c - codelets/codelet_zunmqr.c - ################## - # BUILD - ################## - codelets/codelet_zbuild.c + ${CODELETS_ZSRC} ) precisions_rules_py(RUNTIME_SRCS_GENERATED "${ZSRC}" diff --git a/runtime/parsec/codelets/codelet_ztpmqrt.c b/runtime/parsec/codelets/codelet_ztpmqrt.c new file mode 100644 index 000000000..612e9d54a --- /dev/null +++ b/runtime/parsec/codelets/codelet_ztpmqrt.c @@ -0,0 +1,102 @@ +/** + * + * @copyright (c) 2009-2016 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file codelet_ztpqrt.c + * + * MORSE codelets kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 0.9.0 + * @author Mathieu Faverge + * @date 2016-12-15 + * @precisions normal z -> s d c + * + **/ +#include "runtime/parsec/include/morse_parsec.h" + +static int +CORE_ztpmqrt_parsec(dague_execution_unit_t *context, + dague_execution_context_t *this_task) +{ + MORSE_enum *side; + MORSE_enum *trans; + int *M; + int *N; + int *K; + int *L; + int *ib; + const MORSE_Complex64_t *V; + int *ldv; + const MORSE_Complex64_t *T; + int *ldt; + MORSE_Complex64_t *A; + int *lda; + MORSE_Complex64_t *B; + int *ldb; + MORSE_Complex64_t *WORK; + + dague_dtd_unpack_args( + this_task, + UNPACK_VALUE, &side, + UNPACK_VALUE, &trans, + UNPACK_VALUE, &M, + UNPACK_VALUE, &N, + UNPACK_VALUE, &K, + UNPACK_VALUE, &L, + UNPACK_VALUE, &ib, + UNPACK_DATA, &V, + UNPACK_VALUE, &ldv, + UNPACK_DATA, &T, + UNPACK_VALUE, &ldt, + UNPACK_DATA, &A, + UNPACK_VALUE, &lda, + UNPACK_DATA, &B, + UNPACK_VALUE, &ldb, + UNPACK_SCRATCH, &WORK ); + + CORE_ztpmqrt( *side, *trans, *M, *N, *K, *L, *ib, + V, *ldv, T, *ldt, A, *lda, B, *ldb, WORK ); + + return 0; +} + +void MORSE_TASK_ztpmqrt( const MORSE_option_t *options, + MORSE_enum side, MORSE_enum trans, + int M, int N, int K, int L, int ib, int nb, + const MORSE_desc_t *V, int Vm, int Vn, int ldv, + const MORSE_desc_t *T, int Tm, int Tn, int ldt, + const MORSE_desc_t *A, int Am, int An, int lda, + const MORSE_desc_t *B, int Bm, int Bn, int ldb ) +{ + dague_dtd_handle_t* DAGUE_dtd_handle = (dague_dtd_handle_t *)(options->sequence->schedopt); + + dague_insert_task( + DAGUE_dtd_handle, CORE_ztpmqrt_parsec, "tpmqrt", + sizeof(MORSE_enum), &side, VALUE, + sizeof(MORSE_enum), &trans, VALUE, + sizeof(int), &M, VALUE, + sizeof(int), &N, VALUE, + sizeof(int), &K, VALUE, + sizeof(int), &L, VALUE, + sizeof(int), &ib, VALUE, + PASSED_BY_REF, RTBLKADDR( V, MORSE_Complex64_t, Vm, Vn ), INPUT | REGION_FULL, + sizeof(int), &ldv, VALUE, + PASSED_BY_REF, RTBLKADDR( T, MORSE_Complex64_t, Tm, Tn ), INPUT | REGION_FULL, + sizeof(int), &ldt, VALUE, + PASSED_BY_REF, RTBLKADDR( A, MORSE_Complex64_t, Am, An ), INOUT | REGION_FULL, + sizeof(int), &lda, VALUE, + PASSED_BY_REF, RTBLKADDR( B, MORSE_Complex64_t, Bm, Bn ), INOUT | REGION_FULL, + sizeof(int), &ldb, VALUE, + sizeof(MORSE_Complex64_t)*ib*nb, NULL, SCRATCH, + 0); +} diff --git a/runtime/parsec/codelets/codelet_ztpqrt.c b/runtime/parsec/codelets/codelet_ztpqrt.c new file mode 100644 index 000000000..a0b3f6e06 --- /dev/null +++ b/runtime/parsec/codelets/codelet_ztpqrt.c @@ -0,0 +1,85 @@ +/** + * + * @copyright (c) 2009-2016 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file codelet_ztpqrt.c + * + * MORSE codelets kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 0.9.0 + * @author Mathieu Faverge + * @date 2016-12-15 + * @precisions normal z -> s d c + * + **/ +#include "runtime/parsec/include/morse_parsec.h" + +static int +CORE_ztpqrt_parsec(dague_execution_unit_t *context, + dague_execution_context_t *this_task) +{ + int *M; + int *N; + int *L; + int *ib; + MORSE_Complex64_t *A; + int *lda; + MORSE_Complex64_t *B; + int *ldb; + MORSE_Complex64_t *T; + int *ldt; + MORSE_Complex64_t *WORK; + + dague_dtd_unpack_args( + this_task, + UNPACK_VALUE, &M, + UNPACK_VALUE, &N, + UNPACK_VALUE, &L, + UNPACK_VALUE, &ib, + UNPACK_DATA, &A, + UNPACK_VALUE, &lda, + UNPACK_DATA, &B, + UNPACK_VALUE, &ldb, + UNPACK_DATA, &T, + UNPACK_VALUE, &ldt, + UNPACK_SCRATCH, &WORK ); + + CORE_ztpqrt( *M, *N, *L, *ib, + A, *lda, B, *ldb, T, *ldt, WORK ); + + return 0; +} + +void MORSE_TASK_ztpqrt( const MORSE_option_t *options, + int M, int N, int L, int ib, int nb, + const MORSE_desc_t *A, int Am, int An, int lda, + const MORSE_desc_t *B, int Bm, int Bn, int ldb, + const MORSE_desc_t *T, int Tm, int Tn, int ldt ) +{ + dague_dtd_handle_t* DAGUE_dtd_handle = (dague_dtd_handle_t *)(options->sequence->schedopt); + + dague_insert_task( + DAGUE_dtd_handle, CORE_ztpqrt_parsec, "tpqrt", + sizeof(int), &M, VALUE, + sizeof(int), &N, VALUE, + sizeof(int), &L, VALUE, + sizeof(int), &ib, VALUE, + PASSED_BY_REF, RTBLKADDR( A, MORSE_Complex64_t, Am, An ), INOUT | REGION_U | REGION_D, + sizeof(int), &lda, VALUE, + PASSED_BY_REF, RTBLKADDR( B, MORSE_Complex64_t, Bm, Bn ), INOUT | REGION_FULL, + sizeof(int), &ldb, VALUE, + PASSED_BY_REF, RTBLKADDR( T, MORSE_Complex64_t, Tm, Tn ), INOUT | REGION_FULL, + sizeof(int), &ldt, VALUE, + sizeof(MORSE_Complex64_t)*ib*nb, NULL, SCRATCH, + 0); +} diff --git a/runtime/quark/CMakeLists.txt b/runtime/quark/CMakeLists.txt index 9366a00d7..fa7952a15 100644 --- a/runtime/quark/CMakeLists.txt +++ b/runtime/quark/CMakeLists.txt @@ -86,77 +86,7 @@ set(RUNTIME_COMMON # ------------------------------------------------------ set(RUNTIME_SRCS_GENERATED "") set(ZSRC - codelets/codelet_ztile_zero.c - codelets/codelet_zasum.c - ################## - # BLAS 1 - ################## - codelets/codelet_zaxpy.c - ################## - # BLAS 3 - ################## - codelets/codelet_zgemm.c - codelets/codelet_zhemm.c - codelets/codelet_zher2k.c - codelets/codelet_zherk.c - codelets/codelet_zsymm.c - codelets/codelet_zsyr2k.c - codelets/codelet_zsyrk.c - codelets/codelet_ztrmm.c - codelets/codelet_ztrsm.c - ################## - # LAPACK - ################## - codelets/codelet_zgeadd.c - codelets/codelet_zlascal.c - codelets/codelet_zgelqt.c - codelets/codelet_zgeqrt.c - codelets/codelet_zgessm.c - codelets/codelet_zgessq.c - codelets/codelet_zgetrf.c - codelets/codelet_zgetrf_incpiv.c - codelets/codelet_zgetrf_nopiv.c - codelets/codelet_zhe2ge.c - codelets/codelet_zherfb.c - codelets/codelet_zhessq.c - codelets/codelet_zlacpy.c - codelets/codelet_zlange.c - codelets/codelet_zlanhe.c - codelets/codelet_zlansy.c - codelets/codelet_zlantr.c - codelets/codelet_zlaset2.c - codelets/codelet_zlaset.c - codelets/codelet_zlatro.c - codelets/codelet_zlauum.c - codelets/codelet_zplghe.c - codelets/codelet_zplgsy.c - codelets/codelet_zplrnt.c - codelets/codelet_zplssq.c - codelets/codelet_zpotrf.c - codelets/codelet_zssssm.c - codelets/codelet_zsyssq.c - codelets/codelet_zsytrf_nopiv.c - codelets/codelet_ztradd.c - codelets/codelet_ztrasm.c - codelets/codelet_ztrssq.c - codelets/codelet_ztrtri.c - codelets/codelet_ztslqt.c - codelets/codelet_ztsmlq.c - codelets/codelet_ztsmqr.c - codelets/codelet_ztsmlq_hetra1.c - codelets/codelet_ztsmqr_hetra1.c - codelets/codelet_ztsqrt.c - codelets/codelet_ztstrf.c - codelets/codelet_zttlqt.c - codelets/codelet_zttmlq.c - codelets/codelet_zttmqr.c - codelets/codelet_zttqrt.c - codelets/codelet_zunmlq.c - codelets/codelet_zunmqr.c - ################## - # BUILD - ################## - codelets/codelet_zbuild.c + ${CODELETS_ZSRC} ) precisions_rules_py(RUNTIME_SRCS_GENERATED "${ZSRC}" diff --git a/runtime/quark/codelets/codelet_ztpmqrt.c b/runtime/quark/codelets/codelet_ztpmqrt.c new file mode 100644 index 000000000..25bd5ac83 --- /dev/null +++ b/runtime/quark/codelets/codelet_ztpmqrt.c @@ -0,0 +1,84 @@ +/** + * + * @copyright (c) 2009-2016 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file codelet_ztpqrt.c + * + * MORSE codelets kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 0.9.0 + * @author Mathieu Faverge + * @date 2016-12-15 + * @precisions normal z -> s d c + * + **/ +#include "runtime/quark/include/morse_quark.h" + +static void +CORE_ztpmqrt_quark( Quark *quark ) +{ + MORSE_enum side; + MORSE_enum trans; + int M; + int N; + int K; + int L; + int ib; + const MORSE_Complex64_t *V; + int ldv; + const MORSE_Complex64_t *T; + int ldt; + MORSE_Complex64_t *A; + int lda; + MORSE_Complex64_t *B; + int ldb; + MORSE_Complex64_t *WORK; + + quark_unpack_args_16( quark, side, trans, M, N, K, L, ib, + V, ldv, T, ldt, A, lda, B, ldb, WORK ); + + CORE_ztpmqrt( side, trans, M, N, K, L, ib, + V, ldv, T, ldt, A, lda, B, ldb, WORK ); +} + +void MORSE_TASK_ztpmqrt( const MORSE_option_t *options, + MORSE_enum side, MORSE_enum trans, + int M, int N, int K, int L, int ib, int nb, + const MORSE_desc_t *V, int Vm, int Vn, int ldv, + const MORSE_desc_t *T, int Tm, int Tn, int ldt, + const MORSE_desc_t *A, int Am, int An, int lda, + const MORSE_desc_t *B, int Bm, int Bn, int ldb ) +{ + quark_option_t *opt = (quark_option_t*)(options->schedopt); + DAG_CORE_TSMQR; + + QUARK_Insert_Task( + opt->quark, CORE_ztpmqrt_quark, (Quark_Task_Flags*)opt, + sizeof(MORSE_enum), &side, VALUE, + sizeof(MORSE_enum), &trans, VALUE, + sizeof(int), &M, VALUE, + sizeof(int), &N, VALUE, + sizeof(int), &K, VALUE, + sizeof(int), &L, VALUE, + sizeof(int), &ib, VALUE, + sizeof(MORSE_Complex64_t)*nb*nb, RTBLKADDR( V, MORSE_Complex64_t, Vm, Vn ), INPUT, + sizeof(int), &ldv, VALUE, + sizeof(MORSE_Complex64_t)*ib*nb, RTBLKADDR( T, MORSE_Complex64_t, Tm, Tn ), INPUT, + sizeof(int), &ldt, VALUE, + sizeof(MORSE_Complex64_t)*nb*nb, RTBLKADDR( A, MORSE_Complex64_t, Am, An ), INOUT, + sizeof(int), &lda, VALUE, + sizeof(MORSE_Complex64_t)*nb*nb, RTBLKADDR( B, MORSE_Complex64_t, Bm, Bn ), INOUT, + sizeof(int), &ldb, VALUE, + sizeof(MORSE_Complex64_t)*ib*nb, NULL, SCRATCH, + 0); +} diff --git a/runtime/quark/codelets/codelet_ztpqrt.c b/runtime/quark/codelets/codelet_ztpqrt.c new file mode 100644 index 000000000..9b7e09876 --- /dev/null +++ b/runtime/quark/codelets/codelet_ztpqrt.c @@ -0,0 +1,72 @@ +/** + * + * @copyright (c) 2009-2016 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file codelet_ztpqrt.c + * + * MORSE codelets kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 0.9.0 + * @author Mathieu Faverge + * @date 2016-12-15 + * @precisions normal z -> s d c + * + **/ +#include "runtime/quark/include/morse_quark.h" + +static void +CORE_ztpqrt_quark( Quark *quark ) +{ + int M; + int N; + int L; + int ib; + MORSE_Complex64_t *A; + int lda; + MORSE_Complex64_t *B; + int ldb; + MORSE_Complex64_t *T; + int ldt; + MORSE_Complex64_t *WORK; + + quark_unpack_args_11( quark, M, N, L, ib, + A, lda, B, ldb, T, ldt, WORK ); + + CORE_ztpqrt( M, N, L, ib, + A, lda, B, ldb, T, ldt, WORK ); +} + +void MORSE_TASK_ztpqrt( const MORSE_option_t *options, + int M, int N, int L, int ib, int nb, + const MORSE_desc_t *A, int Am, int An, int lda, + const MORSE_desc_t *B, int Bm, int Bn, int ldb, + const MORSE_desc_t *T, int Tm, int Tn, int ldt ) +{ + quark_option_t *opt = (quark_option_t*)(options->schedopt); + DAG_CORE_TSQRT; + + QUARK_Insert_Task( + opt->quark, CORE_ztpqrt_quark, (Quark_Task_Flags*)opt, + sizeof(int), &M, VALUE, + sizeof(int), &N, VALUE, + sizeof(int), &L, VALUE, + sizeof(int), &ib, VALUE, + sizeof(MORSE_Complex64_t)*nb*nb, RTBLKADDR( A, MORSE_Complex64_t, Am, An ), INOUT | QUARK_REGION_U | QUARK_REGION_D, + sizeof(int), &lda, VALUE, + sizeof(MORSE_Complex64_t)*nb*nb, RTBLKADDR( B, MORSE_Complex64_t, Bm, Bn ), INOUT, + sizeof(int), &ldb, VALUE, + sizeof(MORSE_Complex64_t)*nb*ib, RTBLKADDR( T, MORSE_Complex64_t, Tm, Tn ), OUTPUT, + sizeof(int), &ldt, VALUE, + sizeof(MORSE_Complex64_t)*(ib+1)*nb, NULL, SCRATCH, + 0); +} diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt index 08956acaf..b2748379d 100644 --- a/runtime/starpu/CMakeLists.txt +++ b/runtime/starpu/CMakeLists.txt @@ -106,77 +106,7 @@ set_source_files_properties(control/runtime_profiling.c PROPERTIES COMPILE_FLAGS set(RUNTIME_SRCS_GENERATED "") set(ZSRC codelets/codelet_zcallback.c - codelets/codelet_ztile_zero.c - codelets/codelet_zasum.c - ################## - # BLAS 1 - ################## - codelets/codelet_zaxpy.c - ################## - # BLAS 3 - ################## - codelets/codelet_zgemm.c - codelets/codelet_zhemm.c - codelets/codelet_zher2k.c - codelets/codelet_zherk.c - codelets/codelet_zsymm.c - codelets/codelet_zsyr2k.c - codelets/codelet_zsyrk.c - codelets/codelet_ztrmm.c - codelets/codelet_ztrsm.c - ################## - # LAPACK - ################## - codelets/codelet_zgeadd.c - codelets/codelet_zlascal.c - codelets/codelet_zgelqt.c - codelets/codelet_zgeqrt.c - codelets/codelet_zgessm.c - codelets/codelet_zgessq.c - codelets/codelet_zgetrf.c - codelets/codelet_zgetrf_incpiv.c - codelets/codelet_zgetrf_nopiv.c - codelets/codelet_zhe2ge.c - codelets/codelet_zherfb.c - codelets/codelet_zhessq.c - codelets/codelet_zlacpy.c - codelets/codelet_zlange.c - codelets/codelet_zlanhe.c - codelets/codelet_zlansy.c - codelets/codelet_zlantr.c - codelets/codelet_zlaset2.c - codelets/codelet_zlaset.c - codelets/codelet_zlatro.c - codelets/codelet_zlauum.c - codelets/codelet_zplghe.c - codelets/codelet_zplgsy.c - codelets/codelet_zplrnt.c - codelets/codelet_zplssq.c - codelets/codelet_zpotrf.c - codelets/codelet_zssssm.c - codelets/codelet_zsyssq.c - codelets/codelet_zsytrf_nopiv.c - codelets/codelet_ztradd.c - codelets/codelet_ztrasm.c - codelets/codelet_ztrssq.c - codelets/codelet_ztrtri.c - codelets/codelet_ztslqt.c - codelets/codelet_ztsmlq.c - codelets/codelet_ztsmqr.c - codelets/codelet_ztsmlq_hetra1.c - codelets/codelet_ztsmqr_hetra1.c - codelets/codelet_ztsqrt.c - codelets/codelet_ztstrf.c - codelets/codelet_zttlqt.c - codelets/codelet_zttmlq.c - codelets/codelet_zttmqr.c - codelets/codelet_zttqrt.c - codelets/codelet_zunmlq.c - codelets/codelet_zunmqr.c - ################## - # BUILD - ################## - codelets/codelet_zbuild.c + ${CODELETS_ZSRC} ) precisions_rules_py(RUNTIME_SRCS_GENERATED "${ZSRC}" diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c index 8af4ec546..bb26aa301 100644 --- a/runtime/starpu/codelets/codelet_zcallback.c +++ b/runtime/starpu/codelets/codelet_zcallback.c @@ -67,7 +67,9 @@ CHAMELEON_CL_CB(zssssm, starpu_matrix_get_nx(task->handles[0]), starpu_ma CHAMELEON_CL_CB(zsymm, starpu_matrix_get_nx(task->handles[2]), starpu_matrix_get_ny(task->handles[2]), 0, 2.*M*M *N); CHAMELEON_CL_CB(zsyr2k, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, ( 1.+2.*M*N)*M); CHAMELEON_CL_CB(zsyrk, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, ( 1.+ M)*M*N); -CHAMELEON_CL_CB(ztrasm, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, 0.5*M*(M+1)); +CHAMELEON_CL_CB(ztpqrt, starpu_matrix_get_nx(task->handles[1]), starpu_matrix_get_ny(task->handles[1]), starpu_matrix_get_nx(task->handles[0]), 2.*M*N*K); +CHAMELEON_CL_CB(ztpmqrt, starpu_matrix_get_nx(task->handles[3]), starpu_matrix_get_ny(task->handles[3]), starpu_matrix_get_nx(task->handles[2]), 4.*M*N*K); +CHAMELEON_CL_CB(ztrasm, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, 0.5*M*(M+1)); CHAMELEON_CL_CB(ztrmm, starpu_matrix_get_nx(task->handles[1]), starpu_matrix_get_ny(task->handles[1]), 0, M*M*N); CHAMELEON_CL_CB(ztrsm, starpu_matrix_get_nx(task->handles[1]), starpu_matrix_get_ny(task->handles[1]), 0, M*M*N); CHAMELEON_CL_CB(ztrtri, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (1./3.)*M *M*M); diff --git a/runtime/starpu/codelets/codelet_ztpmqrt.c b/runtime/starpu/codelets/codelet_ztpmqrt.c new file mode 100644 index 000000000..98188588e --- /dev/null +++ b/runtime/starpu/codelets/codelet_ztpmqrt.c @@ -0,0 +1,159 @@ +/** + * + * @copyright (c) 2009-2016 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file codelet_ztpmqrt.c + * + * MORSE codelets kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 0.9.0 + * @author Mathieu Faverge + * @date 2016-12-15 + * @precisions normal z -> s d c + * + **/ +#include "runtime/starpu/include/morse_starpu.h" +#include "runtime/starpu/include/runtime_codelet_z.h" + +void MORSE_TASK_ztpmqrt( const MORSE_option_t *options, + MORSE_enum side, MORSE_enum trans, + int M, int N, int K, int L, int ib, int nb, + const MORSE_desc_t *V, int Vm, int Vn, int ldv, + const MORSE_desc_t *T, int Tm, int Tn, int ldt, + const MORSE_desc_t *A, int Am, int An, int lda, + const MORSE_desc_t *B, int Bm, int Bn, int ldb ) +{ + struct starpu_codelet *codelet = &cl_ztpmqrt; + void (*callback)(void*) = options->profiling ? cl_ztpmqrt_callback : NULL; + + if ( morse_desc_islocal( A, Am, An ) || + morse_desc_islocal( B, Bm, Bn ) || + morse_desc_islocal( V, Vm, Vn ) || + morse_desc_islocal( T, Tm, Tn ) ) + { + starpu_insert_task( + codelet, + STARPU_VALUE, &side, sizeof(MORSE_enum), + STARPU_VALUE, &trans, sizeof(MORSE_enum), + STARPU_VALUE, &M, sizeof(int), + STARPU_VALUE, &N, sizeof(int), + STARPU_VALUE, &K, sizeof(int), + STARPU_VALUE, &L, sizeof(int), + STARPU_R, RTBLKADDR(V, MORSE_Complex64_t, Vm, Vn), + STARPU_VALUE, &ldv, sizeof(int), + STARPU_R, RTBLKADDR(T, MORSE_Complex64_t, Tm, Tn), + STARPU_VALUE, &ldt, sizeof(int), + STARPU_RW, RTBLKADDR(A, MORSE_Complex64_t, Am, An), + STARPU_VALUE, &lda, sizeof(int), + STARPU_RW, RTBLKADDR(B, MORSE_Complex64_t, Bm, Bn), + STARPU_VALUE, &ldb, sizeof(int), + /* Other options */ + STARPU_SCRATCH, options->ws_worker, + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, +#if defined(CHAMELEON_USE_MPI) + STARPU_EXECUTE_ON_NODE, execution_rank, +#endif +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "ztpmqrt", +#endif + 0); + } +} + + +#if !defined(CHAMELEON_SIMULATION) +static void cl_ztpmqrt_cpu_func(void *descr[], void *cl_arg) +{ + MORSE_enum side; + MORSE_enum trans; + int M; + int N; + int K; + int L; + int ib; + const MORSE_Complex64_t *V; + int ldv; + const MORSE_Complex64_t *T; + int ldt; + MORSE_Complex64_t *A; + int lda; + MORSE_Complex64_t *B; + int ldb; + MORSE_Complex64_t *WORK; + + V = (const MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); + T = (const MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); + A = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); + B = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); + WORK = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */ + + starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib, + &ldv, &ldt, &lda, &ldb ); + + CORE_ztpmqrt( side, trans, M, N, K, L, ib, + V, ldv, T, ldt, A, lda, B, ldb, WORK ); +} + + +#if defined(CHAMELEON_USE_CUDA) +static void cl_ztpmqrt_cuda_func(void *descr[], void *cl_arg) +{ + MORSE_enum side; + MORSE_enum trans; + int M; + int N; + int K; + int L; + int k; + int ib; + const cuDoubleComplex *V; + int ldv; + const cuDoubleComplex *T; + int ldt; + cuDoubleComplex *A; + int lda; + cuDoubleComplex *B; + int ldb; + cuDoubleComplex *W; + CUstream stream; + + V = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); + T = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); + A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); + B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); + W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */ + + starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib, + &ldv, &ldt, &lda, &ldb ); + + stream = starpu_cuda_get_local_stream(); + cublasSetKernelStream( stream ); + + CUDA_ztpmqrt( + side, trans, M, N, K, L, ib, + A, lda, B, ldb, V, ldv, T, ldt, + W, stream ); + +#ifndef STARPU_CUDA_ASYNC + cudaStreamSynchronize( stream ); +#endif +} +#endif /* defined(CHAMELEON_USE_CUDA) */ +#endif /* !defined(CHAMELEON_SIMULATION) */ + + +/* + * Codelet definition + */ +CODELETS(ztpmqrt, 5, cl_ztpmqrt_cpu_func, cl_ztpmqrt_cuda_func, STARPU_CUDA_ASYNC) diff --git a/runtime/starpu/codelets/codelet_ztpqrt.c b/runtime/starpu/codelets/codelet_ztpqrt.c new file mode 100644 index 000000000..b6da13320 --- /dev/null +++ b/runtime/starpu/codelets/codelet_ztpqrt.c @@ -0,0 +1,99 @@ +/** + * + * @copyright (c) 2009-2016 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file codelet_ztpqrt.c + * + * MORSE codelets kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 0.9.0 + * @author Mathieu Faverge + * @date 2016-12-15 + * @precisions normal z -> s d c + * + **/ +#include "runtime/starpu/include/morse_starpu.h" +#include "runtime/starpu/include/runtime_codelet_z.h" + +void MORSE_TASK_ztpqrt( const MORSE_option_t *options, + int M, int N, int L, int ib, int nb, + const MORSE_desc_t *A, int Am, int An, int lda, + const MORSE_desc_t *B, int Bm, int Bn, int ldb, + const MORSE_desc_t *T, int Tm, int Tn, int ldt ) +{ + struct starpu_codelet *codelet = &cl_ztpqrt; + void (*callback)(void*) = options->profiling ? cl_ztpqrt_callback : NULL; + + if ( morse_desc_islocal( A, Am, An ) || + morse_desc_islocal( B, Bm, Bn ) || + morse_desc_islocal( T, Tm, Tn ) ) + { + starpu_insert_task( + codelet, + STARPU_VALUE, &M, sizeof(int), + STARPU_VALUE, &N, sizeof(int), + STARPU_VALUE, &L, sizeof(int), + STARPU_RW, RTBLKADDR(A, MORSE_Complex64_t, Am, An), + STARPU_VALUE, &lda, sizeof(int), + STARPU_RW, RTBLKADDR(B, MORSE_Complex64_t, Bm, Bn), + STARPU_VALUE, &ldb, sizeof(int), + STARPU_RW, RTBLKADDR(T, MORSE_Complex64_t, Tm, Tn), + STARPU_VALUE, &ldt, sizeof(int), + /* Other options */ + STARPU_SCRATCH, options->ws_worker, + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, +#if defined(CHAMELEON_USE_MPI) + STARPU_EXECUTE_ON_NODE, execution_rank, +#endif +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "ztpqrt", +#endif + 0); + } +} + + +#if !defined(CHAMELEON_SIMULATION) +static void cl_ztpqrt_cpu_func(void *descr[], void *cl_arg) +{ + int M; + int N; + int L; + int ib; + MORSE_Complex64_t *A; + int lda; + MORSE_Complex64_t *B; + int ldb; + MORSE_Complex64_t *T; + int ldt; + MORSE_Complex64_t *WORK; + + A = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); + B = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); + T = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); + WORK = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */ + + starpu_codelet_unpack_args( cl_arg, &M, &N, &L, &ib, + &lda, &ldb, &ldt ); + + CORE_ztpqrt( M, N, L, ib, + A, lda, B, ldb, T, ldt, WORK ); +} +#endif /* !defined(CHAMELEON_SIMULATION) */ + + +/* + * Codelet definition + */ +CODELETS_CPU(ztpqrt, 4, cl_ztpqrt_cpu_func) diff --git a/runtime/starpu/include/runtime_codelet_profile.h b/runtime/starpu/include/runtime_codelet_profile.h index 99303041a..67942fc01 100644 --- a/runtime/starpu/include/runtime_codelet_profile.h +++ b/runtime/starpu/include/runtime_codelet_profile.h @@ -119,6 +119,6 @@ extern struct starpu_perfmodel cl_##name##_fake; \ void cl_##name##_callback(); \ void profiling_display_##name##_info(void); \ - void estimate_##name##_sustained_peak(double *res); + void estimate_##name##_sustained_peak(double *res) #endif /* __CODELET_PROFILE_H__ */ diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h index 0da29addb..16de7ea01 100644 --- a/runtime/starpu/include/runtime_codelet_z.h +++ b/runtime/starpu/include/runtime_codelet_z.h @@ -73,6 +73,8 @@ ZCODELETS_HEADER(syssq) ZCODELETS_HEADER(trasm) ZCODELETS_HEADER(trssq) ZCODELETS_HEADER(trtri) +ZCODELETS_HEADER(tpqrt) +ZCODELETS_HEADER(tpmqrt) ZCODELETS_HEADER(tslqt) ZCODELETS_HEADER(tsmlq) ZCODELETS_HEADER(tsmqr) diff --git a/runtime/starpu/include/runtime_codelets.h b/runtime/starpu/include/runtime_codelets.h index cf0a3bb31..14b1c8e56 100644 --- a/runtime/starpu/include/runtime_codelets.h +++ b/runtime/starpu/include/runtime_codelets.h @@ -87,7 +87,7 @@ #define CODELETS_ALL_HEADER(name) \ - CHAMELEON_CL_CB_HEADER(name) \ + CHAMELEON_CL_CB_HEADER(name); \ void cl_##name##_load_fake_model(void); \ void cl_##name##_restore_model(void); \ extern struct starpu_codelet cl_##name; \ diff --git a/runtime/starpu/include/runtime_workspace.h b/runtime/starpu/include/runtime_workspace.h index e1bd1859d..a7d25d38e 100644 --- a/runtime/starpu/include/runtime_workspace.h +++ b/runtime/starpu/include/runtime_workspace.h @@ -26,10 +26,10 @@ #ifndef _MORSE_STARPU_WORKSPACE_H_ #define _MORSE_STARPU_WORKSPACE_H_ -/* - * Allocate workspace in host memory: CPU for any worker +/* + * Allocate workspace in host memory: CPU for any worker * or allocate workspace in worker's memory: main memory for cpu workers, - * and embedded memory for CUDA devices. + * and embedded memory for CUDA devices. */ #define MORSE_HOST_MEM 0 #define MORSE_WORKER_MEM 1 @@ -48,7 +48,7 @@ typedef struct morse_starpu_ws_s MORSE_starpu_ws_t; * (eg. MORSE_CUDA|MORSE_CPU for all CPU and GPU workers). The * memory_location argument indicates whether this should be a buffer in host * memory or in worker's memory (MORSE_HOST_MEM or MORSE_WORKER_MEM). This function - * returns 0 upon successful completion. + * returns 0 upon successful completion. */ int RUNTIME_starpu_ws_alloc ( MORSE_starpu_ws_t **workspace, size_t size, int which_workers, int memory_location); int RUNTIME_starpu_ws_free ( MORSE_starpu_ws_t *workspace); -- GitLab