diff --git a/cmake_modules/local_subs.py b/cmake_modules/local_subs.py index 34a5bc904d05debd04053b4452611b87df3d94be..0f404e1a52472500e2ada61ac251bff26bbb7c6d 100644 --- a/cmake_modules/local_subs.py +++ b/cmake_modules/local_subs.py @@ -41,6 +41,7 @@ _extra_blas = [ ('', 'sgepdf', 'dgepdf', 'cgepdf', 'zgepdf' ), ('', 'scesca', 'dcesca', 'ccesca', 'zcesca' ), ('', 'sgesum', 'dgesum', 'cgesum', 'zgesum' ), + ('', 'sgersum', 'dgersum', 'cgersum', 'zgersum' ), ] _extra_BLAS = [ [ x.upper() for x in row ] for row in _extra_blas ] diff --git a/compute/pzgemm.c b/compute/pzgemm.c index 66a79bcde0a26f21c9e18f5827695a7b36e4fbb6..4c41e64438be82566417209028177a56ac26950a 100644 --- a/compute/pzgemm.c +++ b/compute/pzgemm.c @@ -30,6 +30,148 @@ #define WA(m, n) WA, m, n #define WB(m, n) WB, m, n +/** + * Parallel tile matrix-matrix multiplication. + * Generic algorithm for any data distribution. + */ +static inline void +chameleon_pzgemm_Astat( CHAM_context_t *chamctxt, cham_trans_t transA, cham_trans_t transB, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + RUNTIME_option_t *options ) +{ + const CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0; + RUNTIME_sequence_t *sequence = options->sequence; + int m, n, k; + int tempmm, tempnn, tempkn, tempkm; + int myrank = RUNTIME_comm_rank( chamctxt ); + int reduceC[ C->mt * C->nt ]; + + /* Set C tiles to redux mode. */ + for (n = 0; n < C->nt; n++) { + for (m = 0; m < C->mt; m++) { + reduceC[ n * C->mt + m ] = 0; + + /* The node owns the C tile. */ + if ( C->get_rankof( C(m, n) ) == myrank ) { + reduceC[ n * C->mt + m ] = 1; + RUNTIME_zgersum_set_methods( C(m, n) ); + continue; + } + + /* + * The node owns the A tile that will define the locality of the + * computations. + */ + if ( transA == ChamNoTrans ) { + for (k = 0; k < A->nt; k++) { + if ( A->get_rankof( A(m, k) ) == myrank ) { + reduceC[ n * C->mt + m ] = 1; + RUNTIME_zgersum_set_methods( C(m, n) ); + break; + } + } + } + else { + for (k = 0; k < A->mt; k++) { + if ( A->get_rankof( A(k, m) ) == myrank ) { + reduceC[ n * C->mt + m ] = 1; + RUNTIME_zgersum_set_methods( C(m, n) ); + break; + } + } + } + } + } + + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + for (m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + + /* Scale C */ + options->forcesub = 0; + INSERT_TASK_zlascal( options, ChamUpperLower, tempmm, tempnn, C->mb, + beta, C, m, n ); + options->forcesub = reduceC[ n * C->mt + m ]; + + /* + * A: ChamNoTrans / B: ChamNoTrans + */ + if (transA == ChamNoTrans) { + if (transB == ChamNoTrans) { + for (k = 0; k < A->nt; k++) { + tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; + + INSERT_TASK_zgemm_Astat( + options, + transA, transB, + tempmm, tempnn, tempkn, A->mb, + alpha, A(m, k), /* lda * Z */ + B(k, n), /* ldb * Y */ + zone, C(m, n)); /* ldc * Y */ + } + } + /* + * A: ChamNoTrans / B: Cham[Conj]Trans + */ + else { + for (k = 0; k < A->nt; k++) { + tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; + + INSERT_TASK_zgemm_Astat( + options, + transA, transB, + tempmm, tempnn, tempkn, A->mb, + alpha, A(m, k), /* lda * Z */ + B(n, k), /* ldb * Z */ + zone, C(m, n)); /* ldc * Y */ + } + } + } + /* + * A: Cham[Conj]Trans / B: ChamNoTrans + */ + else { + if (transB == ChamNoTrans) { + for (k = 0; k < A->mt; k++) { + tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; + + INSERT_TASK_zgemm_Astat( + options, + transA, transB, + tempmm, tempnn, tempkm, A->mb, + alpha, A(k, m), /* lda * X */ + B(k, n), /* ldb * Y */ + zone, C(m, n)); /* ldc * Y */ + } + } + /* + * A: Cham[Conj]Trans / B: Cham[Conj]Trans + */ + else { + for (k = 0; k < A->mt; k++) { + tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; + + INSERT_TASK_zgemm_Astat( + options, + transA, transB, + tempmm, tempnn, tempkm, A->mb, + alpha, A(k, m), /* lda * X */ + B(n, k), /* ldb * Z */ + zone, C(m, n)); /* ldc * Y */ + } + } + } + RUNTIME_zgersum_submit_tree( options, C(m, n) ); + RUNTIME_data_flush( sequence, C(m, n) ); + } + } + options->forcesub = 0; + + (void)chamctxt; +} + /** * Parallel tile matrix-matrix multiplication * SUMMA algorithm for 2D block-cyclic data distribution. @@ -284,6 +426,7 @@ chameleon_pzgemm( struct chameleon_pzgemm_s *ws, { CHAM_context_t *chamctxt; RUNTIME_option_t options; + cham_gemm_t alg = (ws != NULL) ? ws->alg : ChamGemmAlgGeneric; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) { @@ -291,13 +434,21 @@ chameleon_pzgemm( struct chameleon_pzgemm_s *ws, } RUNTIME_options_init( &options, chamctxt, sequence, request ); - if ( ws && ws->summa ) - { + switch( alg ) { + case ChamGemmAlgAuto: + case ChamGemmAlgSummaB: /* Switch back to generic since it does not exist yet. */ + case ChamGemmAlgGeneric: + chameleon_pzgemm_generic( chamctxt, transA, transB, alpha, A, B, beta, C, &options ); + break; + + case ChamGemmAlgSummaC: chameleon_pzgemm_summa( chamctxt, transA, transB, alpha, A, B, beta, C, &(ws->WA), &(ws->WB), &options ); - } - else { - chameleon_pzgemm_generic( chamctxt, transA, transB, alpha, A, B, beta, C, &options ); + break; + + case ChamGemmAlgSummaA: + chameleon_pzgemm_Astat( chamctxt, transA, transB, alpha, A, B, beta, C, &options ); + break; } RUNTIME_options_finalize( &options, chamctxt ); diff --git a/compute/pzgepdf_qdwh.c b/compute/pzgepdf_qdwh.c index 48617938bb3c59e4c75716e5986d888cf3aedd1d..b50edf517487e469b41f59d3113408d631cea0f6 100644 --- a/compute/pzgepdf_qdwh.c +++ b/compute/pzgepdf_qdwh.c @@ -175,7 +175,7 @@ chameleon_pzgepdf_qdwh_init( const CHAM_desc_t *U, const CHAM_desc_t *H, /* * Allocate the data descriptors for the lookahead if needed */ - *gemm_ws = CHAMELEON_zgemm_WS_Alloc( ChamNoTrans, ChamNoTrans, NULL, NULL, U ); + *gemm_ws = CHAMELEON_zgemm_WS_Alloc( ChamNoTrans, ChamNoTrans, U, U, U ); return; } diff --git a/compute/zgemm.c b/compute/zgemm.c index e3eed82ec66a299b83c74bbdc274ad867950bc2f..325f27cd90dc3cbf8dee13f26020f613e98a501a 100644 --- a/compute/zgemm.c +++ b/compute/zgemm.c @@ -103,14 +103,77 @@ void *CHAMELEON_zgemm_WS_Alloc( cham_trans_t transA __attribute__((unused) } options = calloc( 1, sizeof(struct chameleon_pzgemm_s) ); - options->summa = 0; + options->alg = ChamGemmAlgAuto; + + /* + * If only one process, or if generic has been globally enforced, we switch + * to generic immediately. + */ + if ( ((C->p == 1) && (C->q == 1)) || + (chamctxt->generic_enabled == CHAMELEON_TRUE) ) + { + options->alg = ChamGemmAlgGeneric; + } - if ( ((C->p > 1) || (C->q > 1)) && - (C->get_rankof == chameleon_getrankof_2d) && - (chamctxt->generic_enabled != CHAMELEON_TRUE) ) + /* Look at environment variable is something enforces the variant. */ + if ( options->alg == ChamGemmAlgAuto ) + { + char *algostr = chameleon_getenv( "CHAMELEON_GEMM_ALGO" ); + + if ( algostr ) { + if ( strcasecmp( algostr, "summa_c" ) == 0 ) { + options->alg = ChamGemmAlgSummaC; + } + else if ( strcasecmp( algostr, "summa_a" ) == 0 ) { + options->alg = ChamGemmAlgSummaA; + } + else if ( strcasecmp( algostr, "summa_b" ) == 0 ) { + options->alg = ChamGemmAlgSummaB; + } + else if ( strcasecmp( algostr, "generic" ) == 0 ) { + options->alg = ChamGemmAlgGeneric; + } + else if ( strcasecmp( algostr, "auto" ) == 0 ) { + options->alg = ChamGemmAlgAuto; + } + else { + fprintf( stderr, "ERROR: CHAMELEON_GEMM_ALGO is not one of AUTO, SUMMA_A, SUMMA_B, SUMMA_C, GENERIC => Switch back to Automatic switch\n" ); + } + } + chameleon_cleanenv( algostr ); + } + + /* Perform automatic choice if not already enforced. */ + if ( options->alg == ChamGemmAlgAuto ) + { + double sizeA, sizeB, sizeC; + double ratio = 1.5; /* Arbitrary ratio to give more weight to writes wrt reads. */ + + /* Compute the average array per node for each matrix */ + sizeA = ((double)A->m * (double)A->n) / (double)(A->p * A->q); + sizeB = ((double)B->m * (double)B->n) / (double)(B->p * B->q); + sizeC = ((double)C->m * (double)C->n) / (double)(C->p * C->q) * ratio; + + if ( (sizeC > sizeA) && (sizeC > sizeB) ) { + options->alg = ChamGemmAlgSummaC; + } + else { + if ( sizeA > sizeB ) { + options->alg = ChamGemmAlgSummaA; + } + else { + options->alg = ChamGemmAlgSummaB; + } + } + } + + assert( options->alg != ChamGemmAlgAuto ); + + /* Now that we have decided which algorithm, let's allocate the required data structures. */ + if ( (options->alg == ChamGemmAlgSummaC ) && + (C->get_rankof == chameleon_getrankof_2d ) ) { int lookahead = chamctxt->lookahead; - options->summa = 1; chameleon_desc_init( &(options->WA), CHAMELEON_MAT_ALLOC_TILE, ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), @@ -150,7 +213,7 @@ void CHAMELEON_zgemm_WS_Free( void *user_ws ) { struct chameleon_pzgemm_s *ws = (struct chameleon_pzgemm_s*)user_ws; - if ( ws->summa ) { + if ( ws->alg == ChamGemmAlgSummaC ) { chameleon_desc_destroy( &(ws->WA) ); chameleon_desc_destroy( &(ws->WB) ); } diff --git a/control/compute_z.h b/control/compute_z.h index d7953e9045a3e967781e8985f2ed851f2a529eae..760127d31d5eab6d4339316c2888f36e6a535666 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -30,7 +30,7 @@ * @brief Data structure to handle the GEMM workspaces */ struct chameleon_pzgemm_s { - int summa; + cham_gemm_t alg; CHAM_desc_t WA; CHAM_desc_t WB; }; diff --git a/control/descriptor.h b/control/descriptor.h index 42669511a0afcdc6b9e4b98b7a43042e05d5d84d..47f0edbdfa270ad988609cc1baffbd542f5c4a93 100644 --- a/control/descriptor.h +++ b/control/descriptor.h @@ -233,9 +233,9 @@ inline static int chameleon_desc_islocal( const CHAM_desc_t *A, int m, int n ) * CHAMELEON_ACCESS_RW(C, Cm, Cn) * CHAMELEON_END_ACCESS_DECLARATION */ -#define CHAMELEON_BEGIN_ACCESS_DECLARATION { \ - unsigned __chameleon_need_exec = 0; \ - unsigned __chameleon_need_submit = 0; \ +#define CHAMELEON_BEGIN_ACCESS_DECLARATION { \ + unsigned __chameleon_need_exec = 0; \ + unsigned __chameleon_need_submit = options->forcesub; \ RUNTIME_BEGIN_ACCESS_DECLARATION #define CHAMELEON_ACCESS_R(A, Am, An) do { \ diff --git a/doc/user/chapters/using.org b/doc/user/chapters/using.org index 721b99ad524b3b9443cc3b323c9ce237776b1f89..d104237740f931bdf2695fd9e78f68f5f2f9f0a3 100644 --- a/doc/user/chapters/using.org +++ b/doc/user/chapters/using.org @@ -914,6 +914,17 @@ int CHAMELEON_Get (CHAMELEON_enum param, int *value); #+end_src + * Alternatively, Chameleon can also be configured through environment variables. + * *CHAMELEON_GEMM_ALGO* give the possibility to switch among + multiple variants of the GEMM algorithms. These variants are + *GENERIC* for the generic variant that should work with any + configuration; *SUMMA_C* that works for 2D block cyclic + distribution of the matrices A, B, and C with a C stationnary + version; *SUMMA_A* and *SUMMA_B* are SUMMA variant of the + algorithm that works for any distribution with respectively + *A*, or *B that are stationnary. Note that the last two + variants are only available with the StarPU runtime backend. + **** Auxiliary routines Reports CHAMELEON version number. diff --git a/include/chameleon/constants.h b/include/chameleon/constants.h index d311bf63703e3de4ee27d2bfa15c9127a80db281..de26616f1df0e5f0c07db2e1d03550db1bd78e80 100644 --- a/include/chameleon/constants.h +++ b/include/chameleon/constants.h @@ -181,6 +181,16 @@ typedef enum chameleon_store_e { ChamEltwise = 403, /**< Element by element storage */ } cham_store_t; +/** + * @brief Chameleon GEMM-like algorithms + */ +typedef enum chameleon_gemm_e { + ChamGemmAlgAuto = -1, + ChamGemmAlgGeneric, + ChamGemmAlgSummaA, + ChamGemmAlgSummaB, + ChamGemmAlgSummaC +} cham_gemm_t; #define ChameleonTrd 1001 #define ChameleonBrd 1002 diff --git a/include/chameleon/runtime_struct.h b/include/chameleon/runtime_struct.h index d1b0d9b4917c70d8b5fbdab4ac37c23dbc078598..30ec9a1a014a8d8f71debe4dc8065d10ee0ee5b5 100644 --- a/include/chameleon/runtime_struct.h +++ b/include/chameleon/runtime_struct.h @@ -84,6 +84,7 @@ typedef struct runtime_option_s { int parallel; /**< Enable/Disable the parallel version of submitted tasks */ int priority; /**< Define the submitted task priority */ int workerid; /**< Define the prefered worker id to perform the tasks */ + int forcesub; /**< Force task submission if true */ size_t ws_wsize; /**< Define the worker workspace size */ size_t ws_hsize; /**< Define the host workspace size for hybrid CPU/GPU kernel */ void *ws_worker; /**< Pointer to the worker workspace (structure) */ diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index 9a26496f199568818f63751ef98a3c58d46b725f..ae307f760eb78d5ec4b3fec9a41db694211382be 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -65,6 +65,12 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn, CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ); +void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ); void INSERT_TASK_zgeqrt( const RUNTIME_option_t *options, int m, int n, int ib, int nb, const CHAM_desc_t *A, int Am, int An, @@ -452,4 +458,8 @@ void INSERT_TASK_zgram( const RUNTIME_option_t *options, const CHAM_desc_t *D, int Dm, int Dn, CHAM_desc_t *A, int Am, int An); +void RUNTIME_zgersum_set_methods( const CHAM_desc_t *A, int Am, int An ); +void RUNTIME_zgersum_submit_tree( const RUNTIME_option_t *options, + const CHAM_desc_t *A, int Am, int An ); + #endif /* _chameleon_tasks_z_h_ */ diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index bda8ef2732cdf6a7cb8b9a12128bb9175e9f584e..dec860e8e112b70e2e40798ebc4feda3e16ab46c 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -107,6 +107,10 @@ set(CODELETS_ZSRC codelets/codelet_zgesum.c codelets/codelet_zcesca.c codelets/codelet_zgram.c + ################## + # Reduction methods + ################## + codelets/codelet_zgersum.c ) set(CODELETS_SRC diff --git a/runtime/openmp/codelets/codelet_zgemm.c b/runtime/openmp/codelets/codelet_zgemm.c index b94aff972b1a2168b243445e1a2c527ee3603a4e..43b75d728a73cd2e8e905fa23e35230894acbe24 100644 --- a/runtime/openmp/codelets/codelet_zgemm.c +++ b/runtime/openmp/codelets/codelet_zgemm.c @@ -42,3 +42,16 @@ INSERT_TASK_zgemm( const RUNTIME_option_t *options, (void)options; (void)nb; } + +void +INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + INSERT_TASK_zgemm( options, transA, transB, m, n, k, nb, + alpha, A, Am, An, B, Bm, Bn, + beta, C, Cm, Cn ); +} diff --git a/runtime/openmp/codelets/codelet_zgersum.c b/runtime/openmp/codelets/codelet_zgersum.c new file mode 100644 index 0000000000000000000000000000000000000000..b320707c993a1521ef76d1219ae60c86db96113e --- /dev/null +++ b/runtime/openmp/codelets/codelet_zgersum.c @@ -0,0 +1,41 @@ +/** + * + * @file starpu/codelet_zgersum.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2022 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgersum OpenMP codelet + * + * @version 1.2.0 + * @author Romain Peressoni + * @author Mathieu Faverge + * @date 2022-02-22 + * @precisions normal z -> c d s + * + */ +#include "chameleon_openmp.h" + +void +RUNTIME_zgersum_set_methods( const CHAM_desc_t *A, int Am, int An ) +{ + fprintf( stderr, "WARNING: Reductions are not available with OpenMP\n" ); + + (void)A; + (void)Am; + (void)An; +} + +void +RUNTIME_zgersum_submit_tree( const RUNTIME_option_t *options, + const CHAM_desc_t *A, int Am, int An ) +{ + (void)options; + (void)A; + (void)Am; + (void)An; +} diff --git a/runtime/openmp/control/runtime_options.c b/runtime/openmp/control/runtime_options.c index e626e9fa5d419dd3f79630575e1d4b45fbcf70cb..541e7cb2fcff50d45999d728cdf4c423b7b82c9f 100644 --- a/runtime/openmp/control/runtime_options.c +++ b/runtime/openmp/control/runtime_options.c @@ -30,6 +30,7 @@ void RUNTIME_options_init( RUNTIME_option_t *options, CHAM_context_t *chamctxt, options->parallel = CHAMELEON_PARALLEL == CHAMELEON_TRUE; options->priority = RUNTIME_PRIORITY_MIN; options->workerid = -1; + options->forcesub = 0; options->ws_wsize = 0; options->ws_hsize = 0; options->ws_worker = NULL; diff --git a/runtime/parsec/codelets/codelet_zgemm.c b/runtime/parsec/codelets/codelet_zgemm.c index 303aa03ba02378093a7cedc5d02c5ea00eee9efe..5a0938ed97e3dc116c18007b55897ddad0253c75 100644 --- a/runtime/parsec/codelets/codelet_zgemm.c +++ b/runtime/parsec/codelets/codelet_zgemm.c @@ -89,3 +89,16 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options, (void)nb; } + +void +INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + INSERT_TASK_zgemm( options, transA, transB, m, n, k, nb, + alpha, A, Am, An, B, Bm, Bn, + beta, C, Cm, Cn ); +} diff --git a/runtime/parsec/codelets/codelet_zgersum.c b/runtime/parsec/codelets/codelet_zgersum.c new file mode 100644 index 0000000000000000000000000000000000000000..fb460724d0ab51b55f1ea5833af0dd7dba5d9912 --- /dev/null +++ b/runtime/parsec/codelets/codelet_zgersum.c @@ -0,0 +1,41 @@ +/** + * + * @file starpu/codelet_zgersum.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2022 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgersum Parsec codelet + * + * @version 1.2.0 + * @author Romain Peressoni + * @author Mathieu Faverge + * @date 2022-02-22 + * @precisions normal z -> c d s + * + */ +#include "chameleon_parsec.h" + +void +RUNTIME_zgersum_set_methods( const CHAM_desc_t *A, int Am, int An ) +{ + fprintf( stderr, "WARNING: Reductions are not available with Parsec\n" ); + + (void)A; + (void)Am; + (void)An; +} + +void +RUNTIME_zgersum_submit_tree( const RUNTIME_option_t *options, + const CHAM_desc_t *A, int Am, int An ) +{ + (void)options; + (void)A; + (void)Am; + (void)An; +} diff --git a/runtime/parsec/control/runtime_options.c b/runtime/parsec/control/runtime_options.c index 0fe1498b236ccfad82d75898992094b0cb1c785a..1a4e9e83a2ffef663e6eff6e61200bfb5af5110c 100644 --- a/runtime/parsec/control/runtime_options.c +++ b/runtime/parsec/control/runtime_options.c @@ -27,7 +27,8 @@ void RUNTIME_options_init( RUNTIME_option_t *options, CHAM_context_t *chamctxt, options->profiling = CHAMELEON_STATISTICS == CHAMELEON_TRUE; options->parallel = CHAMELEON_PARALLEL == CHAMELEON_TRUE; options->priority = RUNTIME_PRIORITY_MIN; - options->workerid = -1; + options->workerid = -1; + options->forcesub = 0; options->ws_wsize = 0; options->ws_hsize = 0; options->ws_worker = NULL; diff --git a/runtime/quark/codelets/codelet_zgemm.c b/runtime/quark/codelets/codelet_zgemm.c index 487fe27cc6a57fe548ed2a0b81cdddc26b3a517d..d854c638a2f01e68018ca0973e67ce8a4d0604a6 100644 --- a/runtime/quark/codelets/codelet_zgemm.c +++ b/runtime/quark/codelets/codelet_zgemm.c @@ -75,3 +75,16 @@ void INSERT_TASK_zgemm(const RUNTIME_option_t *options, sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), accessC, 0); } + +void +INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + INSERT_TASK_zgemm( options, transA, transB, m, n, k, nb, + alpha, A, Am, An, B, Bm, Bn, + beta, C, Cm, Cn ); +} diff --git a/runtime/quark/codelets/codelet_zgersum.c b/runtime/quark/codelets/codelet_zgersum.c new file mode 100644 index 0000000000000000000000000000000000000000..e6accf1cf76c9b6af4e1ec918f09d8ac62e0dca5 --- /dev/null +++ b/runtime/quark/codelets/codelet_zgersum.c @@ -0,0 +1,41 @@ +/** + * + * @file starpu/codelet_zgersum.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2022 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgersum Quark codelet + * + * @version 1.2.0 + * @author Romain Peressoni + * @author Mathieu Faverge + * @date 2022-02-22 + * @precisions normal z -> c d s + * + */ +#include "chameleon_quark.h" + +void +RUNTIME_zgersum_set_methods( const CHAM_desc_t *A, int Am, int An ) +{ + fprintf( stderr, "WARNING: Reductions are not available with Quark\n" ); + + (void)A; + (void)Am; + (void)An; +} + +void +RUNTIME_zgersum_submit_tree( const RUNTIME_option_t *options, + const CHAM_desc_t *A, int Am, int An ) +{ + (void)options; + (void)A; + (void)Am; + (void)An; +} diff --git a/runtime/quark/control/runtime_options.c b/runtime/quark/control/runtime_options.c index 4723d28e2810327016c754117ca12a9609a912e9..e0f58ee5d62fb2cbde7f8d4fb9e31f964726107b 100644 --- a/runtime/quark/control/runtime_options.c +++ b/runtime/quark/control/runtime_options.c @@ -40,6 +40,7 @@ void RUNTIME_options_init( RUNTIME_option_t *options, CHAM_context_t *chamctxt, options->parallel = CHAMELEON_PARALLEL == CHAMELEON_TRUE; options->priority = RUNTIME_PRIORITY_MIN; options->workerid = -1; + options->forcesub = 0; options->ws_wsize = 0; options->ws_hsize = 0; diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt index 3fb2514a9a98c7b08f37d700e87dde37170d73e2..cf460a8d1d0bb531b4b170975196ee5331589ac0 100644 --- a/runtime/starpu/CMakeLists.txt +++ b/runtime/starpu/CMakeLists.txt @@ -129,6 +129,18 @@ if ( STARPU_FOUND ) if ( HAVE_STARPU_MPI_COMM_GET_ATTR ) message("-- ${Blue}Add definition HAVE_STARPU_MPI_COMM_GET_ATTR${ColourReset}") endif() + check_c_source_compiles(" +#include <starpu.h> +int main(void) { + enum starpu_data_access_mode access = STARPU_MPI_REDUX; + return 0; +} +" + HAVE_STARPU_MPI_REDUX + ) + if ( HAVE_STARPU_MPI_REDUX ) + message("-- ${Blue}Add definition HAVE_STARPU_MPI_REDUX${ColourReset}") + endif() endif() if (CHAMELEON_USE_CUDA AND NOT CHAMELEON_SIMULATION) diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c index 912b44f1391cebe46b3434bff9df80cdb9c799fa..bf72c91c2cd697268438ac9641f977be4b77cd51 100644 --- a/runtime/starpu/codelets/codelet_zgemm.c +++ b/runtime/starpu/codelets/codelet_zgemm.c @@ -98,6 +98,96 @@ cl_zgemm_cuda_func( void *descr[], void *cl_arg ) */ CODELETS( zgemm, cl_zgemm_cpu_func, cl_zgemm_cuda_func, STARPU_CUDA_ASYNC ) +void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + if ( alpha == 0. ) { + return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, + beta, C, Cm, Cn ); + } + + struct cl_zgemm_args_s *clargs = NULL; + void (*callback)(void*); + int accessC; + int exec = 0; + char *cl_name = "zgemm_Astat"; + + /* Handle cache */ + CHAMELEON_BEGIN_ACCESS_DECLARATION; + /* Check A as write, since it will be the owner of the computation */ + CHAMELEON_ACCESS_W(A, Am, An); + CHAMELEON_ACCESS_R(B, Bm, Bn); + /* Check C as read, since it will be used in a reduction */ + CHAMELEON_ACCESS_R(C, Cm, Cn); + exec = __chameleon_need_exec; + CHAMELEON_END_ACCESS_DECLARATION; + + if ( exec ) { + clargs = malloc( sizeof( struct cl_zgemm_args_s ) ); + clargs->transA = transA; + clargs->transB = transB; + clargs->m = m; + clargs->n = n; + clargs->k = k; + clargs->alpha = alpha; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->tileB = B->get_blktile( B, Bm, Bn ); + clargs->beta = beta; + clargs->tileC = C->get_blktile( C, Cm, Cn ); + } + + /* Callback for profiling information */ + callback = options->profiling ? cl_zgemm_callback : NULL; + + /* Reduce the C access if needed */ + if ( beta == 0. ) { + accessC = STARPU_W; + } +#if defined(HAVE_STARPU_MPI_REDUX) + else if ( beta == 1. ) { + accessC = STARPU_MPI_REDUX; + } +#endif + else { + accessC = STARPU_RW; + } + +#if defined(CHAMELEON_KERNELS_TRACE) + { + char *cl_fullname; + chameleon_asprintf( &cl_fullname, "%s( %s, %s, %s )", cl_name, clargs->tileA->name, clargs->tileB->name, clargs->tileC->name ); + cl_name = cl_fullname; + } +#endif + + /* Insert the task */ + rt_starpu_insert_task( + &cl_zgemm, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_zgemm_args_s), + + /* Task handles */ + STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_NODE, A->get_rankof(A, Am, An), +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, cl_name, +#endif + + 0 ); + + (void)nb; +} + void INSERT_TASK_zgemm( const RUNTIME_option_t *options, cham_trans_t transA, cham_trans_t transB, int m, int n, int k, int nb, diff --git a/runtime/starpu/codelets/codelet_zgersum.c b/runtime/starpu/codelets/codelet_zgersum.c new file mode 100644 index 0000000000000000000000000000000000000000..26327c982f997473de0e17885a8381a258ecaeb6 --- /dev/null +++ b/runtime/starpu/codelets/codelet_zgersum.c @@ -0,0 +1,129 @@ +/** + * + * @file starpu/codelet_zgersum.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2022 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgersum StarPU codelet + * + * @version 1.2.0 + * @author Romain Peressoni + * @author Mathieu Faverge + * @date 2022-02-22 + * @precisions normal z -> c d s + * + */ +#include "chameleon_starpu.h" +#include "runtime_codelet_z.h" + +#if !defined(CHAMELEON_SIMULATION) +static void cl_zgersum_redux_cpu_func(void *descr[], void *cl_arg) +{ + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + + assert( tileA->m == tileB->m ); + assert( tileA->n == tileB->n ); + + TCORE_zgeadd( ChamNoTrans, tileA->m, tileA->n, 1., tileB, 1., tileA ); + + return; +} + +#ifdef CHAMELEON_USE_CUBLAS +static void cl_zgersum_redux_cuda_func(void *descr[], void *cl_arg) +{ + cublasHandle_t handle = starpu_cublas_get_local_handle(); + CHAMELEON_Complex64_t zone = 1.; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + + assert( tileA->m == tileB->m ); + assert( tileA->n == tileB->n ); + + CUDA_zgeadd( ChamNoTrans, tileA->m, tileA->n, + &zone, tileB->mat, tileB->ld, + &zone, tileA->mat, tileA->ld, + handle ); + + return; +} +#endif /* defined(CHAMELEON_USE_CUBLAS) */ +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +#if defined(CHAMELEON_USE_CUBLAS) +CODELETS(zgersum_redux, cl_zgersum_redux_cpu_func, cl_zgersum_redux_cuda_func, STARPU_CUDA_ASYNC) +#else +CODELETS_CPU(zgersum_redux, cl_zgersum_redux_cpu_func) +#endif + +#if !defined(CHAMELEON_SIMULATION) +static void +cl_zgersum_init_cpu_func( void *descr[], void *cl_arg ) +{ + CHAM_tile_t *tileA; + + tileA = cti_interface_get(descr[0]); + + TCORE_zlaset( ChamUpperLower, tileA->m, tileA->n, 0., 0., tileA ); + + (void)cl_arg; +} +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +CODELETS_CPU( zgersum_init, cl_zgersum_init_cpu_func ); + +void +RUNTIME_zgersum_set_methods( const CHAM_desc_t *A, int Am, int An ) +{ +#if defined(HAVE_STARPU_MPI_REDUX) + starpu_data_set_reduction_methods( RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + &cl_zgersum_redux, + &cl_zgersum_init ); +#endif +} + +void +RUNTIME_zgersum_submit_tree( const RUNTIME_option_t *options, + const CHAM_desc_t *A, int Am, int An ) +{ +#if defined(HAVE_STARPU_MPI_REDUX) && defined(CHAMELEON_USE_MPI) + starpu_mpi_redux_data_prio_tree( MPI_COMM_WORLD, + RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + options->priority + 1, + 2 /* Arbre binaire */ ); +#else + (void)options; + (void)A; + (void)Am; + (void)An; +#endif +} + +void RUNTIME_zgersum_init( void ) __attribute__( ( constructor ) ); +void RUNTIME_zgersum_init( void ) +{ + cl_zgersum_init.nbuffers = 1; + cl_zgersum_init.modes[0] = STARPU_W; + + cl_zgersum_redux.nbuffers = 2; + cl_zgersum_redux.modes[0] = STARPU_RW | STARPU_COMMUTE; + cl_zgersum_redux.modes[1] = STARPU_R; +} diff --git a/runtime/starpu/control/runtime_options.c b/runtime/starpu/control/runtime_options.c index 0fa917acc326e69c03609d750ae15cd0675bf9ec..373a5c3bd395390d63bf5b2ffb7b08aea8d35a17 100644 --- a/runtime/starpu/control/runtime_options.c +++ b/runtime/starpu/control/runtime_options.c @@ -31,6 +31,7 @@ void RUNTIME_options_init( RUNTIME_option_t *options, CHAM_context_t *chamctxt, options->parallel = CHAMELEON_PARALLEL == CHAMELEON_TRUE; options->priority = RUNTIME_PRIORITY_MIN; options->workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + options->forcesub = 0; options->ws_wsize = 0; options->ws_hsize = 0; options->ws_worker = NULL; diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in index c0c71d25965e13c2f98bbf8d50624b5488058af9..9b7d355b9e66dbc1dde3458ea351412177e00f67 100644 --- a/runtime/starpu/include/chameleon_starpu.h.in +++ b/runtime/starpu/include/chameleon_starpu.h.in @@ -45,6 +45,7 @@ #cmakedefine HAVE_STARPU_MPI_WAIT_FOR_ALL #cmakedefine HAVE_STARPU_MPI_INTERFACE_DATATYPE_NODE_REGISTER #cmakedefine HAVE_STARPU_MPI_INTERFACE_DATATYPE_REGISTER +#cmakedefine HAVE_STARPU_MPI_REDUX #if (!defined(HAVE_STARPU_MPI_INTERFACE_DATATYPE_NODE_REGISTER) && !defined(HAVE_STARPU_MPI_INTERFACE_DATATYPE_REGISTER)) && defined(CHAMELEON_USE_MPI_DATATYPES) #error "This version of StarPU does not support MPI datatypes (Please compile with -DCHAMELEON_USE_MPI_DATATYPES=OFF)"