diff --git a/include/chameleon/tasks.h b/include/chameleon/tasks.h index 80efe7dd913952d56261b1454e9cb8b850d99c63..b17e6bc32a8bd6b200e0122e4ac2131c12828cc4 100644 --- a/include/chameleon/tasks.h +++ b/include/chameleon/tasks.h @@ -16,7 +16,7 @@ * @author Cedric Augonnet * @author Florent Pruvost * @author Matthieu Kuhn - * @date 2023-08-31 + * @date 2024-03-11 * */ #ifndef _chameleon_tasks_h_ @@ -97,9 +97,49 @@ typedef int (*cham_unary_operator_t)( const CHAM_desc_t *desc, cham_uplo_t uplo, int m, int n, CHAM_tile_t *data, void *op_args ); +typedef int (*cham_map_cpu_fct_t)( void *args, cham_uplo_t uplo, int m, int n, int ndata, + const CHAM_desc_t *desc, CHAM_tile_t *tile, ... ); + +#if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION) +#include "gpucublas.h" +typedef int (*cham_map_cuda_fct_t)( cublasHandle_t handle, void *args, + cham_uplo_t uplo, int m, int n, int ndata, + const CHAM_desc_t *desc, CHAM_tile_t *tile, ... ); +#else +typedef void *cham_map_cuda_fct_t; +#endif + +#if defined(CHAMELEON_USE_HIP) && !defined(CHAMELEON_SIMULATION) +#include "gpuhipblas.h" +typedef int (*cham_map_hip_fct_t)( hipblasHandle_t handle, void *args, + cham_uplo_t uplo, int m, int n, int ndata, + const CHAM_desc_t *desc, CHAM_tile_t *tile, ... ); +#else +typedef void *cham_map_hip_fct_t; +#endif + +/** + * @brief Structure to store the operator functions on any architecture + */ +typedef struct cham_map_operator_s { + const char *name; /**< Name of the operator to be used in debug/tracing mode */ + cham_map_cpu_fct_t cpufunc; /**< Pointer to the CPU function of the operator */ + cham_map_cuda_fct_t cudafunc; /**< Pointer to the CUDA/cuBLAS function of the operator */ + cham_map_hip_fct_t hipfunc; /**< Pointer to the HIP function of the operator */ +} cham_map_operator_t; + +/** + * @brief Structure to store the data information in the map operation + */ +typedef struct cham_map_data_s { + cham_access_t access; /**< Access type to the descriptor. Must be one of ChamR, ChamW, ChamRW. */ + const CHAM_desc_t *desc; /**< Descriptor in which the data is taken to apply the map operation. */ +} cham_map_data_t; + void INSERT_TASK_map( const RUNTIME_option_t *options, - cham_access_t accessA, cham_uplo_t uplo, const CHAM_desc_t *A, int Am, int An, - cham_unary_operator_t op_fct, void *op_args ); + cham_uplo_t uplo, int m, int n, + int ndata, cham_map_data_t *data, + cham_map_operator_t *op_fcts, void *op_args ); void INSERT_TASK_gemm( const RUNTIME_option_t *options, cham_trans_t transA, cham_trans_t transB, diff --git a/runtime/openmp/codelets/codelet_map.c b/runtime/openmp/codelets/codelet_map.c index bea91ea906a57110d9c0dc262ccad74d813845d8..8bbe7742b10224f1428c8d76edcc040af9676d3b 100644 --- a/runtime/openmp/codelets/codelet_map.c +++ b/runtime/openmp/codelets/codelet_map.c @@ -9,43 +9,1025 @@ * * @brief Chameleon map OpenMP codelet * - * @version 1.2.0 - * @author Philippe Virouleau + * @version 1.3.0 * @author Mathieu Faverge - * @date 2022-02-22 + * @date 2024-03-11 * */ #include "chameleon_openmp.h" -void INSERT_TASK_map( const RUNTIME_option_t *options, - cham_access_t accessA, cham_uplo_t uplo, const CHAM_desc_t *A, int Am, int An, - cham_unary_operator_t op_fct, void *op_args ) +typedef void (*omp_map_fct_t)( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ); + +/* + * @brief Open map function type with one tile + */ +static inline void omp_map_one_R( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + + assert( ndata == 1 ); + assert( A->access == ChamR ); + +#pragma omp task depend( in:tileA[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 1, descA, tileA ); + } +} + +static inline void omp_map_one_W( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + + assert( ndata == 1 ); + assert( A->access == ChamW ); + +#pragma omp task depend( out:tileA[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 1, descA, tileA ); + } +} + +static inline void omp_map_one_RW( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + + assert( ndata == 1 ); + assert( A->access == ChamRW ); + +#pragma omp task depend( in:tileA[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 1, descA, tileA ); + } +} + +/* + * @brief Open map function type with two tiles + */ +static inline void omp_map_two_R_R( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + + assert( ndata == 2 ); + assert( A->access == ChamR ); + assert( B->access == ChamR ); + +#pragma omp task depend( in:tileA[0] ) depend( in:tileB[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 2, descA, tileA, descB, tileB ); + } +} + +static inline void omp_map_two_R_W( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + + assert( ndata == 2 ); + assert( A->access == ChamR ); + assert( B->access == ChamW ); + +#pragma omp task depend( in:tileA[0] ) depend( out:tileB[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 2, descA, tileA, descB, tileB ); + } +} + +static inline void omp_map_two_R_RW( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + + assert( ndata == 2 ); + assert( A->access == ChamR ); + assert( B->access == ChamRW ); + +#pragma omp task depend( in:tileA[0] ) depend( inout:tileB[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 2, descA, tileA, descB, tileB ); + } +} + +static inline void omp_map_two_W_R( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + + assert( ndata == 2 ); + assert( A->access == ChamW ); + assert( B->access == ChamR ); + +#pragma omp task depend( out:tileA[0] ) depend( in:tileB[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 2, descA, tileA, descB, tileB ); + } +} + +static inline void omp_map_two_W_W( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + + assert( ndata == 2 ); + assert( A->access == ChamW ); + assert( B->access == ChamW ); + +#pragma omp task depend( out:tileA[0] ) depend( out:tileB[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 2, descA, tileA, descB, tileB ); + } +} + +static inline void omp_map_two_W_RW( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + + assert( ndata == 2 ); + assert( A->access == ChamW ); + assert( B->access == ChamRW ); + +#pragma omp task depend( out:tileA[0] ) depend( inout:tileB[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 2, descA, tileA, descB, tileB ); + } +} + +static inline void omp_map_two_RW_R( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + + assert( ndata == 2 ); + assert( A->access == ChamRW ); + assert( B->access == ChamR ); + +#pragma omp task depend( inout:tileA[0] ) depend( in:tileB[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 2, descA, tileA, descB, tileB ); + } +} + +static inline void omp_map_two_RW_W( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + + assert( ndata == 2 ); + assert( A->access == ChamRW ); + assert( B->access == ChamW ); + +#pragma omp task depend( inout:tileA[0] ) depend( out:tileB[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 2, descA, tileA, descB, tileB ); + } +} + +static inline void omp_map_two_RW_RW( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + + assert( ndata == 2 ); + assert( A->access == ChamRW ); + assert( B->access == ChamRW ); + +#pragma omp task depend( inout:tileA[0] ) depend( inout:tileB[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 2, descA, tileA, descB, tileB ); + } +} + +/* + * @brief Open map function type with three tiles + */ +static inline void omp_map_three_R_R_R( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamR ); + assert( B->access == ChamR ); + assert( C->access == ChamR ); + +#pragma omp task depend( in:tileA[0] ) depend( in:tileB[0] ) depend( in:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static inline void omp_map_three_R_R_W( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamR ); + assert( B->access == ChamW ); + +#pragma omp task depend( in:tileA[0] ) depend( in:tileB[0] ) depend( out:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static inline void omp_map_three_R_R_RW( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamR ); + assert( B->access == ChamR ); + assert( C->access == ChamRW ); + +#pragma omp task depend( in:tileA[0] ) depend( in:tileB[0] ) depend( inout:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static inline void omp_map_three_R_W_R( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamR ); + assert( B->access == ChamW ); + assert( C->access == ChamR ); + +#pragma omp task depend( in:tileA[0] ) depend( out:tileB[0] ) depend( in:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static inline void omp_map_three_R_W_W( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamW ); + assert( B->access == ChamW ); + assert( C->access == ChamR ); + +#pragma omp task depend( in:tileA[0] ) depend( out:tileB[0] ) depend( out:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static inline void omp_map_three_R_W_RW( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamR ); + assert( B->access == ChamW ); + assert( C->access == ChamRW ); + +#pragma omp task depend( in:tileA[0] ) depend( out:tileB[0] ) depend( inout:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static inline void omp_map_three_R_RW_R( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamR ); + assert( B->access == ChamRW ); + assert( C->access == ChamR ); + +#pragma omp task depend( in:tileA[0] ) depend( inout:tileB[0] ) depend( in:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static inline void omp_map_three_R_RW_W( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) { - CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); - switch( accessA ) { - case ChamW: -#pragma omp task depend( out: tileA[0] ) + assert( ndata == 3 ); + assert( A->access == ChamR ); + assert( B->access == ChamRW ); + assert( C->access == ChamW ); + +#pragma omp task depend( in:tileA[0] ) depend( inout:tileB[0] ) depend( out:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static inline void omp_map_three_R_RW_RW( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamR ); + assert( B->access == ChamRW ); + assert( C->access == ChamRW ); + +#pragma omp task depend( in:tileA[0] ) depend( inout:tileB[0] ) depend( inout:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static inline void omp_map_three_W_R_R( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamW ); + assert( B->access == ChamR ); + assert( C->access == ChamR ); + +#pragma omp task depend( out:tileA[0] ) depend( in:tileB[0] ) depend( in:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static inline void omp_map_three_W_R_W( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamW ); + assert( B->access == ChamW ); + +#pragma omp task depend( out:tileA[0] ) depend( in:tileB[0] ) depend( out:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static inline void omp_map_three_W_R_RW( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamW ); + assert( B->access == ChamR ); + assert( C->access == ChamRW ); + +#pragma omp task depend( out:tileA[0] ) depend( in:tileB[0] ) depend( inout:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static inline void omp_map_three_W_W_R( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamW ); + assert( B->access == ChamW ); + assert( C->access == ChamR ); + +#pragma omp task depend( out:tileA[0] ) depend( out:tileB[0] ) depend( in:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static inline void omp_map_three_W_W_W( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamW ); + assert( B->access == ChamW ); + assert( C->access == ChamR ); + +#pragma omp task depend( out:tileA[0] ) depend( out:tileB[0] ) depend( out:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static inline void omp_map_three_W_W_RW( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamW ); + assert( B->access == ChamW ); + assert( C->access == ChamRW ); + +#pragma omp task depend( out:tileA[0] ) depend( out:tileB[0] ) depend( inout:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static inline void omp_map_three_W_RW_R( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamW ); + assert( B->access == ChamRW ); + assert( C->access == ChamR ); + +#pragma omp task depend( out:tileA[0] ) depend( inout:tileB[0] ) depend( in:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static inline void omp_map_three_W_RW_W( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamW ); + assert( B->access == ChamRW ); + assert( C->access == ChamW ); + +#pragma omp task depend( out:tileA[0] ) depend( inout:tileB[0] ) depend( out:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static inline void omp_map_three_W_RW_RW( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamW ); + assert( B->access == ChamRW ); + assert( C->access == ChamRW ); + +#pragma omp task depend( out:tileA[0] ) depend( inout:tileB[0] ) depend( inout:tileC[0] ) { - op_fct( A, uplo, Am, An, tileA, op_args ); + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); } - break; +} + +static inline void omp_map_three_RW_R_R( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); - case ChamR: -#pragma omp task depend( in: tileA[0] ) + assert( ndata == 3 ); + assert( A->access == ChamRW ); + assert( B->access == ChamR ); + assert( C->access == ChamR ); + +#pragma omp task depend( inout:tileA[0] ) depend( in:tileB[0] ) depend( in:tileC[0] ) { - op_fct( A, uplo, Am, An, tileA, op_args ); + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); } +} + +static inline void omp_map_three_RW_R_W( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); - break; + assert( ndata == 3 ); + assert( A->access == ChamRW ); + assert( B->access == ChamW ); - case ChamRW: - default: -#pragma omp task depend( inout: tileA[0] ) +#pragma omp task depend( inout:tileA[0] ) depend( in:tileB[0] ) depend( out:tileC[0] ) { - op_fct( A, uplo, Am, An, tileA, op_args ); + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); } +} + +static inline void omp_map_three_RW_R_RW( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamRW ); + assert( B->access == ChamR ); + assert( C->access == ChamRW ); + +#pragma omp task depend( inout:tileA[0] ) depend( in:tileB[0] ) depend( inout:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); } +} + +static inline void omp_map_three_RW_W_R( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); - (void)options; + assert( ndata == 3 ); + assert( A->access == ChamRW ); + assert( B->access == ChamW ); + assert( C->access == ChamR ); + +#pragma omp task depend( inout:tileA[0] ) depend( out:tileB[0] ) depend( in:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static inline void omp_map_three_RW_W_W( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamRW ); + assert( B->access == ChamW ); + assert( C->access == ChamR ); + +#pragma omp task depend( inout:tileA[0] ) depend( out:tileB[0] ) depend( out:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static inline void omp_map_three_RW_W_RW( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamRW ); + assert( B->access == ChamW ); + assert( C->access == ChamRW ); + +#pragma omp task depend( inout:tileA[0] ) depend( out:tileB[0] ) depend( inout:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static inline void omp_map_three_RW_RW_R( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamRW ); + assert( B->access == ChamRW ); + assert( C->access == ChamR ); + +#pragma omp task depend( inout:tileA[0] ) depend( inout:tileB[0] ) depend( in:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static inline void omp_map_three_RW_RW_W( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamRW ); + assert( B->access == ChamRW ); + assert( C->access == ChamW ); + +#pragma omp task depend( inout:tileA[0] ) depend( inout:tileB[0] ) depend( out:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static inline void omp_map_three_RW_RW_RW( cham_uplo_t uplo, int m, int n, int ndata, + cham_map_data_t *data, + cham_map_operator_t *op_fcts, + void *op_args ) +{ + cham_map_data_t *A = data; + const CHAM_desc_t *descA = A->desc; + CHAM_tile_t *tileA = descA->get_blktile( descA, m, n ); + cham_map_data_t *B = data + 1; + const CHAM_desc_t *descB = B->desc; + CHAM_tile_t *tileB = descB->get_blktile( descB, m, n ); + cham_map_data_t *C = data + 2; + const CHAM_desc_t *descC = C->desc; + CHAM_tile_t *tileC = descC->get_blktile( descC, m, n ); + + assert( ndata == 3 ); + assert( A->access == ChamRW ); + assert( B->access == ChamRW ); + assert( C->access == ChamRW ); + +#pragma omp task depend( inout:tileA[0] ) depend( inout:tileB[0] ) depend( inout:tileC[0] ) + { + op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC ); + } +} + +static omp_map_fct_t omp_map_one[3] = { + omp_map_one_R, omp_map_one_W, omp_map_one_RW +}; + +static omp_map_fct_t omp_map_two[3][3] = { + { omp_map_two_R_R, omp_map_two_R_W, omp_map_two_R_RW }, + { omp_map_two_W_R, omp_map_two_W_W, omp_map_two_W_RW }, + { omp_map_two_RW_R, omp_map_two_RW_W, omp_map_two_RW_RW } +}; + +static omp_map_fct_t omp_map_three[3][3][3] = { + { { omp_map_three_R_R_R, omp_map_three_R_R_W, omp_map_three_R_R_RW }, + { omp_map_three_R_W_R, omp_map_three_R_W_W, omp_map_three_R_W_RW }, + { omp_map_three_R_RW_R, omp_map_three_R_RW_W, omp_map_three_R_RW_RW } }, + { { omp_map_three_W_R_R, omp_map_three_W_R_W, omp_map_three_W_R_RW }, + { omp_map_three_W_W_R, omp_map_three_W_W_W, omp_map_three_W_W_RW }, + { omp_map_three_W_RW_R, omp_map_three_W_RW_W, omp_map_three_W_RW_RW } }, + { { omp_map_three_RW_R_R, omp_map_three_RW_R_W, omp_map_three_RW_R_RW }, + { omp_map_three_RW_W_R, omp_map_three_RW_W_W, omp_map_three_RW_W_RW }, + { omp_map_three_RW_RW_R, omp_map_three_RW_RW_W, omp_map_three_RW_RW_RW } } +}; + + +void INSERT_TASK_map( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, + int ndata, cham_map_data_t *data, + cham_map_operator_t *op_fcts, void *op_args ) +{ + if ( ( ndata < 0 ) || ( ndata > 3 ) ) { + fprintf( stderr, "INSERT_TASK_map() can handle only 1 to 3 parameters\n" ); + return; + } + + switch( ndata ) { + case 1: + omp_map_one[ data[0].access - 1 ]( + uplo, m, n, ndata, data, op_fcts, op_args ); + break; + + case 2: + omp_map_two[ data[0].access - 1 ][ data[1].access - 1 ]( + uplo, m, n, ndata, data, op_fcts, op_args ); + break; + + case 3: + omp_map_three[ data[0].access - 1 ][ data[1].access - 1 ][ data[2].access - 1 ]( + uplo, m, n, ndata, data, op_fcts, op_args ); + break; + } } diff --git a/runtime/parsec/codelets/codelet_map.c b/runtime/parsec/codelets/codelet_map.c index 8016f857e1b1307e0436057371bb08e76b670a6e..12a701bf379b41e5e9e42a61b4df3153c4b3cf60 100644 --- a/runtime/parsec/codelets/codelet_map.c +++ b/runtime/parsec/codelets/codelet_map.c @@ -9,50 +9,120 @@ * * @brief Chameleon map PaRSEC codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge - * @date 2022-02-22 + * @date 2024-03-11 * */ #include "chameleon_parsec.h" #include "chameleon/tasks.h" +struct parsec_map_args_s { + cham_uplo_t uplo; + int m, n; + cham_map_operator_t *op_fcts; + void *op_args; + const CHAM_desc_t *desc[1]; +}; + +static inline int +CORE_map_one_parsec( parsec_execution_stream_t *context, + parsec_task_t *this_task ) +{ + struct parsec_map_args_s *pargs = NULL; + CHAM_tile_t *tileA; + + parsec_dtd_unpack_args( this_task, &pargs, &tileA ); + pargs->op_fcts->cpufunc( pargs->op_args, pargs->uplo, pargs->m, pargs->n, 1, + pargs->desc[0], tileA ); + + free( pargs ); +} + +static inline int +CORE_map_two_parsec( parsec_execution_stream_t *context, + parsec_task_t *this_task ) +{ + struct parsec_map_args_s *pargs = NULL; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + + parsec_dtd_unpack_args( this_task, &pargs, &tileA, &tileB ); + pargs->op_fcts->cpufunc( pargs->op_args, pargs->uplo, pargs->m, pargs->n, 2, + pargs->desc[0], tileA, pargs->desc[1], tileB ); + + free( pargs ); +} + static inline int -CORE_map_parsec( parsec_execution_stream_t *context, - parsec_task_t *this_task ) +CORE_map_three_parsec( parsec_execution_stream_t *context, + parsec_task_t *this_task ) { - const CHAM_desc_t *desc; - cham_uplo_t uplo; - int m; - int n; - void *data; - cham_unary_operator_t op_fct; - void *op_args; - - parsec_dtd_unpack_args( - this_task, &desc, &uplo, &m, &n, &data, &op_fct, &op_args ); - op_fct( desc, uplo, m, n, data, op_args ); - - (void)context; - return PARSEC_HOOK_RETURN_DONE; + struct parsec_map_args_s *pargs = NULL; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileC; + + parsec_dtd_unpack_args( this_task, &pargs, &tileA, &tileB, &tileC ); + pargs->op_fcts->cpufunc( pargs->op_args, pargs->uplo, pargs->m, pargs->n, 3, + pargs->desc[0], tileA, pargs->desc[1], tileB, + pargs->desc[2], tileC ); + + free( pargs ); } void INSERT_TASK_map( const RUNTIME_option_t *options, - cham_access_t accessA, cham_uplo_t uplo, const CHAM_desc_t *A, int Am, int An, - cham_unary_operator_t op_fct, void *op_args ) + cham_uplo_t uplo, int m, int n, + int ndata, cham_map_data_t *data, + cham_map_operator_t *op_fcts, void *op_args ) { - parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); - - int parsec_accessA = cham_to_parsec_access( accessA ); - - parsec_dtd_taskpool_insert_task( - PARSEC_dtd_taskpool, CORE_map_parsec, options->priority, "map", - sizeof(CHAM_desc_t*), &A, VALUE, - sizeof(cham_uplo_t), &uplo, VALUE, - sizeof(int), &Am, VALUE, - sizeof(int), &An, VALUE, - PASSED_BY_REF, RTBLKADDR(A, void, Am, An), chameleon_parsec_get_arena_index( A ) | parsec_accessA, - sizeof(cham_unary_operator_t), &op_fct, VALUE, - sizeof(void*), &op_args, VALUE, - PARSEC_DTD_ARG_END ); + parsec_taskpool_t *PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + struct parsec_map_args_s *pargs = NULL; + size_t pargs_size = 0; + int i; + + if ( ( ndata < 0 ) || ( ndata > 3 ) ) { + fprintf( stderr, "INSERT_TASK_map() can handle only 1 to 3 parameters\n" ); + return; + } + + pargs_size = sizeof( struct parsec_map_args_s ) + (ndata - 1) * sizeof( CHAM_desc_t * ); + pargs = malloc( pargs_size ); + pargs->uplo = uplo; + pargs->m = m; + pargs->n = n; + pargs->op_fcts = op_fcts; + pargs->op_args = op_args; + for( i=0; i<ndata; i++ ) { + pargs->desc[i] = data[i].desc; + } + + switch( ndata ) { + case 1: + parsec_dtd_taskpool_insert_task( + PARSEC_dtd_taskpool, CORE_map_one_parsec, options->priority, op_fcts->name, + sizeof(struct parsec_map_args_s*), &pargs, VALUE, + PASSED_BY_REF, RTBLKADDR( data[0].desc, void, m, n ), chameleon_parsec_get_arena_index( data[0].desc ) | cham_to_parsec_access( data[0].access ), + PARSEC_DTD_ARG_END ); + break; + + case 2: + parsec_dtd_taskpool_insert_task( + PARSEC_dtd_taskpool, CORE_map_two_parsec, options->priority, op_fcts->name, + sizeof(struct parsec_map_args_s*), &pargs, VALUE, + PASSED_BY_REF, RTBLKADDR( data[0].desc, void, m, n ), chameleon_parsec_get_arena_index( data[0].desc ) | cham_to_parsec_access( data[0].access ), + PASSED_BY_REF, RTBLKADDR( data[1].desc, void, m, n ), chameleon_parsec_get_arena_index( data[1].desc ) | cham_to_parsec_access( data[1].access ), + PARSEC_DTD_ARG_END ); + break; + + case 3: + parsec_dtd_taskpool_insert_task( + PARSEC_dtd_taskpool, CORE_map_three_parsec, options->priority, op_fcts->name, + sizeof(struct parsec_map_args_s*), &pargs, VALUE, + PASSED_BY_REF, RTBLKADDR( data[0].desc, void, m, n ), chameleon_parsec_get_arena_index( data[0].desc ) | cham_to_parsec_access( data[0].access ), + PASSED_BY_REF, RTBLKADDR( data[1].desc, void, m, n ), chameleon_parsec_get_arena_index( data[1].desc ) | cham_to_parsec_access( data[1].access ), + PASSED_BY_REF, RTBLKADDR( data[2].desc, void, m, n ), chameleon_parsec_get_arena_index( data[2].desc ) | cham_to_parsec_access( data[2].access ), + PARSEC_DTD_ARG_END ); + break; + } } diff --git a/runtime/quark/codelets/codelet_map.c b/runtime/quark/codelets/codelet_map.c index 8b8eca82b4565883d2012a9232e7236feef6b803..b7930ff7134fe38b504b9d14e522147fa52ea6cd 100644 --- a/runtime/quark/codelets/codelet_map.c +++ b/runtime/quark/codelets/codelet_map.c @@ -9,42 +9,114 @@ * * @brief Chameleon map Quark codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge - * @date 2022-02-22 + * @date 2024-03-11 * */ #include "chameleon_quark.h" #include "chameleon/tasks.h" -void CORE_map_quark(Quark *quark) +struct quark_map_args_s { + cham_uplo_t uplo; + int m, n; + cham_map_operator_t *op_fcts; + void *op_args; + const CHAM_desc_t *desc[1]; +}; + +void CORE_map_one_quark(Quark *quark) +{ + struct quark_map_args_s *qargs = NULL; + CHAM_tile_t *tileA; + + quark_unpack_args_2( quark, qargs, tileA ); + qargs->op_fcts->cpufunc( qargs->op_args, qargs->uplo, qargs->m, qargs->n, 1, + qargs->desc[0], tileA ); + + free( qargs ); +} + +void CORE_map_two_quark(Quark *quark) +{ + struct quark_map_args_s *qargs = NULL; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + + quark_unpack_args_3( quark, qargs, tileA, tileB ); + qargs->op_fcts->cpufunc( qargs->op_args, qargs->uplo, qargs->m, qargs->n, 2, + qargs->desc[0], tileA, qargs->desc[1], tileB ); + + free( qargs ); +} + +void CORE_map_three_quark(Quark *quark) { - const CHAM_desc_t *desc; - cham_uplo_t uplo; - int m; - int n; - CHAM_tile_t *tile; - cham_unary_operator_t op_fct; - void *op_args; - - quark_unpack_args_7( quark, desc, uplo, m, n, tile, op_fct, op_args ); - op_fct( desc, uplo, m, n, tile, op_args ); + struct quark_map_args_s *qargs = NULL; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileC; + + quark_unpack_args_4( quark, qargs, tileA, tileB, tileC ); + qargs->op_fcts->cpufunc( qargs->op_args, qargs->uplo, qargs->m, qargs->n, 3, + qargs->desc[0], tileA, qargs->desc[1], tileB, + qargs->desc[2], tileC ); + + free( qargs ); } void INSERT_TASK_map( const RUNTIME_option_t *options, - cham_access_t accessA, cham_uplo_t uplo, const CHAM_desc_t *A, int Am, int An, - cham_unary_operator_t op_fct, void *op_args ) + cham_uplo_t uplo, int m, int n, + int ndata, cham_map_data_t *data, + cham_map_operator_t *op_fcts, void *op_args ) { - quark_option_t *opt = (quark_option_t*)(options->schedopt); - - QUARK_Insert_Task( - opt->quark, CORE_map_quark, (Quark_Task_Flags*)opt, - sizeof(CHAM_desc_t*), &A, VALUE, - sizeof(cham_uplo_t), &uplo, VALUE, - sizeof(int), &Am, VALUE, - sizeof(int), &An, VALUE, - sizeof(void*), RTBLKADDR(A, void, Am, An), cham_to_quark_access( accessA ), - sizeof(cham_unary_operator_t), &op_fct, VALUE, - sizeof(void*), &op_args, VALUE, - 0); + struct quark_map_args_s *qargs = NULL; + quark_option_t *opt = (quark_option_t*)(options->schedopt); + size_t qargs_size = 0; + int i; + + if ( ( ndata < 0 ) || ( ndata > 3 ) ) { + fprintf( stderr, "INSERT_TASK_map() can handle only 1 to 3 parameters\n" ); + return; + } + + qargs_size = sizeof( struct quark_map_args_s ) + (ndata - 1) * sizeof( CHAM_desc_t * ); + qargs = malloc( qargs_size ); + qargs->uplo = uplo; + qargs->m = m; + qargs->n = n; + qargs->op_fcts = op_fcts; + qargs->op_args = op_args; + for( i=0; i<ndata; i++ ) { + qargs->desc[i] = data[i].desc; + } + + switch( ndata ) { + case 1: + QUARK_Insert_Task( + opt->quark, CORE_map_one_quark, (Quark_Task_Flags*)opt, + sizeof(void*), &qargs, VALUE, + sizeof(void*), RTBLKADDR( data[0].desc, void, m, n), cham_to_quark_access( data[0].access ), + 0 ); + break; + + case 2: + QUARK_Insert_Task( + opt->quark, CORE_map_two_quark, (Quark_Task_Flags*)opt, + sizeof(void*), &qargs, VALUE, + sizeof(void*), RTBLKADDR( data[0].desc, void, m, n), cham_to_quark_access( data[0].access ), + sizeof(void*), RTBLKADDR( data[1].desc, void, m, n), cham_to_quark_access( data[1].access ), + 0 ); + break; + + case 3: + QUARK_Insert_Task( + opt->quark, CORE_map_three_quark, (Quark_Task_Flags*)opt, + sizeof(void*), &qargs, VALUE, + sizeof(void*), RTBLKADDR( data[0].desc, void, m, n), cham_to_quark_access( data[0].access ), + sizeof(void*), RTBLKADDR( data[1].desc, void, m, n), cham_to_quark_access( data[1].access ), + sizeof(void*), RTBLKADDR( data[2].desc, void, m, n), cham_to_quark_access( data[2].access ), + 0 ); + break; + } } diff --git a/runtime/starpu/codelets/codelet_map.c b/runtime/starpu/codelets/codelet_map.c index 6bdaed3eb2bf37abd32c63eeafbf1d2b438dac5e..2ef297b0d71e0152a5e997be70bf0880e2260046 100644 --- a/runtime/starpu/codelets/codelet_map.c +++ b/runtime/starpu/codelets/codelet_map.c @@ -12,66 +12,332 @@ * @version 1.3.0 * @author Mathieu Faverge * @author Florent Pruvost - * @date 2023-07-06 + * @date 2024-03-11 * */ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -CHAMELEON_CL_CB(map, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) +struct cl_map_args_s { + cham_uplo_t uplo; + int m, n; + cham_map_operator_t *op_fcts; + void *op_args; + const CHAM_desc_t *desc[1]; +}; +/* + * Map with a single tile as parameter + */ +#if !defined(CHAMELEON_SIMULATION) +static void cl_map_one_cpu_func( void *descr[], void *cl_arg ) +{ + struct cl_map_args_s *clargs = (struct cl_map_args_s*)cl_arg; + CHAM_tile_t *tileA; + + tileA = cti_interface_get( descr[0] ); + clargs->op_fcts->cpufunc( clargs->op_args, clargs->uplo, clargs->m, clargs->n, 1, + clargs->desc[0], tileA ); +} + +#if defined(CHAMELEON_USE_CUDA) +static void +cl_map_one_cuda_func( void *descr[], void *cl_arg ) +{ + struct cl_map_args_s *clargs = (struct cl_map_args_s*)cl_arg; + cublasHandle_t handle = starpu_cublas_get_local_handle(); + CHAM_tile_t *tileA; + + tileA = cti_interface_get( descr[0] ); + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + clargs->op_fcts->cudafunc( handle, clargs->op_args, clargs->uplo, clargs->m, clargs->n, 1, + clargs->desc[0], tileA ); +} +#endif /* defined(CHAMELEON_USE_CUDA) */ + +#if defined(CHAMELEON_USE_HIP) +static void +cl_map_one_hip_func( void *descr[], void *cl_arg ) +{ + struct cl_map_args_s *clargs = (struct cl_map_args_s*)cl_arg; + hipblasHandle_t handle = starpu_hipblas_get_local_handle(); + CHAM_tile_t *tileA; + + tileA = cti_interface_get( descr[0] ); + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + clargs->op_fcts->hipfunc( handle, clargs->op_args, clargs->uplo, clargs->m, clargs->n, 1, + clargs->desc[0], tileA ); +} +#endif /* defined(CHAMELEON_USE_HIP) */ +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +CHAMELEON_CL_CB( map_one, cti_handle_get_m( task->handles[0] ), cti_handle_get_n( task->handles[0] ), 0, M * N ) +#if defined(CHAMELEON_USE_HIP) + CODELETS_GPU( map_one, cl_map_one_cpu_func, cl_map_one_hip_func, STARPU_HIP_ASYNC ) +#else +CODELETS( map_one, cl_map_one_cpu_func, cl_map_one_cuda_func, STARPU_CUDA_ASYNC ) +#endif + +/* + * Map with two tiles as parameter + */ +#if !defined(CHAMELEON_SIMULATION) +static void cl_map_two_cpu_func( void *descr[], void *cl_arg ) +{ + struct cl_map_args_s *clargs = (struct cl_map_args_s*)cl_arg; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + + tileA = cti_interface_get( descr[0] ); + tileB = cti_interface_get( descr[1] ); + clargs->op_fcts->cpufunc( clargs->op_args, clargs->uplo, clargs->m, clargs->n, 2, + clargs->desc[0], tileA, clargs->desc[1], tileB ); +} + +#if defined(CHAMELEON_USE_CUDA) +static void +cl_map_two_cuda_func( void *descr[], void *cl_arg ) +{ + struct cl_map_args_s *clargs = (struct cl_map_args_s*)cl_arg; + cublasHandle_t handle = starpu_cublas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + + tileA = cti_interface_get( descr[0] ); + tileB = cti_interface_get( descr[1] ); + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + clargs->op_fcts->cudafunc( handle, clargs->op_args, clargs->uplo, clargs->m, clargs->n, 2, + clargs->desc[0], tileA, clargs->desc[1], tileB ); +} +#endif /* defined(CHAMELEON_USE_CUDA) */ + +#if defined(CHAMELEON_USE_HIP) +static void +cl_map_two_hip_func( void *descr[], void *cl_arg ) +{ + struct cl_map_args_s *clargs = (struct cl_map_args_s*)cl_arg; + hipblasHandle_t handle = starpu_hipblas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + + tileA = cti_interface_get( descr[0] ); + tileB = cti_interface_get( descr[1] ); + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + clargs->op_fcts->hipfunc( handle, clargs->op_args, clargs->uplo, clargs->m, clargs->n, 2, + clargs->desc[0], tileA, clargs->desc[1], tileB ); +} +#endif /* defined(CHAMELEON_USE_HIP) */ +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +CHAMELEON_CL_CB( map_two, cti_handle_get_m( task->handles[0] ), cti_handle_get_n( task->handles[0] ), 0, M * N ) +#if defined(CHAMELEON_USE_HIP) + CODELETS_GPU( map_two, cl_map_two_cpu_func, cl_map_two_hip_func, STARPU_HIP_ASYNC ) +#else +CODELETS( map_two, cl_map_two_cpu_func, cl_map_two_cuda_func, STARPU_CUDA_ASYNC ) +#endif + +/* + * Map with three tiles as parameter + */ #if !defined(CHAMELEON_SIMULATION) -static void cl_map_cpu_func(void *descr[], void *cl_arg) +static void cl_map_three_cpu_func( void *descr[], void *cl_arg ) +{ + struct cl_map_args_s *clargs = (struct cl_map_args_s*)cl_arg; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileC; + + tileA = cti_interface_get( descr[0] ); + tileB = cti_interface_get( descr[1] ); + tileC = cti_interface_get( descr[2] ); + clargs->op_fcts->cpufunc( clargs->op_args, clargs->uplo, clargs->m, clargs->n, 3, + clargs->desc[0], tileA, clargs->desc[1], tileB, + clargs->desc[2], tileC ); +} + +#if defined(CHAMELEON_USE_CUDA) +static void +cl_map_three_cuda_func( void *descr[], void *cl_arg ) +{ + struct cl_map_args_s *clargs = (struct cl_map_args_s*)cl_arg; + cublasHandle_t handle = starpu_cublas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileC; + + tileA = cti_interface_get( descr[0] ); + tileB = cti_interface_get( descr[1] ); + tileC = cti_interface_get( descr[2] ); + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + assert( tileC->format & CHAMELEON_TILE_FULLRANK ); + clargs->op_fcts->cudafunc( handle, clargs->op_args, clargs->uplo, clargs->m, clargs->n, 3, + clargs->desc[0], tileA, clargs->desc[1], tileB, + clargs->desc[2], tileC ); +} +#endif /* defined(CHAMELEON_USE_CUDA) */ + +#if defined(CHAMELEON_USE_HIP) +static void +cl_map_three_hip_func( void *descr[], void *cl_arg ) { - const CHAM_desc_t *desc; - cham_uplo_t uplo; - int m; - int n; - CHAM_tile_t *tile; - cham_unary_operator_t op_fct; - void *op_args; - - tile = cti_interface_get(descr[0]); - starpu_codelet_unpack_args(cl_arg, &desc, &uplo, &m, &n, &op_fct, &op_args ); - op_fct( desc, uplo, m, n, tile, op_args ); + struct cl_map_args_s *clargs = (struct cl_map_args_s*)cl_arg; + hipblasHandle_t handle = starpu_hipblas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileC; + + tileA = cti_interface_get( descr[0] ); + tileB = cti_interface_get( descr[1] ); + tileC = cti_interface_get( descr[2] ); + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + assert( tileC->format & CHAMELEON_TILE_FULLRANK ); + clargs->op_fcts->hipfunc( handle, clargs->op_args, clargs->uplo, clargs->m, clargs->n, 3, + clargs->desc[0], tileA, clargs->desc[1], tileB, + clargs->desc[2], tileC ); } +#endif /* defined(CHAMELEON_USE_HIP) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS_CPU(map, cl_map_cpu_func) +CHAMELEON_CL_CB( map_three, cti_handle_get_m( task->handles[0] ), cti_handle_get_n( task->handles[0] ), 0, M * N ) +#if defined(CHAMELEON_USE_HIP) + CODELETS_GPU( map_three, cl_map_three_cpu_func, cl_map_three_hip_func, STARPU_HIP_ASYNC ) +#else +CODELETS( map_three, cl_map_three_cpu_func, cl_map_three_cuda_func, STARPU_CUDA_ASYNC ) +#endif void INSERT_TASK_map( const RUNTIME_option_t *options, - cham_access_t accessA, cham_uplo_t uplo, const CHAM_desc_t *A, int Am, int An, - cham_unary_operator_t op_fct, void *op_args ) + cham_uplo_t uplo, int m, int n, + int ndata, cham_map_data_t *data, + cham_map_operator_t *op_fcts, void *op_args ) { + struct cl_map_args_s *clargs = NULL; + const char *cl_name = (op_fcts->name == NULL) ? "map" : op_fcts->name; + int exec = 0; + int i, readonly = 1; + size_t clargs_size = 0; + void (*callback)(void*); - struct starpu_codelet *codelet = &cl_map; - void (*callback)(void*) = options->profiling ? cl_map_callback : NULL; - char *cl_name = "map"; + if ( ( ndata < 0 ) || ( ndata > 3 ) ) { + fprintf( stderr, "INSERT_TASK_map() can handle only 1 to 3 parameters\n" ); + return; + } CHAMELEON_BEGIN_ACCESS_DECLARATION; - CHAMELEON_ACCESS_RW(A, Am, An); + for( i=0; i<ndata; i++ ) { + if ( data[i].access == ChamRW ) { + CHAMELEON_ACCESS_RW( data[i].desc, m, n ); + readonly = 0; + } + else if ( data[i].access == ChamW ) { + CHAMELEON_ACCESS_W( data[i].desc, m, n ); + readonly = 0; + } + else { + CHAMELEON_ACCESS_R( data[i].desc, m, n ); + } + } + exec = __chameleon_need_exec; + /* Force execution for read-only functions */ + if ( readonly && __chameleon_need_submit ) { + exec = 1; + } CHAMELEON_END_ACCESS_DECLARATION; - cl_name = chameleon_codelet_name( cl_name, 1, - A->get_blktile( A, Am, An ) ); - - rt_starpu_insert_task( - codelet, - STARPU_VALUE, &A, sizeof(CHAM_desc_t*), - STARPU_VALUE, &uplo, sizeof(cham_uplo_t), - STARPU_VALUE, &Am, sizeof(int), - STARPU_VALUE, &An, sizeof(int), - cham_to_starpu_access(accessA), RTBLKADDR(A, ChamByte, Am, An), - STARPU_VALUE, &op_fct, sizeof(cham_unary_operator_t), - STARPU_VALUE, &op_args, sizeof(void*), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, - STARPU_EXECUTE_ON_WORKER, options->workerid, + if ( exec ) { + clargs_size = sizeof( struct cl_map_args_s ) + (ndata - 1) * sizeof( CHAM_desc_t * ); + clargs = malloc( clargs_size ); + clargs->uplo = uplo; + clargs->m = m; + clargs->n = n; + clargs->op_fcts = op_fcts; + clargs->op_args = op_args; + for( i=0; i<ndata; i++ ) { + clargs->desc[i] = data[i].desc; + } + } + + /* Refine name */ + for( i=0; i<ndata; i++ ) { + cl_name = chameleon_codelet_name( cl_name, 1, + (data[i].desc)->get_blktile( data[i].desc, m, n ) ); + } + + /* Insert the task */ + switch( ndata ) { + case 1: + callback = options->profiling ? cl_map_one_callback : NULL; + rt_starpu_insert_task( + &cl_map_one, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, clargs_size, + + /* Task handles */ + cham_to_starpu_access( data[0].access ), RTBLKADDR( data[0].desc, ChamByte, m, n ), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, options->workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, cl_name, +#endif + 0 ); + break; + + case 2: + callback = options->profiling ? cl_map_two_callback : NULL; + rt_starpu_insert_task( + &cl_map_two, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, clargs_size, + + /* Task handles */ + cham_to_starpu_access( data[0].access ), RTBLKADDR( data[0].desc, ChamByte, m, n ), + cham_to_starpu_access( data[1].access ), RTBLKADDR( data[1].desc, ChamByte, m, n ), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, options->workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, cl_name, +#endif + 0 ); + break; + + case 3: + callback = options->profiling ? cl_map_three_callback : NULL; + rt_starpu_insert_task( + &cl_map_three, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, clargs_size, + + /* Task handles */ + cham_to_starpu_access( data[0].access ), RTBLKADDR( data[0].desc, ChamByte, m, n ), + cham_to_starpu_access( data[1].access ), RTBLKADDR( data[1].desc, ChamByte, m, n ), + cham_to_starpu_access( data[2].access ), RTBLKADDR( data[2].desc, ChamByte, m, n ), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, options->workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, cl_name, + STARPU_NAME, cl_name, #endif - 0); + 0 ); + break; + } }