diff --git a/include/chameleon/tasks.h b/include/chameleon/tasks.h index b7281f13566b0c9a4ed274d7a9be762a5c0f3cc9..39e741a66ba4d6b10582acfdaf1fad8aa85f34c7 100644 --- a/include/chameleon/tasks.h +++ b/include/chameleon/tasks.h @@ -11,11 +11,11 @@ * * @brief Chameleon elementary tasks main header * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge * @author Cedric Augonnet * @author Florent Pruvost - * @date 2022-02-22 + * @date 2023-07-06 * */ #ifndef _chameleon_tasks_h_ @@ -100,6 +100,13 @@ void INSERT_TASK_map( const RUNTIME_option_t *options, cham_access_t accessA, cham_uplo_t uplo, const CHAM_desc_t *A, int Am, int An, cham_unary_operator_t op_fct, void *op_args ); +void INSERT_TASK_hgemm( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Real16_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Real16_t beta, const CHAM_desc_t *C, int Cm, int Cn ); + #include "chameleon/tasks_z.h" #include "chameleon/tasks_d.h" #include "chameleon/tasks_c.h" diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt index 3e818990583537b7daf58f2d0208174ac9238523..c6e600f17bdd10e26f41fbb2592627dbefb66cf1 100644 --- a/runtime/starpu/CMakeLists.txt +++ b/runtime/starpu/CMakeLists.txt @@ -252,6 +252,7 @@ precisions_rules_py(RUNTIME_SRCS_GENERATED "${ZSRC}" set(CODELETS_SRC codelets/codelet_convert.c + codelets/codelet_hgemm.c ${CODELETS_SRC} ) diff --git a/runtime/starpu/codelets/codelet_hgemm.c b/runtime/starpu/codelets/codelet_hgemm.c new file mode 100644 index 0000000000000000000000000000000000000000..a595bf990cd336e50aee8c541954ef5591c5ed8b --- /dev/null +++ b/runtime/starpu/codelets/codelet_hgemm.c @@ -0,0 +1,250 @@ +/** + * + * @file starpu/codelet_hgemm.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon hgemm StarPU codelet + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-06 + * + */ +#include "chameleon_starpu.h" +#include "runtime_codelets.h" + +CHAMELEON_CL_CB( hgemm, cti_handle_get_m(task->handles[2]), cti_handle_get_n(task->handles[2]), cti_handle_get_n(task->handles[0]), 2. *M*N*K) /* If A^t, computation is wrong */ + +#if !defined(CHAMELEON_SIMULATION) +#if defined(CHAMELEON_USE_CUDA) +static void +cl_hgemm_cuda_func( void *descr[], void *cl_arg ) +{ + struct cl_hgemm_args_s *clargs = (struct cl_hgemm_args_s *)cl_arg; + cublasHandle_t handle = starpu_cublas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileC; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + assert( tileC->format & CHAMELEON_TILE_FULLRANK ); + + CUDA_hgemm( + clargs->transA, clargs->transB, + clargs->m, clargs->n, clargs->k, + (CHAMELEON_Real16_t*)&(clargs->alpha), + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + (CHAMELEON_Real16_t*)&(clargs->beta), + tileC->mat, tileC->ld, + handle ); +} +#endif /* defined(CHAMELEON_USE_CUDA) */ + +#if defined(CHAMELEON_USE_HIP) +static void +cl_hgemm_hip_func( void *descr[], void *cl_arg ) +{ + struct cl_hgemm_args_s *clargs = (struct cl_hgemm_args_s *)cl_arg; + hipblasHandle_t handle = starpu_hipblas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileC; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + assert( tileC->format & CHAMELEON_TILE_FULLRANK ); + + HIP_hgemm( + clargs->transA, clargs->transB, + clargs->m, clargs->n, clargs->k, + (CHAMELEON_Real16_t*)&(clargs->alpha), + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + (CHAMELEON_Real16_t*)&(clargs->beta), + tileC->mat, tileC->ld, + handle ); + + return; +} +#endif /* defined(CHAMELEON_USE_HIP) */ +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +#if defined(CHAMELEON_USE_HIP) +CODELETS_GPU( hgemm, NULL, cl_hgemm_hip_func, STARPU_HIP_ASYNC ) +#else +CODELETS( hgemm, NULL, cl_hgemm_cuda_func, STARPU_CUDA_ASYNC ) +#endif + +void INSERT_TASK_hgemm_Astat( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Real16_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Real16_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + /* if ( alpha == 0. ) { */ + /* INSERT_TASK_hlascal( options, ChamUpperLower, m, n, nb, */ + /* beta, C, Cm, Cn ); */ + /* return; */ + /* } */ + + struct cl_hgemm_args_s *clargs = NULL; + void (*callback)(void*); + int accessC; + int exec = 0; + char *cl_name = "hgemm_Astat"; + + /* Handle cache */ + CHAMELEON_BEGIN_ACCESS_DECLARATION; + /* Check A as write, since it will be the owner of the computation */ + CHAMELEON_ACCESS_W(A, Am, An); + CHAMELEON_ACCESS_R(B, Bm, Bn); + /* Check C as read, since it will be used in a reduction */ + CHAMELEON_ACCESS_R(C, Cm, Cn); + exec = __chameleon_need_exec; + CHAMELEON_END_ACCESS_DECLARATION; + + if ( exec ) { + clargs = malloc( sizeof( struct cl_hgemm_args_s ) ); + clargs->transA = transA; + clargs->transB = transB; + clargs->m = m; + clargs->n = n; + clargs->k = k; + clargs->alpha = alpha; + clargs->beta = beta; + } + + /* Callback for profiling information */ + callback = options->profiling ? cl_hgemm_callback : NULL; + + /* Reduce the C access if needed */ + if ( beta == 0. ) { + accessC = STARPU_W; + } +#if defined(HAVE_STARPU_MPI_REDUX) + else if ( beta == 1. ) { + accessC = STARPU_MPI_REDUX; + } +#endif + else { + accessC = STARPU_RW; + } + + /* Refine name */ + cl_name = chameleon_codelet_name( cl_name, 3, + A->get_blktile( A, Am, An ), + B->get_blktile( B, Bm, Bn ), + C->get_blktile( C, Cm, Cn ) ); + + /* Insert the task */ + rt_starpu_insert_task( + &cl_hgemm, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_hgemm_args_s), + + /* Task handles */ + STARPU_R, RTBLKADDR(A, ChamRealHalf, Am, An), + STARPU_R, RTBLKADDR(B, ChamRealHalf, Bm, Bn), + accessC, RTBLKADDR(C, ChamRealHalf, Cm, Cn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_NODE, A->get_rankof(A, Am, An), +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, cl_name, +#endif + 0 ); +} + +void INSERT_TASK_hgemm( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Real16_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Real16_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + /* if ( alpha == 0. ) { */ + /* INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, */ + /* beta, C, Cm, Cn ); */ + /* return; */ + /* } */ + + struct cl_hgemm_args_s *clargs = NULL; + void (*callback)(void*); + int accessC; + int exec = 0; + char *cl_name = "hgemm"; + + /* Handle cache */ + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_R(A, Am, An); + CHAMELEON_ACCESS_R(B, Bm, Bn); + CHAMELEON_ACCESS_RW(C, Cm, Cn); + exec = __chameleon_need_exec; + CHAMELEON_END_ACCESS_DECLARATION; + + if ( exec ) { + clargs = malloc( sizeof( struct cl_hgemm_args_s ) ); + clargs->transA = transA; + clargs->transB = transB; + clargs->m = m; + clargs->n = n; + clargs->k = k; + clargs->alpha = alpha; + clargs->beta = beta; + } + + /* Callback for profiling information */ + callback = options->profiling ? cl_hgemm_callback : NULL; + + /* Reduce the C access if needed */ + accessC = ( beta == 0. ) ? STARPU_W : (STARPU_RW | ((beta == 1.) ? STARPU_COMMUTE : 0)); + + /* Refine name */ + cl_name = chameleon_codelet_name( cl_name, 3, + A->get_blktile( A, Am, An ), + B->get_blktile( B, Bm, Bn ), + C->get_blktile( C, Cm, Cn ) ); + + /* Insert the task */ + rt_starpu_insert_task( + &cl_hgemm, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_hgemm_args_s), + + /* Task handles */ + STARPU_R, RTBLKADDR(A, ChamRealHalf, Am, An), + STARPU_R, RTBLKADDR(B, ChamRealHalf, Bm, Bn), + accessC, RTBLKADDR(C, ChamRealHalf, Cm, Cn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, options->workerid, + STARPU_POSSIBLY_PARALLEL, options->parallel, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, cl_name, +#endif + 0 ); +} diff --git a/runtime/starpu/control/runtime_descriptor.c b/runtime/starpu/control/runtime_descriptor.c index 36dda081062b9840531622c0c46e4de29a1eaaf1..ee4817fe4cc35abf5a73baffad8b224fcf79452a 100644 --- a/runtime/starpu/control/runtime_descriptor.c +++ b/runtime/starpu/control/runtime_descriptor.c @@ -11,7 +11,7 @@ * * @brief Chameleon StarPU descriptor routines * - * @version 1.2.0 + * @version 1.3.0 * @author Cedric Augonnet * @author Mathieu Faverge * @author Cedric Castagnede @@ -20,7 +20,7 @@ * @author Raphael Boucherie * @author Samuel Thibault * @author Loris Lucido - * @date 2023-01-30 + * @date 2023-07-06 * */ #include "chameleon_starpu.h" @@ -101,6 +101,7 @@ void RUNTIME_desc_create( CHAM_desc_t *desc ) { int64_t lmt = desc->lmt; int64_t lnt = desc->lnt; + size_t nbtiles = lmt * lnt; desc->occurences = 1; @@ -108,8 +109,12 @@ void RUNTIME_desc_create( CHAM_desc_t *desc ) * Allocate starpu_handle_t array (handlers are initialized on the fly when * discovered by any algorithm to save space) */ - desc->schedopt = (void*)calloc(lnt*lmt,sizeof(starpu_data_handle_t)); - assert(desc->schedopt); + if ( cham_is_mixed( desc->dtyp ) ) { + nbtiles *= 3; + } + + desc->schedopt = (void*)calloc( nbtiles, sizeof(starpu_data_handle_t) ); + assert( desc->schedopt ); #if !defined(CHAMELEON_SIMULATION) #if defined(CHAMELEON_USE_CUDA) || defined(CHAMELEON_USE_HIP) @@ -160,7 +165,7 @@ void RUNTIME_desc_create( CHAM_desc_t *desc ) */ { chameleon_starpu_tag_init(); - desc->mpitag = chameleon_starpu_tag_book( (int64_t)lnt * (int64_t)lmt ); + desc->mpitag = chameleon_starpu_tag_book( nbtiles ); if ( desc->mpitag == -1 ) { chameleon_fatal_error("RUNTIME_desc_create", "Can't pursue computation since no more tags are available"); @@ -181,41 +186,45 @@ void RUNTIME_desc_destroy( CHAM_desc_t *desc ) * If this is the last descriptor using the matrix, we release the handle * and unregister the GPU data */ - if ( desc->occurences == 0 ) { - starpu_data_handle_t *handle = (starpu_data_handle_t*)(desc->schedopt); - int lmt = desc->lmt; - int lnt = desc->lnt; - int m, n; - - for (n = 0; n < lnt; n++) { - for (m = 0; m < lmt; m++) - { - if ( *handle != NULL ) { - starpu_data_unregister(*handle); - *handle = NULL; - } - handle++; - } + if ( desc->occurences > 0 ) { + return; + } + + starpu_data_handle_t *handle = (starpu_data_handle_t*)(desc->schedopt); + int64_t lmt = desc->lmt; + int64_t lnt = desc->lnt; + int64_t nbtiles = lmt * lnt; + int64_t m; + + if ( cham_is_mixed( desc->dtyp ) ) { + nbtiles *= 3; + } + + for (m = 0; m < nbtiles; m++, handle++) + { + if ( *handle != NULL ) { + starpu_data_unregister(*handle); + *handle = NULL; } + } #if !defined(CHAMELEON_SIMULATION) #if defined(CHAMELEON_USE_CUDA) || defined(CHAMELEON_USE_HIP) - if ( (desc->use_mat == 1) && (desc->register_mat == 1) ) + if ( (desc->use_mat == 1) && (desc->register_mat == 1) ) + { + /* Unmap the pinned memory associated to the matrix */ + if (gpuHostUnregister(desc->mat) != gpuSuccess) { - /* Unmap the pinned memory associated to the matrix */ - if (gpuHostUnregister(desc->mat) != gpuSuccess) - { - chameleon_warning("RUNTIME_desc_destroy(StarPU)", - "gpuHostUnregister failed to unregister the " - "pinned memory associated to the matrix"); - } + chameleon_warning("RUNTIME_desc_destroy(StarPU)", + "gpuHostUnregister failed to unregister the " + "pinned memory associated to the matrix"); } + } #endif #endif - chameleon_starpu_tag_release( desc->mpitag ); + chameleon_starpu_tag_release( desc->mpitag ); - free( desc->schedopt ); - } + free( desc->schedopt ); } /** @@ -335,24 +344,37 @@ void RUNTIME_desc_flush( const CHAM_desc_t *desc, void RUNTIME_data_flush( const RUNTIME_sequence_t *sequence, const CHAM_desc_t *A, int m, int n ) { + int local, i, imax = 1; int64_t mm = m + (A->i / A->mb); int64_t nn = n + (A->j / A->nb); - int64_t shift = ((int64_t)A->lmt) * nn + mm; + int64_t shift = ((int64_t)A->lmt) * nn + mm; + int64_t nbtiles = ((int64_t)(A->lmt)) * ((int64_t)(A->lnt)); starpu_data_handle_t *handle = A->schedopt; handle += shift; - if (*handle == NULL) { - return; - } + local = chameleon_desc_islocal( A, m, n ); + + if ( cham_is_mixed( A->dtyp ) ) { + imax = 3; + } + + for( i=0; i<imax; i++ ) { + starpu_data_handle_t *handlebis; + + handlebis = handle + i * nbtiles; + + if ( *handlebis == NULL ) { + continue; + } #if defined(CHAMELEON_USE_MPI) - starpu_mpi_cache_flush( MPI_COMM_WORLD, *handle ); + starpu_mpi_cache_flush( MPI_COMM_WORLD, *handlebis ); #endif - if ( chameleon_desc_islocal( A, m, n ) ) { - chameleon_starpu_data_wont_use( *handle ); + if ( local ) { + chameleon_starpu_data_wont_use( *handlebis ); + } } - (void)sequence; } @@ -443,3 +465,71 @@ void *RUNTIME_data_getaddr( const CHAM_desc_t *A, int m, int n ) assert( *ptrtile ); return (void*)(*ptrtile); } + +void *RUNTIME_data_getaddr_withconversion( const RUNTIME_option_t *options, + cham_access_t access, cham_flttype_t flttype, + const CHAM_desc_t *A, int m, int n ) +{ + int64_t mm = m + (A->i / A->mb); + int64_t nn = n + (A->j / A->nb); + + CHAM_tile_t *tile = A->get_blktile( A, m, n ); + starpu_data_handle_t *ptrtile = A->schedopt; + + int fltshift = (cham_get_arith( tile->flttype ) - cham_get_arith( flttype ) + 3 ) % 3; + int64_t shift = (int64_t)fltshift * ((int64_t)A->lmt * (int64_t)A->lnt); + shift = shift + ((int64_t)A->lmt) * nn + mm; + + /* Get the correct starpu_handle */ + ptrtile += shift; + + if ( *ptrtile != NULL ) { + return (void*)(*ptrtile); + } + + int home_node = -1; + int myrank = A->myrank; + int owner = A->get_rankof( A, m, n ); + + if ( myrank == owner ) { + if ( (tile->format & CHAMELEON_TILE_HMAT) || + (tile->mat != NULL) ) + { + home_node = STARPU_MAIN_RAM; + } + } + + starpu_cham_tile_register( ptrtile, home_node, tile, flttype ); + +#if defined(HAVE_STARPU_DATA_SET_OOC_FLAG) + if ( A->ooc == 0 ) { + starpu_data_set_ooc_flag( *ptrtile, 0 ); + } +#endif + +#if defined(HAVE_STARPU_DATA_SET_COORDINATES) + starpu_data_set_coordinates( *ptrtile, 3, m, n, cham_get_arith( flttype ) ); +#endif + +#if defined(CHAMELEON_USE_MPI) + starpu_mpi_data_register( *ptrtile, A->mpitag + shift, owner ); +#endif /* defined(CHAMELEON_USE_MPI) */ + +#if defined(CHAMELEON_KERNELS_TRACE) + fprintf( stderr, "%s - %p registered with tag %ld\n", + tile->name, *ptrtile, A->mpitag + shift ); +#endif + assert( *ptrtile ); + + /* Submit the data conversion */ + if (( fltshift != 0 ) && (access & ChamR) && (owner == myrank) ) { + starpu_data_handle_t *fromtile = A->schedopt; + starpu_data_handle_t *totile = ptrtile; + + fromtile += ((int64_t)A->lmt) * nn + mm; + if ( *fromtile != NULL ) { + insert_task_convert( options, tile->m, tile->n, tile->flttype, *fromtile, flttype, *totile ); + } + } + return (void*)(*ptrtile); +} diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in index 8e77d14a9dea84153c6b0672d63cecb3c71ed077..8d421ddbb5234d073ba9ea71b3c97ff3046ff9fc 100644 --- a/runtime/starpu/include/chameleon_starpu.h.in +++ b/runtime/starpu/include/chameleon_starpu.h.in @@ -116,6 +116,10 @@ static inline int cham_to_starpu_access( cham_access_t accessA ) { return accessA; } +void *RUNTIME_data_getaddr_withconversion( const RUNTIME_option_t *options, + cham_access_t access, cham_flttype_t flttype, + const CHAM_desc_t *A, int m, int n ); + /* * MPI Redefinitions */ diff --git a/runtime/starpu/include/runtime_codelets.h b/runtime/starpu/include/runtime_codelets.h index f89b772939ee8155540a99e5c8ef7d8cb0154827..9c9af1b6bd7c451b3522682b20652816183ea5a7 100644 --- a/runtime/starpu/include/runtime_codelets.h +++ b/runtime/starpu/include/runtime_codelets.h @@ -151,6 +151,17 @@ #endif CODELETS_HEADER(map); +CODELETS_HEADER(hgemm); + +struct cl_hgemm_args_s { + cham_trans_t transA; + cham_trans_t transB; + int m; + int n; + int k; + CHAMELEON_Real16_t alpha; + CHAMELEON_Real16_t beta; +}; void insert_task_convert( const RUNTIME_option_t *options, diff --git a/runtime/starpu/interface/cham_tile_interface.c b/runtime/starpu/interface/cham_tile_interface.c index 27eb801cb266a0038e77b19e04e729b68b3270f4..1e837e1b0ca4c4a5e8418e9acf86dcfb71029851 100644 --- a/runtime/starpu/interface/cham_tile_interface.c +++ b/runtime/starpu/interface/cham_tile_interface.c @@ -672,6 +672,8 @@ starpu_cham_tile_register( starpu_data_handle_t *handleptr, .tilesize = tile->m * tile->n * elemsize, }; memcpy( &(cham_tile_interface.tile), tile, sizeof( CHAM_tile_t ) ); + /* Overwrite the flttype in case it comes from a data conversion */ + cham_tile_interface.tile.flttype = flttype; if ( tile->format & CHAMELEON_TILE_FULLRANK ) { cham_tile_interface.allocsize = tile->m * tile->n * elemsize;