/* runtime_descriptor.c */
/**
*
* @file starpu/runtime_descriptor.c
*
* @copyright 2009-2014 The University of Tennessee and The University of
* Tennessee Research Foundation. All rights reserved.
* @copyright 2012-2025 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon StarPU descriptor routines
*
* @version 1.3.0
* @author Cedric Augonnet
* @author Mathieu Faverge
* @author Cedric Castagnede
* @author Florent Pruvost
* @author Guillaume Sylvand
* @author Raphael Boucherie
* @author Samuel Thibault
* @author Loris Lucido
* @date 2024-09-17
*
*/
#include "chameleon_starpu_internal.h"
/**
* Malloc/Free of the data
*/
#ifdef STARPU_MALLOC_SIMULATION_FOLDED
#define FOLDED STARPU_MALLOC_SIMULATION_FOLDED
#else
#define FOLDED 0
#endif
void *RUNTIME_malloc( size_t size )
{
#if defined(CHAMELEON_SIMULATION) && !defined(STARPU_MALLOC_SIMULATION_FOLDED) && !defined(CHAMELEON_USE_MPI)
    /* Pure simulation mode: no data is ever touched, a dummy non-NULL
     * pointer is enough for the callers. */
    return (void*) 1;
#else
    void *buffer = NULL;
    int   rc;

    /* Pinned allocation, accounted by StarPU (and folded when simulating). */
    rc = starpu_malloc_flags( &buffer, size, STARPU_MALLOC_PINNED | FOLDED | STARPU_MALLOC_COUNT );

    return ( rc == 0 ) ? buffer : NULL;
#endif
}
void RUNTIME_free( void  *ptr,
                   size_t size )
{
#if defined(CHAMELEON_SIMULATION) && !defined(STARPU_MALLOC_SIMULATION_FOLDED) && !defined(CHAMELEON_USE_MPI)
    /* Pure simulation mode: nothing was really allocated. */
    (void)ptr;
    (void)size;
#else
    /* Release with the same flags used at allocation time. */
    starpu_free_flags( ptr, size, STARPU_MALLOC_PINNED | FOLDED | STARPU_MALLOC_COUNT );
#endif
}
#if defined(CHAMELEON_USE_CUDA)
#define gpuError_t cudaError_t
#define gpuHostRegister cudaHostRegister
#define gpuHostUnregister cudaHostUnregister
#define gpuHostRegisterPortable cudaHostRegisterPortable
#define gpuSuccess cudaSuccess
#define gpuGetErrorString cudaGetErrorString
#elif defined(CHAMELEON_USE_HIP)
#define gpuError_t hipError_t
#define gpuHostRegister hipHostRegister
#define gpuHostUnregister hipHostUnregister
#define gpuHostRegisterPortable hipHostRegisterPortable
#define gpuSuccess hipSuccess
#define gpuGetErrorString hipGetErrorString
#endif
/**
* Create data descriptor
*/
void RUNTIME_desc_create( CHAM_desc_t *desc )
{
    int64_t lmt = desc->lmt;
    int64_t lnt = desc->lnt;
    size_t nbtiles = lmt * lnt;

    desc->occurences = 1;

    /*
     * Allocate starpu_handle_t array (handlers are initialized on the fly when
     * discovered by any algorithm to save space)
     */
    if ( cham_is_mixed( desc->dtyp ) ) {
        /* Mixed-precision descriptors keep up to two extra copies per tile */
        nbtiles *= 3;
    }
    desc->schedopt = (void*)calloc( nbtiles, sizeof(starpu_data_handle_t) );

    /* An assert() alone is compiled out under NDEBUG: check the allocation
     * explicitly to avoid a NULL dereference on the first tile registration. */
    if ( desc->schedopt == NULL ) {
        chameleon_fatal_error("RUNTIME_desc_create", "Can't allocate the handle array of the descriptor");
        return;
    }

#if !defined(CHAMELEON_SIMULATION)
#if defined(CHAMELEON_USE_CUDA) || defined(CHAMELEON_USE_HIP)
    /*
     * Register allocated memory as GPU pinned memory
     */
    if ( (desc->use_mat == 1) && (desc->register_mat == 1) )
    {
        int64_t eltsze = CHAMELEON_Element_Size(desc->dtyp);
        size_t size = (size_t)(desc->llm) * (size_t)(desc->lln) * eltsze;
        gpuError_t rc;

        /* Register the matrix as pinned memory */
        rc = gpuHostRegister( desc->mat, size, gpuHostRegisterPortable );
        if ( rc != gpuSuccess )
        {
            /* Disable the unregister as register failed */
            desc->register_mat = 0;
            chameleon_warning("RUNTIME_desc_create(StarPU): gpuHostRegister - ", gpuGetErrorString( rc ));
        }
    }
#endif
#endif

    /*
     * The unistd_o_direct out-of-core backend uses O_DIRECT I/O: every tile,
     * including the partial last row/column of tiles, must span a multiple of
     * the system page size.
     */
    if (desc->ooc) {
        char *backend = getenv("STARPU_DISK_SWAP_BACKEND");
        if (backend && strcmp(backend, "unistd_o_direct") == 0) {
            int lastmm = desc->lm - (desc->lmt-1) * desc->mb;
            int lastnn = desc->ln - (desc->lnt-1) * desc->nb;
            int64_t eltsze = CHAMELEON_Element_Size(desc->dtyp);
            int pagesize = getpagesize();
            if ( ((desc->mb * desc->nb * eltsze) % pagesize != 0) ||
                 ((lastmm * desc->nb * eltsze) % pagesize != 0) ||
                 ((desc->mb * lastnn * eltsze) % pagesize != 0) ||
                 ((lastmm * lastnn * eltsze) % pagesize != 0) )
            {
                chameleon_error("RUNTIME_desc_create", "Matrix and tile size not suitable for out-of-core: all tiles have to be multiples of the system page size. Tip : choose 'n' and 'nb' as both multiples of 32.");
                return;
            }
        }
    }

#if defined(CHAMELEON_USE_MPI)
    /*
     * Book the number of tags required to describe this matrix
     */
    {
        chameleon_starpu_tag_init( );
        desc->mpitag = chameleon_starpu_tag_book( nbtiles );
        if ( desc->mpitag == -1 ) {
            chameleon_fatal_error("RUNTIME_desc_create", "Can't pursue computation since no more tags are available");
            return;
        }
    }
#endif
}
/**
* Destroy data descriptor
*/
void RUNTIME_desc_destroy( CHAM_desc_t *desc )
{
    starpu_data_handle_t *handles;
    int64_t nbtiles;
    int64_t t;

    desc->occurences--;

    /*
     * Other descriptors still alias this matrix: keep handles and pinned
     * memory alive until the last user goes away.
     */
    if ( desc->occurences > 0 ) {
        return;
    }

    nbtiles = ((int64_t)desc->lmt) * ((int64_t)desc->lnt);
    if ( cham_is_mixed( desc->dtyp ) ) {
        /* Account for the two extra precision copies of each tile */
        nbtiles *= 3;
    }

    /* Unregister every handle that was lazily registered. */
    handles = (starpu_data_handle_t*)(desc->schedopt);
    for ( t = 0; t < nbtiles; t++ )
    {
        if ( handles[t] != NULL ) {
            starpu_data_unregister( handles[t] );
            handles[t] = NULL;
        }
    }

#if !defined(CHAMELEON_SIMULATION)
#if defined(CHAMELEON_USE_CUDA) || defined(CHAMELEON_USE_HIP)
    if ( (desc->use_mat == 1) && (desc->register_mat == 1) )
    {
        /* Unmap the pinned memory associated to the matrix */
        if (gpuHostUnregister(desc->mat) != gpuSuccess)
        {
            chameleon_warning("RUNTIME_desc_destroy(StarPU)",
                              "gpuHostUnregister failed to unregister the "
                              "pinned memory associated to the matrix");
        }
    }
#endif
#endif

    /* Give the booked tag range back and drop the handle array. */
    chameleon_starpu_tag_release( desc->mpitag );
    free( desc->schedopt );
}
/**
* Acquire data
*/
int RUNTIME_desc_acquire( const CHAM_desc_t *desc )
{
    starpu_data_handle_t *handle = (starpu_data_handle_t*)(desc->schedopt);
    int lmt = desc->lmt;
    int lnt = desc->lnt;
    int m, n;

    /* Walk the handle array in column-major tile order. */
    for (n = 0; n < lnt; n++) {
        for (m = 0; m < lmt; m++, handle++)
        {
            /* Acquire only registered handles of locally owned tiles. */
            if ( (*handle != NULL) &&
                 chameleon_desc_islocal( desc, m, n ) )
            {
                starpu_data_acquire(*handle, STARPU_R);
            }
        }
    }
    return CHAMELEON_SUCCESS;
}
/**
* Release data
*/
int RUNTIME_desc_release( const CHAM_desc_t *desc )
{
    starpu_data_handle_t *handle = (starpu_data_handle_t*)(desc->schedopt);
    int lmt = desc->lmt;
    int lnt = desc->lnt;
    int m, n;

    /* Mirror of RUNTIME_desc_acquire: release in the same tile order. */
    for (n = 0; n < lnt; n++) {
        for (m = 0; m < lmt; m++, handle++)
        {
            /* Release only registered handles of locally owned tiles. */
            if ( (*handle != NULL) &&
                 chameleon_desc_islocal( desc, m, n ) )
            {
                starpu_data_release(*handle);
            }
        }
    }
    return CHAMELEON_SUCCESS;
}
/**
* Flush cached data
*/
void RUNTIME_flush( CHAM_context_t *chamctxt )
{
#if defined(CHAMELEON_USE_MPI)
    /* Drop every cached remote replica from the StarPU-MPI cache. */
    starpu_mpi_cache_flush_all_data( chamctxt->comm );
#endif
    /* Without MPI support this is a no-op (chamctxt is then unused). */
}
void RUNTIME_desc_flush( const CHAM_desc_t *desc,
                         const RUNTIME_sequence_t *sequence )
{
    int m, n;

    /* Flush every tile of the submatrix, column by column. */
    for ( n = 0; n < desc->nt; n++ )
    {
        for ( m = 0; m < desc->mt; m++ )
        {
            RUNTIME_data_flush( sequence, desc, m, n );
        }
    }
}
void RUNTIME_data_flush( const RUNTIME_sequence_t *sequence,
const CHAM_desc_t *A, int m, int n )
{
int local, i, imax = 1;
int64_t mm = m + (A->i / A->mb);
int64_t nn = n + (A->j / A->nb);
int64_t shift = ((int64_t)A->lmt) * nn + mm;
int64_t nbtiles = ((int64_t)(A->lmt)) * ((int64_t)(A->lnt));
starpu_data_handle_t *handle = A->schedopt;
handle += shift;
local = chameleon_desc_islocal( A, m, n );
if ( cham_is_mixed( A->dtyp ) ) {
imax = 3;
}
for( i=0; i<imax; i++ ) {
starpu_data_handle_t *handlebis;
handlebis = handle + i * nbtiles;
if ( *handlebis == NULL ) {
continue;
}
#if defined(CHAMELEON_USE_MPI)
starpu_mpi_cache_flush( sequence->comm, *handlebis );
#endif
if ( local ) {
chameleon_starpu_data_wont_use( *handlebis );
}
}
(void)sequence;
}
#if defined(CHAMELEON_USE_MIGRATE)
/*
 * Move tile (Am, An) of A to MPI rank new_rank, registering its handle on
 * the fly if it has not been discovered yet. No-op when StarPU lacks
 * starpu_mpi_data_migrate support.
 */
void RUNTIME_data_migrate( const RUNTIME_sequence_t *sequence,
                           const CHAM_desc_t *A, int Am, int An, int new_rank )
{
#if defined(HAVE_STARPU_MPI_DATA_MIGRATE)
    int old_rank;
    starpu_data_handle_t *handle = (starpu_data_handle_t*)(A->schedopt);
    starpu_data_handle_t lhandle;
    /* Column-major index of tile (Am, An) in the handle array. */
    handle += ((int64_t)(A->lmt) * (int64_t)An + (int64_t)Am);
    lhandle = *handle;
    if ( lhandle == NULL ) {
        /* Register the data */
        lhandle = RUNTIME_data_getaddr( A, Am, An );
    }
    old_rank = starpu_mpi_data_get_rank( lhandle );
    /* Only migrate when the owner actually changes. */
    if ( old_rank != new_rank ) {
        starpu_mpi_data_migrate( sequence->comm, lhandle, new_rank );
    }
    (void)sequence;
#else
    (void)sequence; (void)A; (void)Am; (void)An; (void)new_rank;
#endif
}
#endif
/**
* Get data addr
*/
/* For older revision of StarPU, STARPU_MAIN_RAM is not defined */
#ifndef STARPU_MAIN_RAM
#define STARPU_MAIN_RAM 0
#endif
/*
 * Return the StarPU handle of tile (m, n) of A, registering it lazily on
 * first access (main memory home node for locally owned tiles that have a
 * buffer, on-demand allocation otherwise).
 */
void *RUNTIME_data_getaddr( const CHAM_desc_t *A, int m, int n )
{
    /* Tile position inside the full (lmt x lnt) grid, including the
     * submatrix offset (A->i, A->j). */
    int64_t mm = m + (A->i / A->mb);
    int64_t nn = n + (A->j / A->nb);
    starpu_data_handle_t *ptrtile = A->schedopt;
    ptrtile += ((int64_t)A->lmt) * nn + mm;
    /* Handles are registered on the fly: reuse an existing one. */
    if ( *ptrtile != NULL ) {
        return (void*)(*ptrtile);
    }
    int home_node = -1;
    int myrank = A->myrank;
    int owner = A->get_rankof( A, m, n );
    CHAM_tile_t *tile = A->get_blktile( A, m, n );
    /* Only locally owned tiles with a real buffer (or hmat format) live in
     * main memory; otherwise StarPU allocates the data where needed. */
    if ( myrank == owner ) {
        if ( (tile->format & CHAMELEON_TILE_HMAT) ||
             (tile->mat != NULL) )
        {
            home_node = STARPU_MAIN_RAM;
        }
    }
    starpu_cham_tile_register( ptrtile, home_node, tile, cham_get_flttype( A->dtyp ) );
#if defined(HAVE_STARPU_DATA_SET_OOC_FLAG)
    /* In-core descriptors must never be evicted to disk. */
    if ( A->ooc == 0 ) {
        starpu_data_set_ooc_flag( *ptrtile, 0 );
    }
#endif
#if defined(HAVE_STARPU_DATA_SET_COORDINATES)
    /* Attach (m, n) coordinates for trace/visualization tools. */
    starpu_data_set_coordinates( *ptrtile, 2, m, n );
#endif
#if defined(CHAMELEON_USE_MPI)
    {
        /* The MPI tag is the descriptor base tag plus the tile index. */
        int64_t block_ind = A->lmt * nn + mm;
        starpu_mpi_data_register( *ptrtile, A->mpitag + block_ind, owner );
    }
#endif /* defined(CHAMELEON_USE_MPI) */
#if defined(CHAMELEON_KERNELS_TRACE)
    fprintf( stderr, "%s - %p registered with tag %ld\n",
             tile->name, (void*)(*ptrtile), A->mpitag + A->lmt * nn + mm );
#endif
    assert( *ptrtile );
    return (void*)(*ptrtile);
}
/*
 * Return the StarPU handle of tile (m, n) of A in the requested precision
 * flttype, registering it lazily. For mixed-precision descriptors the handle
 * array holds up to three per-precision copies of each tile; on a write
 * access the two other copies are invalidated, and on a read access in a
 * different precision a conversion task is submitted from the tile's native
 * precision copy.
 */
void *RUNTIME_data_getaddr_withconversion( const RUNTIME_option_t *options,
                                           cham_access_t access, cham_flttype_t flttype,
                                           const CHAM_desc_t *A, int m, int n )
{
    /* Tile position inside the full (lmt x lnt) grid, including the
     * submatrix offset (A->i, A->j). */
    int64_t mm = m + (A->i / A->mb);
    int64_t nn = n + (A->j / A->nb);
    CHAM_tile_t *tile = A->get_blktile( A, m, n );
    /* Which of the three precision planes holds the requested type
     * (0 = the tile's native precision). */
    int fltshift = (cham_get_arith( tile->flttype ) - cham_get_arith( flttype ) + 3 ) % 3;
    int64_t shift = (int64_t)fltshift * ((int64_t)A->lmt * (int64_t)A->lnt);
    starpu_data_handle_t *ptrtile = A->schedopt;
    shift = shift + ((int64_t)A->lmt) * nn + mm;
    /* Get the correct starpu_handle */
    ptrtile += shift;
    /* Invalidate copies on write access */
    if ( access & ChamW ) {
        starpu_data_handle_t *copy = ptrtile;
        /* Writes must target the native precision plane. */
        assert( fltshift == 0 );
        /* Remove first copy */
        copy += ((int64_t)A->lmt * (int64_t)A->lnt);
        if ( *copy ) {
            starpu_data_unregister_no_coherency( *copy );
            *copy = NULL;
        }
        /* Remove second copy */
        copy += ((int64_t)A->lmt * (int64_t)A->lnt);
        if ( *copy ) {
            starpu_data_unregister_no_coherency( *copy );
            *copy = NULL;
        }
    }
    /* Handles are registered on the fly: reuse an existing one. */
    if ( *ptrtile != NULL ) {
        return (void*)(*ptrtile);
    }
    int home_node = -1;
    int myrank = A->myrank;
    int owner = A->get_rankof( A, m, n );
    /* Only the native-precision copy (shift == 0) of a locally owned tile
     * with a real buffer (or hmat format) lives in main memory. */
    if ( (myrank == owner) && (shift == 0) ) {
        if ( (tile->format & CHAMELEON_TILE_HMAT) ||
             (tile->mat != NULL) )
        {
            home_node = STARPU_MAIN_RAM;
        }
    }
    starpu_cham_tile_register( ptrtile, home_node, tile, flttype );
#if defined(HAVE_STARPU_DATA_SET_OOC_FLAG)
    /* In-core descriptors must never be evicted to disk. */
    if ( A->ooc == 0 ) {
        starpu_data_set_ooc_flag( *ptrtile, 0 );
    }
#endif
#if defined(HAVE_STARPU_DATA_SET_COORDINATES)
    /* Third coordinate distinguishes the precision plane in traces. */
    starpu_data_set_coordinates( *ptrtile, 3, m, n, cham_get_arith( flttype ) );
#endif
#if defined(CHAMELEON_USE_MPI)
    /* The MPI tag is the descriptor base tag plus the plane/tile index. */
    starpu_mpi_data_register( *ptrtile, A->mpitag + shift, owner );
#endif /* defined(CHAMELEON_USE_MPI) */
#if defined(CHAMELEON_KERNELS_TRACE)
    fprintf( stderr, "%s - %p registered with tag %ld\n",
             tile->name, (void*)(*ptrtile), A->mpitag + shift );
#endif
    assert( *ptrtile );
    /* Submit the data conversion */
    if (( fltshift != 0 ) && (access & ChamR) && (owner == myrank) ) {
        starpu_data_handle_t *fromtile = A->schedopt;
        starpu_data_handle_t *totile = ptrtile;
        /* Source is the native-precision copy of the same tile. */
        fromtile += ((int64_t)A->lmt) * nn + mm;
        assert( fromtile != totile );
        assert( tile->flttype != flttype );
        if ( *fromtile != NULL ) {
            insert_task_convert( options, tile->m, tile->n, tile->flttype, *fromtile, flttype, *totile );
        }
    }
    return (void*)(*ptrtile);
}