Newer
Older
*
* @file runtime_descriptor.c
* @copyright 2009-2014 The University of Tennessee and The University of
* Tennessee Research Foundation. All rights reserved.
* @copyright 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
***
*
* @brief Chameleon StarPU descriptor routines
* @author Cedric Augonnet
* @author Mathieu Faverge
* @author Cedric Castagnede
* @date 2010-11-15
*

Mathieu Faverge
committed
*/
#include <stdlib.h>

Mathieu Faverge
committed
/**
* Set the tag sizes

Mathieu Faverge
committed
*/
#if defined(CHAMELEON_USE_MPI)
/* Take 24 bits for the tile id, and 7 bits for descriptor id.

Mathieu Faverge
committed
These values can be changed through the call MORSE_user_tag_size(int tag_width, int tag_sep) */
#define TAG_WIDTH_MIN 20
static int tag_width = 31;
static int tag_sep = 24;
static int _tag_mpi_initialized_ = 0;
static inline int
chameleon_starpu_tag_init( int user_tag_width,
int user_tag_sep )
{
if (!_tag_mpi_initialized_) {
int *tag_ub = NULL;
int ok = 0;
tag_width = user_tag_width;
tag_sep = user_tag_sep;
#if defined(HAVE_STARPU_MPI_COMM_GET_ATTR)
starpu_mpi_comm_get_attr(MPI_COMM_WORLD, MPI_TAG_UB, &tag_ub, &ok);
#else
MPI_Comm_get_attr(MPI_COMM_WORLD, MPI_TAG_UB, &tag_ub, &ok);
#endif
if ( !ok ) {
morse_error("RUNTIME_desc_create", "MPI_TAG_UB not known by StarPU");
}
while ( ((uintptr_t)((1UL<<tag_width) - 1) > (uintptr_t)(*tag_ub) ) &&
(tag_width >= TAG_WIDTH_MIN) )
{
tag_width--;
tag_sep--;
}
if ( tag_width < TAG_WIDTH_MIN ) {
morse_error("RUNTIME_desc_create", "MPI_TAG_UB too small to identify all the data");
}
_tag_mpi_initialized_ = 1;
return MORSE_SUCCESS;
}
else {
return MORSE_ERR_REINITIALIZED;
}
}
#ifndef HAVE_STARPU_MPI_DATA_REGISTER

Mathieu Faverge
committed
#define starpu_mpi_data_register( handle_, tag_, owner_ ) \
do { \
starpu_data_set_rank( (handle_), (owner_) ); \
starpu_data_set_tag( (handle_), (tag_) ); \
} while(0)
#endif
#endif
void RUNTIME_comm_set_tag_sizes( int user_tag_width,
int user_tag_sep )
{
int rc;
rc = chameleon_starpu_tag_init( user_tag_width, user_tag_sep );
if ( rc != MORSE_SUCCESS ) {
morse_error("RUNTIME_user_tag_size",
"must be called before creating any Morse descriptor with MORSE_Desc_create(). The tag sizes will not be modified.");
}
(void)user_tag_width; (void)user_tag_sep;
}

Mathieu Faverge
committed
/**
* Malloc/Free of the data

Mathieu Faverge
committed
*/
#ifdef STARPU_MALLOC_SIMULATION_FOLDED
#define FOLDED STARPU_MALLOC_SIMULATION_FOLDED
#else
#define FOLDED 0
#endif
void *RUNTIME_malloc( size_t size )
{
#if defined(CHAMELEON_SIMULATION) && !defined(STARPU_MALLOC_SIMULATION_FOLDED) && !defined(CHAMELEON_USE_MPI)
return (void*) 1;
#else
if (starpu_malloc_flags(&ptr, size, STARPU_MALLOC_PINNED|FOLDED|STARPU_MALLOC_COUNT) != 0) {
return NULL;
}
return ptr;
#endif
}
void RUNTIME_free( void *ptr,
size_t size )
{
#if defined(CHAMELEON_SIMULATION) && !defined(STARPU_MALLOC_SIMULATION_FOLDED) && !defined(CHAMELEON_USE_MPI)
(void)ptr; (void)size;
starpu_free_flags(ptr, size, STARPU_MALLOC_PINNED|FOLDED|STARPU_MALLOC_COUNT);
#endif
}

Mathieu Faverge
committed
/**
* Create data descriptor

Mathieu Faverge
committed
*/
void RUNTIME_desc_create( MORSE_desc_t *desc )
{
int64_t lmt = desc->lmt;
int64_t lnt = desc->lnt;
starpu_data_handle_t *tiles;

Mathieu Faverge
committed
(void)tiles;
desc->occurences = 1;
/*
* Allocate starpu_handle_t array (handlers are initialized on the fly when
* discovered by any algorithm to save space)
*/
desc->schedopt = (void*)calloc(lnt*lmt,sizeof(starpu_data_handle_t));
assert(desc->schedopt);
tiles = (starpu_data_handle_t*)(desc->schedopt);
#if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
/*
* Register allocated memory as CUDA pinned memory
*/
if ( (desc->use_mat == 1) && (desc->register_mat == 1) )
{
int64_t eltsze = MORSE_Element_Size(desc->dtyp);
size_t size = (size_t)(desc->llm) * (size_t)(desc->lln) * eltsze;
cudaError_t rc;
/* Register the matrix as pinned memory */
rc = cudaHostRegister( desc->mat, size, cudaHostRegisterPortable );
if ( rc != cudaSuccess )
{
/* Disable the unregister as register failed */
desc->register_mat = 0;
morse_warning("RUNTIME_desc_create(StarPU): cudaHostRegister - ", cudaGetErrorString( rc ));
}
}
#endif
if (desc->ooc) {
int lastmm = desc->lm - (desc->lmt-1) * desc->mb;
int lastnn = desc->ln - (desc->lnt-1) * desc->nb;
int64_t eltsze = MORSE_Element_Size(desc->dtyp);
int pagesize = getpagesize();
if ( ((desc->mb * desc->nb * eltsze) % pagesize != 0) ||
((lastmm * desc->nb * eltsze) % pagesize != 0) ||
((desc->mb * lastnn * eltsze) % pagesize != 0) ||
((lastmm * lastnn * eltsze) % pagesize != 0) )
{
morse_error("RUNTIME_desc_create", "Matrix and tile size not suitable for out-of-core: all tiles have to be multiples of 4096. Tip : choose 'n' and 'nb' as both multiples of 32.");
return;
}
}
#if defined(CHAMELEON_USE_MPI)
/*
* Check that we are not going over MPI tag limitations
*/
{
/* Check that we won't create overflow in tags used */

Mathieu Faverge
committed
if ( ((uintptr_t)(lnt*lmt)) > ((uintptr_t)(1UL<<tag_sep)) ) {
SYLVAND Guillaume
committed
morse_fatal_error("RUNTIME_desc_create", "Too many tiles in the descriptor for MPI tags");
return;
}
assert(lmt*lmt<=(1<<tag_sep));

Mathieu Faverge
committed
if ( ((uintptr_t)desc->id) >= (uintptr_t)(1UL<<(tag_width-tag_sep)) ) {
SYLVAND Guillaume
committed
morse_fatal_error("RUNTIME_desc_create", "Number of descriptor available in MPI mode out of stock");
return;
}

Mathieu Faverge
committed
assert( ((uintptr_t)desc->id) < (uintptr_t)(1UL<<(tag_width-tag_sep)) );
}
#endif
}

Mathieu Faverge
committed
/**
* Destroy data descriptor

Mathieu Faverge
committed
*/
void RUNTIME_desc_destroy( MORSE_desc_t *desc )
{
desc->occurences--;
/*
* If this is the last descriptor using the matrix, we release the handle
* and unregister the GPU data
*/
if ( desc->occurences == 0 ) {
starpu_data_handle_t *handle = (starpu_data_handle_t*)(desc->schedopt);
int lmt = desc->lmt;
int lnt = desc->lnt;
int m, n;
for (n = 0; n < lnt; n++) {
for (m = 0; m < lmt; m++)
{
if (*handle == NULL)
{
handle++;
continue;
}
starpu_data_unregister(*handle);
handle++;
}
#if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
if ( (desc->use_mat == 1) && (desc->register_mat == 1) )
{
/* Unmap the pinned memory associated to the matrix */
if (cudaHostUnregister(desc->mat) != cudaSuccess)
{
morse_warning("RUNTIME_desc_destroy(StarPU)",
"cudaHostUnregister failed to unregister the "
"pinned memory associated to the matrix");
}
}
#endif /* defined(CHAMELEON_USE_CUDA) */
free(desc->schedopt);
}
}

Mathieu Faverge
committed
/**

Mathieu Faverge
committed
*/
int RUNTIME_desc_acquire( const MORSE_desc_t *desc )
starpu_data_handle_t *handle = (starpu_data_handle_t*)(desc->schedopt);
int lmt = desc->lmt;
int lnt = desc->lnt;
int m, n;
for (n = 0; n < lnt; n++) {
for (m = 0; m < lmt; m++)
{
if ( (*handle == NULL) ||
!morse_desc_islocal( desc, m, n ) )
{
handle++;
continue;
}
starpu_data_acquire(*handle, STARPU_R);
handle++;
}
}
return MORSE_SUCCESS;
}

Mathieu Faverge
committed
/**

Mathieu Faverge
committed
*/
int RUNTIME_desc_release( const MORSE_desc_t *desc )
{
starpu_data_handle_t *handle = (starpu_data_handle_t*)(desc->schedopt);
int lmt = desc->lmt;
int lnt = desc->lnt;
int m, n;
for (n = 0; n < lnt; n++) {
for (m = 0; m < lmt; m++)
{
if ( (*handle == NULL) ||
!morse_desc_islocal( desc, m, n ) )
{
handle++;
continue;
}
starpu_data_release(*handle);
handle++;
}
return MORSE_SUCCESS;
}

Mathieu Faverge
committed
/**

Mathieu Faverge
committed
*/
void RUNTIME_flush()
{
#if defined(CHAMELEON_USE_MPI)
starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
#endif
}

Mathieu Faverge
committed
/**
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
* Different implementations of the flush call based on StarPU version
*/
#ifdef HAVE_STARPU_DATA_WONT_USE
static inline void
chameleon_starpu_data_wont_use( starpu_data_handle_t handle ) {
starpu_data_wont_use( handle );
}
#elif defined HAVE_STARPU_IDLE_PREFETCH
static inline void
chameleon_starpu_data_flush( starpu_data_handle_t handle)
{
starpu_data_idle_prefetch_on_node(handle, STARPU_MAIN_RAM, 1);
starpu_data_release_on_node(handle, -1);
}
static inline void
chameleon_starpu_data_wont_use( starpu_data_handle_t handle ) {
starpu_data_acquire_on_node_cb( handle, -1, STARPU_R,
chameleon_starpu_data_flush, handle );
}
#else
static inline void
chameleon_starpu_data_wont_use( starpu_data_handle_t handle ) {
starpu_data_acquire_cb( handle, STARPU_R,
(void (*)(void*))&starpu_data_release, handle );
#endif
void RUNTIME_desc_flush( const MORSE_desc_t *desc,
const MORSE_sequence_t *sequence )
{
starpu_data_handle_t *handle = (starpu_data_handle_t*)(desc->schedopt);
int lmt = desc->lmt;
int lnt = desc->lnt;
int m, n;
for (n = 0; n < lnt; n++)
{
for (m = 0; m < lmt; m++, handle++)
continue;
}
#if defined(CHAMELEON_USE_MPI)
starpu_mpi_cache_flush( MPI_COMM_WORLD, *handle );
#endif
if ( morse_desc_islocal( desc, m, n ) ) {
chameleon_starpu_data_wont_use( *handle );
}
}
void RUNTIME_data_flush( const MORSE_sequence_t *sequence,
int64_t mm = m + (A->i / A->mb);
int64_t nn = n + (A->j / A->nb);
starpu_data_handle_t *handle = A->schedopt;
handle += ((int64_t)A->lmt) * nn + mm;
#if defined(CHAMELEON_USE_MPI)
starpu_mpi_cache_flush( MPI_COMM_WORLD, *handle );
#endif
chameleon_starpu_data_wont_use( *handle );
}
#if defined(CHAMELEON_USE_MIGRATE)
void RUNTIME_data_migrate( const MORSE_sequence_t *sequence,
const MORSE_desc_t *A, int Am, int An, int new_rank )
{
starpu_data_handle_t *handle = (starpu_data_handle_t*)(A->schedopt);
starpu_data_handle_t lhandle;
handle += ((int64_t)(A->lmt) * (int64_t)An + (int64_t)Am);
lhandle = *handle;
if ( lhandle == NULL ) {
/* Register the data */
lhandle = RUNTIME_data_getaddr( A, Am, An );
}
starpu_mpi_data_migrate( MPI_COMM_WORLD, lhandle, new_rank );
(void)sequence;
}
#endif

Mathieu Faverge
committed
/**

Mathieu Faverge
committed
*/
/* For older revision of StarPU, STARPU_MAIN_RAM is not defined */
#ifndef STARPU_MAIN_RAM
#define STARPU_MAIN_RAM 0
#endif
void *RUNTIME_data_getaddr( const MORSE_desc_t *A, int m, int n )
int64_t mm = m + (A->i / A->mb);
int64_t nn = n + (A->j / A->nb);
starpu_data_handle_t *ptrtile = A->schedopt;
ptrtile += ((int64_t)A->lmt) * nn + mm;
if (*ptrtile == NULL) {
int home_node = -1;
void *user_ptr = NULL;
int myrank = A->myrank;
int owner = A->get_rankof( A, m, n );
int64_t eltsze = MORSE_Element_Size(A->dtyp);
int tempmm = (mm == A->lmt-1) ? (A->lm - mm * A->mb) : A->mb;
int tempnn = (nn == A->lnt-1) ? (A->ln - nn * A->nb) : A->nb;
if ( myrank == owner ) {
if ( user_ptr != NULL ) {
home_node = STARPU_MAIN_RAM;

PRUVOST Florent
committed
}
}
starpu_matrix_data_register( ptrtile, home_node, (uintptr_t) user_ptr,
tempmm, tempnn, eltsze );
#ifdef HAVE_STARPU_DATA_SET_COORDINATES
starpu_data_set_coordinates( *ptrtile, 2, m, n );
#if defined(CHAMELEON_USE_MPI)

Mathieu Faverge
committed
{
int64_t block_ind = A->lmt * nn + mm;
starpu_mpi_data_register(*ptrtile, (A->id << tag_sep) | (block_ind), owner);

Mathieu Faverge
committed
}
#endif /* defined(CHAMELEON_USE_MPI) */
}