diff --git a/control/auxiliary.c b/control/auxiliary.c index 00c6bdc59428844f8c34c96217aa0f8f237b484a..b71dafa5236f4b707abc73e07aaa2a4f3853e375 100644 --- a/control/auxiliary.c +++ b/control/auxiliary.c @@ -107,7 +107,7 @@ void morse_fatal_error(const char *func_name, const char *msg_text) **/ int morse_rank(MORSE_context_t *morse) { - return RUNTIME_rank( morse ); + return RUNTIME_thread_rank( morse ); } /******************************************************************************* @@ -233,31 +233,36 @@ int MORSE_My_Mpi_Rank(void) return MORSE_SUCCESS; #endif } + /******************************************************************************* * Display a progress percentage in stderr **/ void update_progress(int currentValue, int maximumValue) { - div_t res ; - static int progress = -1; /* varie de 0 a 100 au cours du calcul concerne */ + div_t res ; + static int progress = -1; /* varie de 0 a 100 au cours du calcul concerne */ - if (maximumValue==0) - res.quot=100 ; - else { - if (currentValue<INT_MAX/100) - res=div(currentValue*100, maximumValue) ; - /* Calcule le quotient de la division */ - else - res.quot=(int)( (long long) currentValue*100/maximumValue) ; - } + if (maximumValue == 0) { + res.quot = 100; + } + else { + if (currentValue < (INT_MAX / 100) ) { + res = div(currentValue*100, maximumValue); + } + else { + /* Calcule le quotient de la division */ + res.quot = (int)( (long long)( currentValue * 100 ) / maximumValue ); + } + } - // Print the percentage - if (res.quot > progress) - fprintf(stderr, "%3d%%\b\b\b\b", res.quot) ; - progress=res.quot ; + // Print the percentage + if (res.quot > progress) { + fprintf(stderr, "%3d%%\b\b\b\b", res.quot); + } + progress = res.quot; - if (currentValue>=maximumValue) { - progress=-1 ; - } + if (currentValue >= maximumValue) { + progress = -1; + } } // A function to display the progress indicator. diff --git a/control/control.c b/control/control.c index 4a9dff466b7dcc8061573678991a07194a0b2de7..fb742b5b491646be64a30fa46818d365bf056ca7 100644 --- a/control/control.c +++ b/control/control.c @@ -111,7 +111,7 @@ int MORSE_InitPar(int ncpus, int ncudas, int nthreads_per_worker) } # endif #endif - RUNTIME_init_scheduler( morse, ncpus, ncudas, nthreads_per_worker ); + RUNTIME_init( morse, ncpus, ncudas, nthreads_per_worker ); return MORSE_SUCCESS; } @@ -138,7 +138,7 @@ int MORSE_Finalize(void) # if !defined(CHAMELEON_SIMULATION) RUNTIME_barrier(morse); # endif - RUNTIME_finalize_scheduler( morse ); + RUNTIME_finalize( morse ); #if defined(CHAMELEON_USE_MPI) if (!morse->mpi_outer_init) @@ -250,14 +250,19 @@ int MORSE_Distributed_stop(void) * ****************************************************************************** * - * @return - * \retval MORSE_SUCCESS successful exit + * @retval The size of the distributed computation + * @retval -1 if context not initialized * *****************************************************************************/ -int MORSE_Comm_size( int *size ) +int MORSE_Comm_size() { - RUNTIME_comm_size (size); - return MORSE_SUCCESS; + MORSE_context_t *morse = morse_context_self(); + if (morse == NULL) { + morse_error("MORSE_Comm_size()", "MORSE not initialized"); + return -1; + } + + return RUNTIME_comm_size( morse ); } /** *************************************************************************** @@ -268,14 +273,19 @@ int MORSE_Comm_size( int *size ) * ****************************************************************************** * - * @return - * \retval MORSE_SUCCESS successful exit + * @retval The rank of the distributed computation + * @retval -1 if context not initialized * *****************************************************************************/ -int MORSE_Comm_rank( int *rank ) +int MORSE_Comm_rank() { - RUNTIME_comm_rank (rank); - return MORSE_SUCCESS; + MORSE_context_t *morse = morse_context_self(); + if (morse == NULL) { + morse_error("MORSE_Comm_rank()", "MORSE not initialized"); + return -1; + } + + return RUNTIME_comm_rank( morse ); } /** *************************************************************************** @@ -293,5 +303,11 @@ int MORSE_Comm_rank( int *rank ) *****************************************************************************/ int MORSE_GetThreadNbr( ) { - return RUNTIME_get_thread_nbr(); + MORSE_context_t *morse = morse_context_self(); + if (morse == NULL) { + morse_error("MORSE_GetThreadNbr()", "MORSE not initialized"); + return -1; + } + + return RUNTIME_thread_size( morse ); } diff --git a/control/descriptor.c b/control/descriptor.c index 26a0d4747c511cb80f920b10bdc331f2a319d014..bec07637c288d0db4bc7e8acd4f1d348046586f2 100644 --- a/control/descriptor.c +++ b/control/descriptor.c @@ -110,7 +110,17 @@ MORSE_desc_t morse_desc_init_user(MORSE_enum dtyp, int mb, int nb, int bsiz, int (*get_blkldd) ( const MORSE_desc_t*, int ), int (*get_rankof) ( const MORSE_desc_t*, int, int )) { + MORSE_context_t *morse; MORSE_desc_t desc; + + memset( &desc, 0, sizeof(MORSE_desc_t) ); + + morse = morse_context_self(); + if (morse == NULL) { + morse_error("MORSE_Desc_Create", "MORSE not initialized"); + return desc; + } + // If one of the function get_* is NULL, we switch back to the default, like in morse_desc_init() desc.get_blkaddr = get_blkaddr ? get_blkaddr : morse_getaddr_ccrb; desc.get_blkldd = get_blkldd ? get_blkldd : morse_getblkldd_ccrb; @@ -144,7 +154,7 @@ MORSE_desc_t morse_desc_init_user(MORSE_enum dtyp, int mb, int nb, int bsiz, desc.register_mat = 1; desc.ooc = 0; - RUNTIME_comm_rank( &(desc.myrank) ); + desc.myrank = RUNTIME_comm_rank( morse ); // Grid size desc.p = p; @@ -185,8 +195,6 @@ MORSE_desc_t morse_desc_init_user(MORSE_enum dtyp, int mb, int nb, int bsiz, desc.A12 = (size_t)( desc.llm%mb)*(size_t)(desc.lln - desc.lln%nb) + desc.A21; desc.A22 = (size_t)(desc.llm - desc.llm%mb)*(size_t)( desc.lln%nb) + desc.A12; - RUNTIME_desc_init( &desc ); - return desc; } @@ -241,7 +249,8 @@ MORSE_desc_t* morse_desc_submatrix(MORSE_desc_t *descA, int i, int j, int m, int descB->mt = (m == 0) ? 0 : (descB->i+m-1)/mb - descB->i/mb + 1; descB->nt = (n == 0) ? 0 : (descB->j+n-1)/nb - descB->j/nb + 1; - RUNTIME_desc_submatrix( descB ); + // Increase the number of occurences to avoid multiple free of runtime specific data structures. + descB->occurences++; return descB; } @@ -301,7 +310,7 @@ int morse_desc_mat_alloc( MORSE_desc_t *desc ) size_t size = (size_t)(desc->llm) * (size_t)(desc->lln) * (size_t)MORSE_Element_Size(desc->dtyp); - if ((desc->mat = RUNTIME_mat_alloc(size)) == NULL) { + if ((desc->mat = RUNTIME_malloc(size)) == NULL) { morse_error("morse_desc_mat_alloc", "malloc() failed"); return MORSE_ERR_OUT_OF_RESOURCES; } @@ -327,7 +336,7 @@ int morse_desc_mat_free( MORSE_desc_t *desc ) size_t size = (size_t)(desc->llm) * (size_t)(desc->lln) * (size_t)MORSE_Element_Size(desc->dtyp); - RUNTIME_mat_free(desc->mat, size); + RUNTIME_free(desc->mat, size); desc->mat = NULL; } return MORSE_SUCCESS; @@ -423,7 +432,7 @@ int MORSE_Desc_Create(MORSE_desc_t **descptr, void *mat, MORSE_enum dtyp, int mb size_t size = (size_t)(desc->llm) * (size_t)(desc->lln) * (size_t)MORSE_Element_Size(desc->dtyp); - if ((desc->mat = RUNTIME_mat_alloc(size)) == NULL) { + if ((desc->mat = RUNTIME_malloc(size)) == NULL) { morse_error("MORSE_Desc_Create", "malloc() failed"); return MORSE_ERR_OUT_OF_RESOURCES; } @@ -843,6 +852,6 @@ int MORSE_Desc_Getoncpu(MORSE_desc_t *desc) { * *****************************************************************************/ void MORSE_user_tag_size(int user_tag_width, int user_tag_sep) { - RUNTIME_user_tag_size(user_tag_width, user_tag_sep); + RUNTIME_comm_set_tag_sizes( user_tag_width, user_tag_sep ); return; } diff --git a/include/chameleon/morse_runtime.h b/include/chameleon/morse_runtime.h index 051410897f603fc3d522d5ca0f1731553270e00a..dbcde462a94fb2c6bfca875e77731f8dca19a0fb 100644 --- a/include/chameleon/morse_runtime.h +++ b/include/chameleon/morse_runtime.h @@ -155,6 +155,15 @@ RUNTIME_resume( MORSE_context_t *ctxt ); void RUNTIME_barrier( MORSE_context_t *ctxt ); +/** + * @brief Show the progress of the computations when enabled. + * + * @param[in] ctxt + * The Chameleon context for which the context needs to be printed. + */ +void +RUNTIME_progress( MORSE_context_t *ctxt ); + /** * @brief Get the rank of the current worker for the runtime. * @@ -405,14 +414,21 @@ RUNTIME_desc_getoncpu( const MORSE_desc_t *desc ); * This function is a asynchronous call that submit the data movement from * remote memory to the main memory. This call must be completed by a call to * RUNTIME_sequence_wait() to ensure that all data have been moved. + * Users should avoid to call this function as it sequentially moves back the + * data from outside the main memory to main memory, and should prefer + * RUNTIME_desc_getoncpu_async(). * * @param[in] desc * The descriptor to release. * + * @param[in] sequence + * The sequence to which submit the data movements + * * @retval MORSE_SUCCESS on success */ int -RUNTIME_desc_getoncpu_async( const MORSE_desc_t *desc ); +RUNTIME_desc_getoncpu_async( const MORSE_desc_t *desc, + MORSE_sequence_t *sequence ); /** * @brief Get the pointer to the data or the runtime handler associated to the diff --git a/include/morse.h b/include/morse.h index dbce6b27964b7f26898cf34ad4fc4a8b178300f4..6d745743fdebf87f8c14dd903fb2f6b7e1982aa1 100644 --- a/include/morse.h +++ b/include/morse.h @@ -84,8 +84,8 @@ int MORSE_Pause (void); int MORSE_Resume (void); int MORSE_Distributed_start (void); int MORSE_Distributed_stop (void); -int MORSE_Comm_size (int *size); -int MORSE_Comm_rank (int *rank); +int MORSE_Comm_size (void); +int MORSE_Comm_rank (void); int MORSE_Lapack_to_Tile (void *Af77, int LDA, MORSE_desc_t *A); int MORSE_Tile_to_Lapack (MORSE_desc_t *A, void *Af77, int LDA); int MORSE_Distributed_start (void); diff --git a/runtime/starpu/codelets/codelet_dataflush.c b/runtime/starpu/codelets/codelet_dataflush.c index be96e164ca891e1ec4c16091a22455604a9b83c2..976c5dfca15c2b05f62590d68f2e4cb53a2ec5d9 100644 --- a/runtime/starpu/codelets/codelet_dataflush.c +++ b/runtime/starpu/codelets/codelet_dataflush.c @@ -56,8 +56,8 @@ int RUNTIME_desc_iscached(const MORSE_desc_t *A, int Am, int An) #endif #endif -void MORSE_TASK_flush_data(const MORSE_option_t *options, - const MORSE_desc_t *A, int Am, int An) +void MORSE_TASK_flush_data( const MORSE_option_t *options, + const MORSE_desc_t *A, int Am, int An ) { (void)options; diff --git a/runtime/starpu/control/runtime_async.c b/runtime/starpu/control/runtime_async.c index 4d9b646acad528cca09a0bbb578f6e433f99dceb..23efccf5feef4ce1628f118aa54b37c821e1e3e8 100644 --- a/runtime/starpu/control/runtime_async.c +++ b/runtime/starpu/control/runtime_async.c @@ -28,7 +28,8 @@ /******************************************************************************* * Create a sequence **/ -int RUNTIME_sequence_create( MORSE_context_t *morse, MORSE_sequence_t *sequence ) +int RUNTIME_sequence_create( MORSE_context_t *morse, + MORSE_sequence_t *sequence ) { (void)morse; (void)sequence; @@ -38,60 +39,27 @@ int RUNTIME_sequence_create( MORSE_context_t *morse, MORSE_sequence_t *sequence /******************************************************************************* * Destroy a sequence **/ -int RUNTIME_sequence_destroy( MORSE_context_t *morse, MORSE_sequence_t *sequence ) +int RUNTIME_sequence_destroy( MORSE_context_t *morse, + MORSE_sequence_t *sequence ) { (void)morse; (void)sequence; return MORSE_SUCCESS; } -// Defined in control/auxilliary.c -extern void (*update_progress_callback)(int, int) ; - -// no progress indicator for algorithms faster than 'PROGRESS_MINIMUM_DURATION' seconds -#define PROGRESS_MINIMUM_DURATION 10 - -/******************************************************************************* - * Display a progress information when executing the tasks - **/ -int RUNTIME_progress( MORSE_context_t *morse ) -{ - int tasksLeft, current, timer = 0; - int max; - -#if defined(CHAMELEON_USE_MPI) - if ( morse->my_mpi_rank != 0 ) - return MORSE_SUCCESS; -#endif - - max = starpu_task_nsubmitted(); - if ( max == 0 ) - return MORSE_SUCCESS; - - // update_progress_callback(0, max); - while ((tasksLeft = starpu_task_nsubmitted()) > 0) { - current = max - tasksLeft; - if (timer > PROGRESS_MINIMUM_DURATION) - update_progress_callback(current, max); - sleep(1); - timer++; - } - if (timer > PROGRESS_MINIMUM_DURATION) - update_progress_callback(max, max); - - (void)morse; - return MORSE_SUCCESS; -} - /******************************************************************************* * Wait for the completion of a sequence **/ -int RUNTIME_sequence_wait( MORSE_context_t *morse, MORSE_sequence_t *sequence ) +int RUNTIME_sequence_wait( MORSE_context_t *morse, + MORSE_sequence_t *sequence ) { (void)morse; (void)sequence; - if (morse->progress_enabled) + + if (morse->progress_enabled) { RUNTIME_progress(morse); + } + starpu_task_wait_for_all(); #if defined(CHAMELEON_USE_MPI) starpu_mpi_barrier(MPI_COMM_WORLD); @@ -102,9 +70,12 @@ int RUNTIME_sequence_wait( MORSE_context_t *morse, MORSE_sequence_t *sequence ) /******************************************************************************* * Terminate a sequence **/ -void RUNTIME_sequence_flush( void *schedopt, MORSE_sequence_t *sequence, MORSE_request_t *request, int status) +void RUNTIME_sequence_flush( MORSE_context_t *morse, + MORSE_sequence_t *sequence, + MORSE_request_t *request, + int status ) { - (void)schedopt; + (void)morse; sequence->request = request; sequence->status = status; request->status = status; diff --git a/runtime/starpu/control/runtime_context.c b/runtime/starpu/control/runtime_context.c index a96a071e2c35c8f462dfac901316a3713764a754..e14ef69c6fa1f38f06a128a121c20d56b4bcdab6 100644 --- a/runtime/starpu/control/runtime_context.c +++ b/runtime/starpu/control/runtime_context.c @@ -62,7 +62,6 @@ void RUNTIME_context_create( MORSE_context_t *morse ) /******************************************************************************* * Clean the context **/ - void RUNTIME_context_destroy( MORSE_context_t *morse ) { /* StarPU was already initialized by an external library */ diff --git a/runtime/starpu/control/runtime_control.c b/runtime/starpu/control/runtime_control.c index bbf792875153712d4ca6cbae90a53635d09a2aea..aec18aa50cb8c162f655fc3af0a89f65d5a9b7d3 100644 --- a/runtime/starpu/control/runtime_control.c +++ b/runtime/starpu/control/runtime_control.c @@ -27,28 +27,13 @@ #include <stdlib.h> #include "chameleon_starpu.h" -#if defined(CHAMELEON_SIMULATION) -# ifndef STARPU_SIMGRID -# error "Starpu was not built with simgrid support (--enable-simgrid). Can not run Chameleon with simulation support." -# endif -#else -# ifdef STARPU_SIMGRID -# warning "Starpu was built with simgrid support. Better build Chameleon with simulation support (-DCHAMELEON_SIMULATION=YES)." -# endif -#endif -/******************************************************************************* - * Thread rank. - **/ -int RUNTIME_rank(MORSE_context_t *morse) -{ - (void)morse; - return starpu_worker_get_id(); -} - /******************************************************************************* * **/ -int RUNTIME_init_scheduler( MORSE_context_t *morse, int ncpus, int ncudas, int nthreads_per_worker) +int RUNTIME_init( MORSE_context_t *morse, + int ncpus, + int ncudas, + int nthreads_per_worker ) { starpu_conf_t *conf = (starpu_conf_t*)(morse->schedopt); int hres = -1; @@ -137,18 +122,19 @@ int RUNTIME_init_scheduler( MORSE_context_t *morse, int ncpus, int ncudas, int n /******************************************************************************* * */ -void RUNTIME_finalize_scheduler( MORSE_context_t *morse ) +void RUNTIME_finalize( MORSE_context_t *morse ) { (void)morse; /* StarPU was already initialized by an external library */ - if (morse->schedopt == NULL) { + if ( morse->schedopt == NULL ) { return; } #if defined(CHAMELEON_USE_MPI) starpu_mpi_shutdown(); #endif + #if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION) starpu_cublas_shutdown(); #endif @@ -157,6 +143,27 @@ void RUNTIME_finalize_scheduler( MORSE_context_t *morse ) return; } +/******************************************************************************* + * To suspend the processing of new tasks by workers + **/ +void RUNTIME_pause( MORSE_context_t *morse ) +{ + (void)morse; + starpu_pause(); + return; +} + +/******************************************************************************* + * This is the symmetrical call to RUNTIME_pause, + * used to resume the workers polling for new tasks. + **/ +void RUNTIME_resume( MORSE_context_t *morse ) +{ + (void)morse; + starpu_resume(); + return; +} + /******************************************************************************* * Busy-waiting barrier **/ @@ -169,84 +176,102 @@ void RUNTIME_barrier( MORSE_context_t *morse ) #endif } +// Defined in control/auxilliary.c +extern void (*update_progress_callback)(int, int); + +// no progress indicator for algorithms faster than 'PROGRESS_MINIMUM_DURATION' seconds +#define PROGRESS_MINIMUM_DURATION 10 + /******************************************************************************* - * Set iteration numbers for traces + * Display a progress information when executing the tasks **/ -void RUNTIME_iteration_push( MORSE_context_t *morse, unsigned long iteration ) +void RUNTIME_progress( MORSE_context_t *morse ) { - (void)morse; -#if defined(HAVE_STARPU_ITERATION_PUSH) - starpu_iteration_push(iteration); + int tasksLeft, current, timer = 0; + int max; + +#if defined(CHAMELEON_USE_MPI) + if ( morse->my_mpi_rank != 0 ) { + return; + } #endif -} -void RUNTIME_iteration_pop( MORSE_context_t *morse ) -{ + max = starpu_task_nsubmitted(); + if ( max == 0 ) { + return; + } + + // update_progress_callback(0, max); + while ((tasksLeft = starpu_task_nsubmitted()) > 0) { + current = max - tasksLeft; + if (timer > PROGRESS_MINIMUM_DURATION) { + update_progress_callback(current, max); + } + sleep(1); + timer++; + } + if (timer > PROGRESS_MINIMUM_DURATION) { + update_progress_callback(max, max); + } + (void)morse; -#if defined(HAVE_STARPU_ITERATION_PUSH) - starpu_iteration_pop(); -#endif + return; } /******************************************************************************* - * To suspend the processing of new tasks by workers + * Thread rank. **/ -void RUNTIME_pause( MORSE_context_t *morse ) +int RUNTIME_thread_rank( MORSE_context_t *morse ) { (void)morse; - starpu_pause(); - return; + return starpu_worker_get_id(); } /******************************************************************************* - * This is the symmetrical call to RUNTIME_pause, - * used to resume the workers polling for new tasks. + * Thread rank. **/ -void RUNTIME_resume( MORSE_context_t *morse ) +int RUNTIME_thread_size( MORSE_context_t *morse ) { (void)morse; - starpu_resume(); - return; + return starpu_worker_get_count_by_type( STARPU_CPU_WORKER ); } /******************************************************************************* - * This returns the rank of this process + * The process rank **/ -void RUNTIME_comm_rank( int *rank ) +int RUNTIME_comm_rank( MORSE_context_t *morse ) { + int rank; #if defined(CHAMELEON_USE_MPI) # if defined(HAVE_STARPU_MPI_COMM_RANK) - starpu_mpi_comm_rank(MPI_COMM_WORLD, rank); + starpu_mpi_comm_rank( MPI_COMM_WORLD, &rank ); # else - MPI_Comm_rank(MPI_COMM_WORLD, rank); + MPI_Comm_rank( MPI_COMM_WORLD, &rank ); # endif #else - *rank = 0; + rank = 0; #endif - return; + + (void)morse; + return rank; } /******************************************************************************* * This returns the size of the distributed computation **/ -void RUNTIME_comm_size( int *size ) +int RUNTIME_comm_size( MORSE_context_t *morse ) { + int size; #if defined(CHAMELEON_USE_MPI) # if defined(HAVE_STARPU_MPI_COMM_RANK) - starpu_mpi_comm_size(MPI_COMM_WORLD, size); + starpu_mpi_comm_size( MPI_COMM_WORLD, &size ); # else - MPI_Comm_size(MPI_COMM_WORLD, size); + MPI_Comm_size( MPI_COMM_WORLD, &size ); # endif #else - *size = 1; + size = 1; #endif - return; -} -/******************************************************************************* - * This returns the number of workers - **/ -int RUNTIME_get_thread_nbr() -{ - return starpu_worker_get_count_by_type( STARPU_CPU_WORKER ); + (void)morse; + return size; } diff --git a/runtime/starpu/control/runtime_descriptor.c b/runtime/starpu/control/runtime_descriptor.c index b14888842d1e0b81cd54df2371268c37a1f383ce..a1e434e8ec10bc17f0c750b00970e8c54146a9f8 100644 --- a/runtime/starpu/control/runtime_descriptor.c +++ b/runtime/starpu/control/runtime_descriptor.c @@ -25,6 +25,9 @@ #include <unistd.h> #include "chameleon_starpu.h" +/******************************************************************************* + * Set the tag sizes + **/ #if defined(CHAMELEON_USE_MPI) /* Take 24 bits for the tile id, and 7 bits for descriptor id. @@ -44,13 +47,9 @@ static int _tag_mpi_initialized_ = 0; #endif -#ifdef STARPU_MALLOC_SIMULATION_FOLDED -#define FOLDED STARPU_MALLOC_SIMULATION_FOLDED -#else -#define FOLDED 0 -#endif - -void RUNTIME_user_tag_size( int user_tag_width, int user_tag_sep ) { +void RUNTIME_comm_set_tag_sizes( int user_tag_width, + int user_tag_sep ) +{ #if defined(CHAMELEON_USE_MPI) if (_tag_mpi_initialized_ == 0) { tag_width = user_tag_width; @@ -63,29 +62,43 @@ void RUNTIME_user_tag_size( int user_tag_width, int user_tag_sep ) { (void)user_tag_width; (void)user_tag_sep; } +/******************************************************************************* + * Malloc/Free of the data + **/ +#ifdef STARPU_MALLOC_SIMULATION_FOLDED +#define FOLDED STARPU_MALLOC_SIMULATION_FOLDED +#else +#define FOLDED 0 +#endif -void *RUNTIME_mat_alloc( size_t size ) +void *RUNTIME_malloc( size_t size ) { #if defined(CHAMELEON_SIMULATION) && !defined(STARPU_MALLOC_SIMULATION_FOLDED) && !defined(CHAMELEON_USE_MPI) return (void*) 1; #else - void *mat; + void *ptr; - if (starpu_malloc_flags(&mat, size, STARPU_MALLOC_PINNED|FOLDED|STARPU_MALLOC_COUNT) != 0) + if (starpu_malloc_flags(&ptr, size, STARPU_MALLOC_PINNED|FOLDED|STARPU_MALLOC_COUNT) != 0) { return NULL; - return mat; + } + return ptr; #endif } -void RUNTIME_mat_free( void *mat, size_t size ) +void RUNTIME_free( void *ptr, + size_t size ) { #if defined(CHAMELEON_SIMULATION) && !defined(STARPU_MALLOC_SIMULATION_FOLDED) && !defined(CHAMELEON_USE_MPI) + (void)ptr; (void)size; return; #else - starpu_free_flags(mat, size, STARPU_MALLOC_PINNED|FOLDED|STARPU_MALLOC_COUNT); + starpu_free_flags(ptr, size, STARPU_MALLOC_PINNED|FOLDED|STARPU_MALLOC_COUNT); #endif } +/******************************************************************************* + * Create data descriptor + **/ void RUNTIME_desc_create( MORSE_desc_t *desc ) { int64_t lmt = desc->lmt; @@ -104,34 +117,34 @@ void RUNTIME_desc_create( MORSE_desc_t *desc ) tiles = (starpu_data_handle_t*)(desc->schedopt); #if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION) - if (desc->use_mat == 1 && desc->register_mat == 1){ - /* - * Register allocated memory as CUDA pinned memory - */ - { - int64_t eltsze = MORSE_Element_Size(desc->dtyp); - size_t size = (size_t)(desc->llm) * (size_t)(desc->lln) * eltsze; - cudaError_t rc; + /* + * Register allocated memory as CUDA pinned memory + */ + if ( (desc->use_mat == 1) && (desc->register_mat == 1) ) + { + int64_t eltsze = MORSE_Element_Size(desc->dtyp); + size_t size = (size_t)(desc->llm) * (size_t)(desc->lln) * eltsze; + cudaError_t rc; - /* Register the matrix as pinned memory */ - rc = cudaHostRegister( desc->mat, size, cudaHostRegisterPortable ); - if ( rc != cudaSuccess ) - { - morse_warning("RUNTIME_desc_create(StarPU): cudaHostRegister - ", cudaGetErrorString( rc )); - } + /* Register the matrix as pinned memory */ + rc = cudaHostRegister( desc->mat, size, cudaHostRegisterPortable ); + if ( rc != cudaSuccess ) + { + morse_warning("RUNTIME_desc_create(StarPU): cudaHostRegister - ", cudaGetErrorString( rc )); } } #endif - if (desc->ooc) { - int lastmm = desc->lm - (desc->lmt-1) * desc->mb; - int lastnn = desc->ln - (desc->lnt-1) * desc->nb; - int64_t eltsze = MORSE_Element_Size(desc->dtyp); - int pagesize = getpagesize(); - if ((desc->mb * desc->nb * eltsze) % pagesize != 0 - || (lastmm * desc->nb * eltsze) % pagesize != 0 - || (desc->mb * lastnn * eltsze) % pagesize != 0 - || (lastmm * lastnn * eltsze) % pagesize != 0) + if (desc->ooc) { + int lastmm = desc->lm - (desc->lmt-1) * desc->mb; + int lastnn = desc->ln - (desc->lnt-1) * desc->nb; + int64_t eltsze = MORSE_Element_Size(desc->dtyp); + int pagesize = getpagesize(); + + if ( ((desc->mb * desc->nb * eltsze) % pagesize != 0) || + ((lastmm * desc->nb * eltsze) % pagesize != 0) || + ((desc->mb * lastnn * eltsze) % pagesize != 0) || + ((lastmm * lastnn * eltsze) % pagesize != 0) ) { morse_error("RUNTIME_desc_create", "Matrix and tile size not suitable for out-of-core: all tiles have to be multiples of 4096. Tip : choose 'n' and 'nb' as both multiples of 32."); return; @@ -153,8 +166,9 @@ void RUNTIME_desc_create( MORSE_desc_t *desc ) morse_error("RUNTIME_desc_create", "MPI_TAG_UB not known by MPI"); } - while ( ((uintptr_t)((1UL<<tag_width) - 1) > (uintptr_t)(*tag_ub) ) - && (tag_width >= TAG_WIDTH_MIN) ) { + while ( ((uintptr_t)((1UL<<tag_width) - 1) > (uintptr_t)(*tag_ub) ) && + (tag_width >= TAG_WIDTH_MIN) ) + { tag_width--; tag_sep--; } @@ -183,6 +197,9 @@ void RUNTIME_desc_create( MORSE_desc_t *desc ) #endif } +/******************************************************************************* + * Destroy data descriptor + **/ void RUNTIME_desc_destroy( MORSE_desc_t *desc ) { desc->occurences--; @@ -197,7 +214,7 @@ void RUNTIME_desc_destroy( MORSE_desc_t *desc ) int lnt = desc->lnt; int m, n; - for (n = 0; n < lnt; n++) + for (n = 0; n < lnt; n++) { for (m = 0; m < lmt; m++) { if (*handle == NULL) @@ -205,13 +222,14 @@ void RUNTIME_desc_destroy( MORSE_desc_t *desc ) handle++; continue; } - //printf("\nUnregister %d %d %d", MORSE_My_Mpi_Rank(), m, n); starpu_data_unregister(*handle); handle++; } + } #if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION) - if (desc->use_mat == 1 && desc->register_mat == 1){ + if ( (desc->use_mat == 1) && (desc->register_mat == 1) ) + { /* Unmap the pinned memory associated to the matrix */ if (cudaHostUnregister(desc->mat) != cudaSuccess) { @@ -226,28 +244,43 @@ void RUNTIME_desc_destroy( MORSE_desc_t *desc ) } } -void RUNTIME_desc_init( MORSE_desc_t *desc ) +/******************************************************************************* + * Acquire data + **/ +int RUNTIME_desc_acquire( const MORSE_desc_t *desc ) { - (void)desc; - return; -} + starpu_data_handle_t *handle = (starpu_data_handle_t*)(desc->schedopt); + int lmt = desc->lmt; + int lnt = desc->lnt; + int m, n; -void RUNTIME_desc_submatrix( MORSE_desc_t *desc ) -{ - desc->occurences++; - return; + for (n = 0; n < lnt; n++) { + for (m = 0; m < lmt; m++) + { + if ( (*handle == NULL) || + !morse_desc_islocal( desc, m, n ) ) + { + handle++; + continue; + } + starpu_data_acquire(*handle, STARPU_R); + handle++; + } + } + return MORSE_SUCCESS; } -/* TODO: Acquire/Release/GetonCPU need to be studied carefully and fixed - * because we are not using them correctly */ -int RUNTIME_desc_acquire( MORSE_desc_t *desc ) +/******************************************************************************* + * Release data + **/ +int RUNTIME_desc_release( const MORSE_desc_t *desc ) { starpu_data_handle_t *handle = (starpu_data_handle_t*)(desc->schedopt); int lmt = desc->lmt; int lnt = desc->lnt; int m, n; - for (n = 0; n < lnt; n++) + for (n = 0; n < lnt; n++) { for (m = 0; m < lmt; m++) { if ( (*handle == NULL) || @@ -256,20 +289,31 @@ int RUNTIME_desc_acquire( MORSE_desc_t *desc ) handle++; continue; } - starpu_data_acquire(*handle, STARPU_R); + starpu_data_release(*handle); handle++; } + } return MORSE_SUCCESS; } -int RUNTIME_desc_release( MORSE_desc_t *desc ) +/******************************************************************************* + * Get data on cpu - Synchronous call + **/ +int RUNTIME_desc_getoncpu( const MORSE_desc_t *desc ) { starpu_data_handle_t *handle = (starpu_data_handle_t*)(desc->schedopt); int lmt = desc->lmt; int lnt = desc->lnt; int m, n; - for (n = 0; n < lnt; n++) + if ( desc->ooc ) { + /* May not even fit */ + morse_warning( "RUNTIME_desc_getoncpu(StarPU)", + "Try to get an out-of-core matrix on main memory. Cancelled as it might not fit" ); + return MORSE_SUCCESS; + } + + for (n = 0; n < lnt; n++) { for (m = 0; m < lmt; m++) { if ( (*handle == NULL) || @@ -278,31 +322,34 @@ int RUNTIME_desc_release( MORSE_desc_t *desc ) handle++; continue; } + + starpu_data_acquire(*handle, STARPU_R); starpu_data_release(*handle); handle++; } + } return MORSE_SUCCESS; } -/** - * For older revision of StarPU, STARPU_MAIN_RAM is not defined - */ -#ifndef STARPU_MAIN_RAM -#define STARPU_MAIN_RAM 0 -#endif - -int RUNTIME_desc_getoncpu( MORSE_desc_t *desc ) +/******************************************************************************* + * Get data on cpu - Asynchronous call + **/ +int RUNTIME_desc_getoncpu_async( const MORSE_desc_t *desc, + MORSE_sequence_t *sequence ) { starpu_data_handle_t *handle = (starpu_data_handle_t*)(desc->schedopt); int lmt = desc->lmt; int lnt = desc->lnt; int m, n; - if (desc->ooc) + if ( desc->ooc ) { /* May not even fit */ + morse_warning( "RUNTIME_desc_getoncpu_async(StarPU)", + "Try to get an out-of-core matrix on main memory. Cancelled as it might not fit" ); return MORSE_SUCCESS; + } - for (n = 0; n < lnt; n++) + for (n = 0; n < lnt; n++) { for (m = 0; m < lmt; m++) { if ( (*handle == NULL) || @@ -312,14 +359,25 @@ int RUNTIME_desc_getoncpu( MORSE_desc_t *desc ) continue; } - starpu_data_acquire(*handle, STARPU_R); - starpu_data_release(*handle); + starpu_data_acquire_cb( *handle, STARPU_R, + (void (*)(void*))&starpu_data_release, *handle ); handle++; } + } + (void)sequence; return MORSE_SUCCESS; } +/******************************************************************************* + * Get data addr + **/ + +/* For older revision of StarPU, STARPU_MAIN_RAM is not defined */ +#ifndef STARPU_MAIN_RAM +#define STARPU_MAIN_RAM 0 +#endif + void *RUNTIME_desc_getaddr( const MORSE_desc_t *desc, int m, int n ) { int64_t im = m + (desc->i / desc->mb); @@ -344,12 +402,12 @@ void *RUNTIME_desc_getaddr( const MORSE_desc_t *desc, int m, int n ) } } - starpu_matrix_data_register(ptrtile, home_node, (uintptr_t) user_ptr, - BLKLDD(desc, im), - tempmm, tempnn, eltsze); + starpu_matrix_data_register( ptrtile, home_node, (uintptr_t) user_ptr, + BLKLDD(desc, im), + tempmm, tempnn, eltsze ); #ifdef HAVE_STARPU_DATA_SET_COORDINATES - starpu_data_set_coordinates(*ptrtile, 2, m, n); + starpu_data_set_coordinates( *ptrtile, 2, m, n ); #endif #if defined(CHAMELEON_USE_MPI) diff --git a/runtime/starpu/control/runtime_profiling.c b/runtime/starpu/control/runtime_profiling.c index 4b9c700d50c9c2430d1094f1ec4b81fedabd10a4..e3021a766ec937a999c2c9d1b624ebc52fe55140 100644 --- a/runtime/starpu/control/runtime_profiling.c +++ b/runtime/starpu/control/runtime_profiling.c @@ -40,6 +40,25 @@ double RUNTIME_get_time(){ return starpu_timing_now()*1e-6; } +/******************************************************************************* + * Set iteration numbers for traces + **/ +void RUNTIME_iteration_push( MORSE_context_t *morse, unsigned long iteration ) +{ + (void)morse; +#if defined(HAVE_STARPU_ITERATION_PUSH) + starpu_iteration_push(iteration); +#endif +} + +void RUNTIME_iteration_pop( MORSE_context_t *morse ) +{ + (void)morse; +#if defined(HAVE_STARPU_ITERATION_PUSH) + starpu_iteration_pop(); +#endif +} + void RUNTIME_start_profiling(){ #if defined(HAVE_STARPU_FXT_PROFILING) starpu_fxt_start_profiling(); diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in index 5c24359b83490110900a61e680381b7115a5b210..82a71f96bde9802b9ec23e303f977c50b31a6d65 100644 --- a/runtime/starpu/include/chameleon_starpu.h.in +++ b/runtime/starpu/include/chameleon_starpu.h.in @@ -59,6 +59,16 @@ #endif #endif +#if defined(CHAMELEON_SIMULATION) +# if !defined(STARPU_SIMGRID) +# error "Starpu was not built with simgrid support (--enable-simgrid). Can not run Chameleon with simulation support." +# endif +#else +# if defined(STARPU_SIMGRID) +# warning "Starpu was built with simgrid support. Better build Chameleon with simulation support (-DCHAMELEON_SIMULATION=YES)." +# endif +#endif + #include "control/common.h" #include "runtime_codelets.h" #include "runtime_profiling.h"