diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h
index 795ebd2d186f9c1e88a44ab6312d40583b1a4d5d..93a5f6e303c8b00076e78fbe7faf58fc59dfe4f7 100644
--- a/include/chameleon/tasks_z.h
+++ b/include/chameleon/tasks_z.h
@@ -562,4 +562,12 @@ void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options,
                                       CHAM_desc_t *U, int Um, int Un,
                                       CHAM_ipiv_t *ws );
 
+void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A,
+                                  const RUNTIME_option_t *options,
+                                  CHAM_ipiv_t *ipiv,
+                                  int *proc_involved,
+                                  int k,
+                                  int h,
+                                  int n );
+
 #endif /* _chameleon_tasks_z_h_ */
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index f7203fbe500d517ea64251ea198600944ce9291c..08279345b7f5d95ba633f3143f7c1b39fe2d6352 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -24,6 +24,7 @@
 # @author Florent Pruvost
 # @author Philippe Virouleau
 # @author Matthieu Kuhn
+# @author Alycia Lisito
 # @date 2024-03-16
 #
 ###
@@ -73,6 +74,7 @@ set(CODELETS_ZSRC
   codelets/codelet_zhe2ge.c
   codelets/codelet_zherfb.c
   codelets/codelet_zhessq.c
+  codelets/codelet_zipiv_allreduce.c
   codelets/codelet_zlacpy.c
   codelets/codelet_zlange.c
   codelets/codelet_zlanhe.c
diff --git a/runtime/starpu/codelets/codelet_zipiv_allreduce.c b/runtime/starpu/codelets/codelet_zipiv_allreduce.c
new file mode 100644
index 0000000000000000000000000000000000000000..9856258bba33499b06156fa83c2ceea00e0f6868
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_zipiv_allreduce.c
@@ -0,0 +1,243 @@
+/**
+ *
+ * @file starpu/codelet_zipiv_allreduce.c
+ *
+ * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon StarPU codelets to do the reduction
+ *
+ * @version 1.3.0
+ * @author Alycia Lisito
+ * @date 2024-06-11
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_starpu.h"
+#include "runtime_codelet_z.h"
+#include <coreblas/cblas_wrapper.h>
+
+#if defined ( CHAMELEON_USE_MPI )
+/**
+ * @brief Arguments forwarded to the pivot all-reduce CPU kernel.
+ */
+struct cl_redux_args_t {
+    int h; /**< Index of the pivot row entry compared by the kernel */
+    int n; /**< Number of elements copied from a pivot or diagonal row */
+    int k; /**< Stored by the caller; not read by the CPU kernel */
+};
+
+/**
+ * @brief CPU kernel merging the pivot candidate of another process into the
+ *        local one.
+ *
+ * The source candidate replaces the local one when the magnitude of its pivot
+ * row entry at index h is strictly larger. The diagonal row is copied from the
+ * source when the source holds it (has_diag == 1) and the local interface does
+ * not (has_diag == -1).
+ *
+ * @param[in,out] descr   descr[0]: local pivot interface, updated in place;
+ *                        descr[1]: pivot interface of the other process.
+ * @param[in]     cl_arg  Pointer to a struct cl_redux_args_t.
+ */
+static void cl_zipiv_allreduce_cpu_func( void *descr[], void *cl_arg )
+{
+    struct cl_redux_args_t *clargs = (struct cl_redux_args_t *) cl_arg;
+    cppi_interface_t *cppi_me = ((cppi_interface_t *) descr[0]);
+    cppi_interface_t *cppi_src = ((cppi_interface_t *) descr[1]);
+    CHAM_pivot_t *nextpiv_me = &(cppi_me->pivot);
+    CHAM_pivot_t *nextpiv_src = &(cppi_src->pivot);
+    CHAMELEON_Complex64_t *pivrow_me = (CHAMELEON_Complex64_t *)(nextpiv_me->pivrow);
+    CHAMELEON_Complex64_t *pivrow_src = (CHAMELEON_Complex64_t *)(nextpiv_src->pivrow);
+
+    cppi_display_dbg( cppi_me, stderr, "Global redux Inout: ");
+    cppi_display_dbg( cppi_src, stderr, "Global redux Input: ");
+
+    assert( cppi_me->n == cppi_src->n );
+    assert( cppi_me->h == cppi_src->h );
+    assert( cppi_me->flttype == cppi_src->flttype );
+    assert( cppi_me->arraysize == cppi_src->arraysize );
+
+    /* Keep the candidate with the largest magnitude at index h */
+    if ( cabs( pivrow_src[ clargs->h ] ) > cabs( pivrow_me[ clargs->h ] ) ) {
+        nextpiv_me->blkm0 = nextpiv_src->blkm0;
+        nextpiv_me->blkidx = nextpiv_src->blkidx;
+        cblas_zcopy( clargs->n, pivrow_src, 1, pivrow_me, 1 );
+    }
+
+    /* Let's copy the diagonal row if needed */
+    if ( ( cppi_src->has_diag == 1 ) &&
+         ( cppi_me->has_diag == -1 ) )
+    {
+        /* The copy below assumes that a row holds exactly n elements */
+        assert( cppi_src->arraysize == clargs->n * sizeof(CHAMELEON_Complex64_t) );
+        cblas_zcopy( clargs->n, nextpiv_src->diagrow, 1, nextpiv_me->diagrow, 1 );
+        cppi_me->has_diag = 1;
+    }
+
+    cppi_display_dbg( cppi_me, stderr, "Global redux Inout(After): ");
+}
+
+CODELETS_CPU( zipiv_allreduce, cl_zipiv_allreduce_cpu_func )
+
+/**
+ * @brief Submits a task without codelet reading the pivot of me on node dst.
+ *
+ * The task has no kernel (NULL codelet): it only declares, on node dst, a read
+ * access to the pivot handle of rank me. NOTE(review): this presumably makes
+ * StarPU-MPI send the data to dst - confirm against rt_starpu_insert_task().
+ *
+ * @param[in] ipiv     Pivot descriptor the handle is looked up in.
+ * @param[in] me       Rank whose pivot handle is read.
+ * @param[in] dst      Rank the task is executed on.
+ * @param[in] k        Forwarded to RUNTIME_pivot_getaddr().
+ * @param[in] h        Forwarded to RUNTIME_pivot_getaddr().
+ * @param[in] options  Runtime options; only the priority is read.
+ */
+void
+INSERT_TASK_zipiv_allreduce_send( CHAM_ipiv_t *ipiv,
+                                  int me,
+                                  int dst,
+                                  int k,
+                                  int h,
+                                  const RUNTIME_option_t *options )
+{
+    rt_starpu_insert_task(
+        NULL,
+        STARPU_EXECUTE_ON_NODE, dst,
+        STARPU_R, RUNTIME_pivot_getaddr( ipiv, me, k, h ),
+        STARPU_PRIORITY, options->priority,
+        0 );
+}
+
+/**
+ * @brief Submits the task merging the pivot of rank src into the one of me.
+ *
+ * The task runs cl_zipiv_allreduce on node me, with the pivot handle of me in
+ * read/write mode and the one of src in read mode. starpu_mpi_cache_flush()
+ * is called on the handle of src right after the submission.
+ *
+ * @param[in] ipiv     Pivot descriptor the handles are looked up in.
+ * @param[in] me       Rank whose pivot handle is updated by the task.
+ * @param[in] src      Rank whose pivot handle is merged in.
+ * @param[in] k        Forwarded to RUNTIME_pivot_getaddr() and to the kernel.
+ * @param[in] h        Forwarded to RUNTIME_pivot_getaddr(); also the index of
+ *                     the pivot row entry compared by the kernel.
+ * @param[in] n        Number of elements copied by the kernel.
+ * @param[in] options  Runtime options; workerid, priority, sequence are read.
+ */
+void
+INSERT_TASK_zipiv_allreduce_recv( CHAM_ipiv_t *ipiv,
+                                  int me,
+                                  int src,
+                                  int k,
+                                  int h,
+                                  int n,
+                                  const RUNTIME_option_t *options )
+{
+    struct cl_redux_args_t *clargs;
+    /* NOTE(review): not freed here; STARPU_CL_ARGS presumably hands it to StarPU */
+    clargs = malloc( sizeof( struct cl_redux_args_t ) );
+    clargs->h = h;
+    clargs->n = n;
+    clargs->k = k;
+
+    rt_starpu_insert_task(
+        &cl_zipiv_allreduce,
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_redux_args_t),
+        STARPU_RW, RUNTIME_pivot_getaddr( ipiv, me, k, h ),
+        STARPU_R, RUNTIME_pivot_getaddr( ipiv, src, k, h ),
+        STARPU_EXECUTE_ON_NODE, me,
+        STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_PRIORITY, options->priority,
+        0 );
+    starpu_mpi_cache_flush( options->sequence->comm, RUNTIME_pivot_getaddr( ipiv, src, k, h ) );
+}
+
+/**
+ * @brief Submits the all-reduce of the pivot among the ranks of proc_involved.
+ *
+ * The pivot handle of index h-1 is invalidated first when h > 0. Nothing more
+ * is submitted when h >= ipiv->n or when a single process is involved.
+ * Otherwise, at each round, the local rank is read by the process located
+ * shift positions after it in proc_involved and merges the pivot of the one
+ * located shift positions before it; shift doubles at every round.
+ *
+ * @param[in] A              Descriptor; p, mt and myrank are read.
+ * @param[in] options        Runtime options forwarded to the inserted tasks.
+ * @param[in] ipiv           Pivot descriptor.
+ * @param[in] proc_involved  Ranks taking part in the reduction; A->myrank
+ *                           must be one of them.
+ * @param[in] k              Index compared against A->mt:
+ *                           min( A->p, A->mt - k ) processes are involved.
+ * @param[in] h              Index of the pivot to reduce; nothing is reduced
+ *                           when h >= ipiv->n.
+ * @param[in] n              Number of elements copied by the merge kernel.
+ */
+void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A,
+                                  const RUNTIME_option_t *options,
+                                  CHAM_ipiv_t *ipiv,
+                                  int *proc_involved,
+                                  int k,
+                                  int h,
+                                  int n )
+{
+    int np_involved = chameleon_min( A->p, A->mt - k);
+    int np_iter = np_involved;
+    int p_recv, p_send, me;
+    int shift = 1;
+
+    if ( h > 0 ) {
+        starpu_data_invalidate_submit( RUNTIME_pivot_getaddr( ipiv, A->myrank, k, h-1 ) );
+    }
+    if ( h >= ipiv->n ) {
+        return;
+    }
+
+    if ( np_involved == 1 ) {
+        assert( proc_involved[0] == A->myrank );
+    }
+    else {
+        /* Locate the local rank within the list of involved processes */
+        for( me = 0; me < np_involved; me++ ) {
+            if ( proc_involved[me] == A->myrank ) {
+                break;
+            }
+        }
+        assert( me < np_involved );
+        while ( np_iter > 1 ) {
+            p_send = proc_involved[ ( me + shift ) % np_involved ];
+            p_recv = proc_involved[ ( me - shift + np_involved ) % np_involved ];
+
+            INSERT_TASK_zipiv_allreduce_send( ipiv, A->myrank, p_send, k, h, options );
+            INSERT_TASK_zipiv_allreduce_recv( ipiv, A->myrank, p_recv, k, h, n, options );
+
+            shift = shift << 1;
+            np_iter = chameleon_ceil( np_iter, 2 );
+        }
+    }
+}
+#else
+/**
+ * @brief Variant built without CHAMELEON_USE_MPI: only invalidates the pivot
+ *        handle of index h-1 when h > 0.
+ */
+void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A,
+                                  const RUNTIME_option_t *options,
+                                  CHAM_ipiv_t *ipiv,
+                                  int *proc_involved,
+                                  int k,
+                                  int h,
+                                  int n )
+{
+    if ( h > 0 ) {
+        starpu_data_invalidate_submit( RUNTIME_pivot_getaddr( ipiv, A->myrank, k, h-1 ) );
+    }
+
+    (void)options;
+    (void)proc_involved;
+    (void)n;
+}
+#endif
diff --git a/runtime/starpu/include/cppi_interface.h b/runtime/starpu/include/cppi_interface.h
index 7dbd10118c6bee637c9c49b6bc5bdf9d3fc008e2..8113c453fb2d344bda5972430e9e8337a3b0f880 100644
--- a/runtime/starpu/include/cppi_interface.h
+++ b/runtime/starpu/include/cppi_interface.h
@@ -12,6 +12,7 @@
  * @version 1.3.0
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
  * @date 2023-08-22
  *
  */
@@ -44,6 +45,7 @@ struct cppi_interface_s
 void cppi_interface_init();
 void cppi_interface_fini();
 
+CHAM_pivot_t *cppi_handle_get( starpu_data_handle_t handle );
 void cppi_register( starpu_data_handle_t *handleptr,
                     cham_flttype_t flttype,
                     int n,
@@ -61,13 +63,14 @@ cppi_display_dbg( cppi_interface_t *cppi_interface, FILE *f, const char *title )
     diagrow = cppi_interface->pivot.diagrow;
     pivrow = cppi_interface->pivot.pivrow;
 
-    fprintf( f, "%sn=%2d, h=%2d, has_diag=%2d, m0=%2d, idx=%2d\n",
+    fprintf( f, "%sn=%2d, h=%2d, has_diag=%2d, m0=%2d, idx=%2d, interf = %p\n",
              title,
              cppi_interface->n,
              cppi_interface->h,
              cppi_interface->has_diag,
              cppi_interface->pivot.blkm0,
-             cppi_interface->pivot.blkidx );
+             cppi_interface->pivot.blkidx,
+             (void *)cppi_interface );
 
     fprintf(stderr, "Diagonal row: " );
     for( i=0; i<cppi_interface->n; i++) {
diff --git a/runtime/starpu/interface/cppi_interface.c b/runtime/starpu/interface/cppi_interface.c
index 2d1754ec1cee030c040ee1c941cf283a6f58b284..6b1f8063180e78dbebf1ea443ee28f98920a7723 100644
--- a/runtime/starpu/interface/cppi_interface.c
+++ b/runtime/starpu/interface/cppi_interface.c
@@ -12,13 +12,14 @@
  * @version 1.3.0
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
+ * @author Alycia Lisito
  * @date 2023-08-22
  *
  */
 #include "chameleon_starpu.h"
 #undef HAVE_STARPU_REUSE_DATA_ON_NODE
 
-static inline CHAM_pivot_t *
+CHAM_pivot_t *
 cppi_handle_get( starpu_data_handle_t handle )
 {
     cppi_interface_t *cppi_interface = (cppi_interface_t *)
@@ -38,7 +39,7 @@ cppi_init( void *data_interface )
     cppi_interface_t *cppi_interface = (cppi_interface_t *)data_interface;
     cppi_interface->id = CPPI_INTERFACE_ID;
     cppi_interface->h = -1;
-    cppi_interface->has_diag = 0;
+    cppi_interface->has_diag = -1;
 }
 
 static void
@@ -83,7 +84,7 @@ cppi_allocate_data_on_node( void *data_interface, unsigned node )
 
     /* update the data properly in consequence */
     cppi_interface->h = -1;
-    cppi_interface->has_diag = 0;
+    cppi_interface->has_diag = -1;
     cppi_interface->pivot.pivrow = dataptr;
     cppi_interface->pivot.diagrow = ((char*)dataptr) + cppi_interface->arraysize;
 
@@ -279,8 +280,10 @@ cppi_describe( void *data_interface, char *buf, size_t size )
 {
     cppi_interface_t *cppi_interface = (cppi_interface_t *) data_interface;
 
-    return snprintf( buf, size, "Pivot structure, n %d, blkm0 %d, blkidx %d",
+    return snprintf( buf, size, "Pivot structure, n %d, h %d, has_diag = %d, blkm0 %d, blkidx %d",
                      cppi_interface->n,
+                     cppi_interface->h,
+                     cppi_interface->has_diag,
                      cppi_interface->pivot.blkm0,
                      cppi_interface->pivot.blkidx );
 }
@@ -298,6 +301,7 @@ cppi_copy_any_to_any( void *src_interface, unsigned src_node,
     STARPU_ASSERT( cppi_interface_src->flttype == cppi_interface_dst->flttype );
 
     cppi_interface_dst->h = cppi_interface_src->h;
+    cppi_interface_dst->has_diag = cppi_interface_src->has_diag;
     cppi_interface_dst->pivot.blkm0 = cppi_interface_src->pivot.blkm0;
     cppi_interface_dst->pivot.blkidx = cppi_interface_src->pivot.blkidx;
 
@@ -402,8 +406,8 @@ cl_cppi_redux_cpu_func(void *descr[], void *cl_arg)
     assert( cppi_redux->h == cppi_input->h );
 
     /* Let's copy the diagonal row if needed */
-    if ( cppi_input->has_diag ) {
-        assert( cppi_redux->has_diag == 0 );
+    if ( cppi_input->has_diag == 1 ) {
+        assert( cppi_redux->has_diag == -1 );
 
         memcpy( cppi_redux->pivot.diagrow,
                 cppi_input->pivot.diagrow,
@@ -449,7 +453,7 @@ cl_cppi_init_redux_cpu_func( void *descr[], void *cl_arg )
     cppi_interface_t *cppi_redux = ((cppi_interface_t *) descr[0]);
 
     /* Redux pivot never has diagonal at initialization */
-    cppi_redux->has_diag = 0;
+    cppi_redux->has_diag = -1;
     cppi_redux->h = -1;
 
     size_t size = cppi_redux->arraysize;
@@ -497,7 +501,7 @@ cppi_register( starpu_data_handle_t *handleptr,
         .id = CPPI_INTERFACE_ID,
         .arraysize = n * CHAMELEON_Element_Size( flttype ),
         .flttype = flttype,
-        .has_diag = 0,
+        .has_diag = -1,
         .h = -1,
         .n = n,
     };