diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c index 6db9a8a40148a67fcaeda74f9a718c949b12e59d..4347f5710f23d5ff450f9eb77bf7f7f5c9f41205 100644 --- a/compute/pzgetrf.c +++ b/compute/pzgetrf.c @@ -150,7 +150,7 @@ chameleon_pzgetrf_panel_facto_percol( struct chameleon_pzgetrf_s *ws, } /* Reduce globally (between MPI processes) */ - INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, h, tempkn ); + INSERT_TASK_zipiv_allreduce( options, A, ipiv, k, h, tempkn, ws ); } /* Flush temporary data used for the pivoting */ @@ -196,7 +196,7 @@ chameleon_pzgetrf_panel_facto_percol_batched( struct chameleon_pzgetrf_s *ws, } INSERT_TASK_zgetrf_panel_offdiag_batched_flush( options, A, k, clargs, ipiv ); - INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, h, tempkn ); + INSERT_TASK_zipiv_allreduce( options, A, ipiv, k, h, tempkn, ws ); } free( clargs ); @@ -250,7 +250,7 @@ chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws, assert( j <= minmn ); /* Reduce globally (between MPI processes) */ - INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, j, tempkn ); + INSERT_TASK_zipiv_allreduce( options, A, ipiv, k, j, tempkn, ws ); if ( ( b < (nbblock-1) ) && ( h == hmax-1 ) ) { INSERT_TASK_zgetrf_blocked_trsm( @@ -312,7 +312,7 @@ chameleon_pzgetrf_panel_facto_blocked_batched( struct chameleon_pzgetrf_s *ws, assert( j <= minmn ); /* Reduce globally (between MPI processes) */ - INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, j, tempkn ); + INSERT_TASK_zipiv_allreduce( options, A, ipiv, k, j, tempkn, ws ); if ( (b < (nbblock-1)) && (h == hmax-1) ) { INSERT_TASK_zgetrf_blocked_trsm( diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index 5f1bbcd322293e3104fca313974096bcf711de71..402c92a3f6d9dcb4ff8e1039b64b62ef051a4a7c 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -575,13 +575,13 @@ void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options, CHAM_desc_t *U, int Um, int Un, CHAM_ipiv_t *ws ); -void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, - const RUNTIME_option_t *options, +void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, + CHAM_desc_t *A, CHAM_ipiv_t *ipiv, - int *proc_involved, int k, int h, - int n ); + int n, + void *ws ); /** ******************************************************************************** diff --git a/runtime/openmp/codelets/codelet_zipiv_allreduce.c b/runtime/openmp/codelets/codelet_zipiv_allreduce.c index b088283254cd64e1bada1628939436327b8a2789..197842ea3e96fdba1a9e1d67152a8a5b3e6196ea 100644 --- a/runtime/openmp/codelets/codelet_zipiv_allreduce.c +++ b/runtime/openmp/codelets/codelet_zipiv_allreduce.c @@ -17,19 +17,19 @@ */ #include "chameleon_openmp.h" -void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, - const RUNTIME_option_t *options, +void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, + CHAM_desc_t *A, CHAM_ipiv_t *ipiv, - int *proc_involved, int k, int h, - int n ) + int n, + void *ws ) { - (void)A; (void)options; + (void)A; (void)ipiv; - (void)proc_involved; (void)k; (void)h; (void)n; + (void)ws; } diff --git a/runtime/parsec/codelets/codelet_zipiv_allreduce.c b/runtime/parsec/codelets/codelet_zipiv_allreduce.c index 75e0611647a464cad9c37e59a5619ebefaae19ed..d6bd3f4c06baf9b1c44e4db6971c88c09acd432f 100644 --- a/runtime/parsec/codelets/codelet_zipiv_allreduce.c +++ b/runtime/parsec/codelets/codelet_zipiv_allreduce.c @@ -17,19 +17,19 @@ */ #include "chameleon_parsec.h" -void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, - const RUNTIME_option_t *options, +void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, + CHAM_desc_t *A, CHAM_ipiv_t *ipiv, - int *proc_involved, int k, int h, - int n ) + int n, + void *ws ) { - (void)A; (void)options; + (void)A; (void)ipiv; - (void)proc_involved; (void)k; (void)h; (void)n; + (void)ws; } diff --git a/runtime/quark/codelets/codelet_zipiv_allreduce.c b/runtime/quark/codelets/codelet_zipiv_allreduce.c index e88269e931f3f210282a1382d44a6ff9516c7453..0186fd142b67d08dcfca01e9b8184b471362ce1c 100644 --- a/runtime/quark/codelets/codelet_zipiv_allreduce.c +++ b/runtime/quark/codelets/codelet_zipiv_allreduce.c @@ -17,19 +17,19 @@ */ #include "chameleon_quark.h" -void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, - const RUNTIME_option_t *options, +void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, + CHAM_desc_t *A, CHAM_ipiv_t *ipiv, - int *proc_involved, int k, int h, - int n ) + int n, + void *ws ) { - (void)A; (void)options; + (void)A; (void)ipiv; - (void)proc_involved; (void)k; (void)h; (void)n; + (void)ws; } diff --git a/runtime/starpu/codelets/codelet_zipiv_allreduce.c b/runtime/starpu/codelets/codelet_zipiv_allreduce.c index a81f0d08eef1fb94b6846606b5e63aae64ab075c..48ecdd0c33fa07f9cfd326f775b0f31fb48a67b8 100644 --- a/runtime/starpu/codelets/codelet_zipiv_allreduce.c +++ b/runtime/starpu/codelets/codelet_zipiv_allreduce.c @@ -22,18 +22,18 @@ struct cl_redux_args_t { int h; int n; - int k; }; -static void cl_zipiv_allreduce_cpu_func( void *descr[], void *cl_arg ) +static void +zipiv_allreduce_cpu_func( cppi_interface_t *cppi_me, + cppi_interface_t *cppi_src, + int h, + int n ) { - struct cl_redux_args_t *clargs = (struct cl_redux_args_t *) cl_arg; - cppi_interface_t *cppi_me = ((cppi_interface_t *) descr[0]); - cppi_interface_t *cppi_src = ((cppi_interface_t *) descr[1]); - CHAM_pivot_t *nextpiv_me = &(cppi_me->pivot); - CHAM_pivot_t *nextpiv_src = &(cppi_src->pivot); - CHAMELEON_Complex64_t *pivrow_me = (CHAMELEON_Complex64_t *)(nextpiv_me->pivrow); - CHAMELEON_Complex64_t *pivrow_src = (CHAMELEON_Complex64_t *)(nextpiv_src->pivrow); + CHAM_pivot_t *nextpiv_me = &(cppi_me->pivot); + CHAM_pivot_t *nextpiv_src = &(cppi_src->pivot); + CHAMELEON_Complex64_t *pivrow_me = (CHAMELEON_Complex64_t *)(nextpiv_me->pivrow); + CHAMELEON_Complex64_t *pivrow_src = (CHAMELEON_Complex64_t *)(nextpiv_src->pivrow); cppi_display_dbg( cppi_me, stderr, "Global redux Inout: "); cppi_display_dbg( cppi_src, stderr, "Global redux Input: "); @@ -43,33 +43,42 @@ static void cl_zipiv_allreduce_cpu_func( void *descr[], void *cl_arg ) assert( cppi_me->flttype == cppi_src->flttype ); assert( cppi_me->arraysize == cppi_src->arraysize ); - if ( cabs( pivrow_src[ clargs->h ] ) > cabs( pivrow_me[ clargs->h ] ) ) { + if ( cabs( pivrow_src[ h ] ) > cabs( pivrow_me[ h ] ) ) { nextpiv_me->blkm0 = nextpiv_src->blkm0; nextpiv_me->blkidx = nextpiv_src->blkidx; - cblas_zcopy( clargs->n, pivrow_src, 1, pivrow_me, 1 ); + cblas_zcopy( n, pivrow_src, 1, pivrow_me, 1 ); } /* Let's copy the diagonal row if needed */ if ( ( cppi_src->has_diag == 1 ) && ( cppi_me->has_diag == -1 ) ) { - cblas_zcopy( clargs->n, nextpiv_src->diagrow, 1, nextpiv_me->diagrow, 1 ); - assert( cppi_src->arraysize == sizeof(CHAMELEON_Complex64_t) * clargs->n ); + cblas_zcopy( n, nextpiv_src->diagrow, 1, nextpiv_me->diagrow, 1 ); + assert( cppi_src->arraysize == sizeof(CHAMELEON_Complex64_t) * n ); cppi_me->has_diag = 1; } cppi_display_dbg( cppi_me, stderr, "Global redux Inout(After): "); } +static void +cl_zipiv_allreduce_cpu_func( void *descr[], void *cl_arg ) +{ + struct cl_redux_args_t *clargs = (struct cl_redux_args_t *) cl_arg; + cppi_interface_t *cppi_me = ((cppi_interface_t *) descr[0]); + cppi_interface_t *cppi_src = ((cppi_interface_t *) descr[1]); + zipiv_allreduce_cpu_func( cppi_me, cppi_src, clargs->h, clargs->n ); +} + CODELETS_CPU( zipiv_allreduce, cl_zipiv_allreduce_cpu_func ) -void -INSERT_TASK_zipiv_allreduce_send( CHAM_ipiv_t *ipiv, - int me, - int dst, - int k, - int h, - const RUNTIME_option_t *options ) +static void +INSERT_TASK_zipiv_allreduce_send( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int me, + int dst, + int k, + int h ) { rt_starpu_insert_task( NULL, @@ -79,20 +88,19 @@ INSERT_TASK_zipiv_allreduce_send( CHAM_ipiv_t *ipiv, 0 ); } -void -INSERT_TASK_zipiv_allreduce_recv( CHAM_ipiv_t *ipiv, - int me, - int src, - int k, - int h, - int n, - const RUNTIME_option_t *options ) +static void +INSERT_TASK_zipiv_allreduce_recv( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int me, + int src, + int k, + int h, + int n ) { struct cl_redux_args_t *clargs; - clargs = malloc( sizeof( struct cl_redux_args_t ) ); + clargs = malloc( sizeof( struct cl_redux_args_t ) ); clargs->h = h; clargs->n = n; - clargs->k = k; rt_starpu_insert_task( &cl_zipiv_allreduce, @@ -106,16 +114,17 @@ INSERT_TASK_zipiv_allreduce_recv( CHAM_ipiv_t *ipiv, starpu_mpi_cache_flush( options->sequence->comm, RUNTIME_pivot_getaddr( ipiv, src, k, h ) ); } -void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, - const RUNTIME_option_t *options, - CHAM_ipiv_t *ipiv, - int *proc_involved, - int k, - int h, - int n ) +static void +zipiv_allreduce_chameleon_starpu_task( const RUNTIME_option_t *options, + CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int *proc_involved, + int k, + int h, + int n ) { - int np_involved = chameleon_min( chameleon_desc_datadist_get_iparam(A, 0), A->mt - k); - int np_iter = np_involved; + int np_involved = chameleon_min( chameleon_desc_datadist_get_iparam(A, 0), A->mt - k); + int np_iter = np_involved; int p_recv, p_send, me; int shift = 1; @@ -140,29 +149,48 @@ void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, p_send = proc_involved[ ( me + shift ) % np_involved ]; p_recv = proc_involved[ ( me - shift + np_involved ) % np_involved ]; - INSERT_TASK_zipiv_allreduce_send( ipiv, A->myrank, p_send, k, h, options ); - INSERT_TASK_zipiv_allreduce_recv( ipiv, A->myrank, p_recv, k, h, n, options ); + INSERT_TASK_zipiv_allreduce_send( options, ipiv, A->myrank, p_send, k, h ); + INSERT_TASK_zipiv_allreduce_recv( options, ipiv, A->myrank, p_recv, k, h, n ); shift = shift << 1; np_iter = chameleon_ceil( np_iter, 2 ); } } } + +void +INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, + CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int k, + int h, + int n, + void *ws ) +{ + struct chameleon_pzgetrf_s *tmp = (struct chameleon_pzgetrf_s *)ws; + cham_getrf_allreduce_t alg = tmp->alg_allreduce; + switch( alg ) { + case ChamStarPUTasks: + default: + zipiv_allreduce_chameleon_starpu_task( options, A, ipiv, tmp->proc_involved, k, h, n ); + } +} #else -void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, - const RUNTIME_option_t *options, - CHAM_ipiv_t *ipiv, - int *proc_involved, - int k, - int h, - int n ) +void +INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, + CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int k, + int h, + int n, + void *ws ) { if ( h > 0 ) { starpu_data_invalidate_submit( RUNTIME_pivot_getaddr( ipiv, A->myrank, k, h-1 ) ); } (void)options; - (void)proc_involved; + (void)ws; (void)n; } #endif