diff --git a/compute/zgetrf.c b/compute/zgetrf.c index 169d038b57ae520073ce244f8b82d902ef1366ff..d69abaecb3ea9de4f21b5b34c5789f44f4b1380f 100644 --- a/compute/zgetrf.c +++ b/compute/zgetrf.c @@ -59,7 +59,7 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A ) { CHAM_context_t *chamctxt; struct chameleon_pzgetrf_s *ws; - int lookahead; + int lookahead, batch_size; chamctxt = chameleon_context_self(); if ( chamctxt == NULL ) { @@ -114,21 +114,16 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A ) chameleon_cleanenv( allreduce ); } - ws->batch_size_blas2 = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_BLAS2", 0 ); - if ( ws->batch_size_blas2 > CHAMELEON_BATCH_SIZE ) { - chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE_BLAS2 must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE_BLAS2 value\n" ); - ws->batch_size_blas2 = CHAMELEON_BATCH_SIZE; - } - ws->batch_size_blas3 = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_BLAS3", 0 ); - if ( ws->batch_size_blas3 > CHAMELEON_BATCH_SIZE ) { - chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE_BLAS3 must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE_BLAS3 value\n" ); - ws->batch_size_blas3 = CHAMELEON_BATCH_SIZE; - } - ws->batch_size_swap = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_SWAP", 0 ); - if ( ws->batch_size_swap > CHAMELEON_BATCH_SIZE ) { - chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE_SWAP must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE_SWAP value\n" ); - ws->batch_size_swap = CHAMELEON_BATCH_SIZE; + batch_size = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE", 0 ); + if ( batch_size > CHAMELEON_BATCH_SIZE ) { + chameleon_warning( 
"CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE value\n" ); } + ws->batch_size_blas2 = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_BLAS2", batch_size ); + ws->batch_size_blas2 = ( ws->batch_size_blas2 > CHAMELEON_BATCH_SIZE ) ? CHAMELEON_BATCH_SIZE : ws->batch_size_blas2; + ws->batch_size_blas3 = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_BLAS3", batch_size ); + ws->batch_size_blas3 = ( ws->batch_size_blas3 > CHAMELEON_BATCH_SIZE ) ? CHAMELEON_BATCH_SIZE : ws->batch_size_blas3; + ws->batch_size_swap = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_SWAP", batch_size ); + ws->batch_size_swap = ( ws->batch_size_swap > CHAMELEON_BATCH_SIZE ) ? CHAMELEON_BATCH_SIZE : ws->batch_size_swap; ws->ringswitch = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_RINGSWITCH", INT_MAX ); diff --git a/include/chameleon/flops.h b/include/chameleon/flops.h index dacb47113a618dfaf49ab20f9133d4cfb19da720..0635491bfbdc3c93046b5bd9fa13b418ac8b728d 100644 --- a/include/chameleon/flops.h +++ b/include/chameleon/flops.h @@ -71,10 +71,12 @@ #define FMULS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)+1.)) #define FADDS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.)) - #define FMULS_TRMM(__side, __m, __n) ( ( (__side) == ChamLeft ) ? FMULS_TRMM_2((__m), (__n)) : FMULS_TRMM_2((__n), (__m)) ) #define FADDS_TRMM(__side, __m, __n) ( ( (__side) == ChamLeft ) ? FADDS_TRMM_2((__m), (__n)) : FADDS_TRMM_2((__n), (__m)) ) +#define FMULS_TRSM_UNIT_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.)) +#define FMULS_TRSM_UNIT(__side, __m, __n) ( ( (__side) == ChamLeft ) ? 
FMULS_TRSM_UNIT_2((__m), (__n)) : FMULS_TRSM_UNIT_2((__n), (__m)) ) + #define FMULS_TRSM FMULS_TRMM #define FADDS_TRSM FADDS_TRMM @@ -236,6 +238,11 @@ static inline double flops_ctrsm( cham_side_t __side, double __m, double __n) { static inline double flops_dtrsm( cham_side_t __side, double __m, double __n) { double flops = ( FMULS_TRSM(__side, (__m), (__n)) + FADDS_TRSM(__side, (__m), (__n)) ); return flops; } static inline double flops_strsm( cham_side_t __side, double __m, double __n) { double flops = ( FMULS_TRSM(__side, (__m), (__n)) + FADDS_TRSM(__side, (__m), (__n)) ); return flops; } +static inline double flops_ztrsm_unit( cham_side_t __side, double __m, double __n) { double flops = (6. * FMULS_TRSM_UNIT(__side, (__m), (__n)) + 2.0 * FADDS_TRSM(__side, (__m), (__n)) ); return flops; } +static inline double flops_ctrsm_unit( cham_side_t __side, double __m, double __n) { double flops = (6. * FMULS_TRSM_UNIT(__side, (__m), (__n)) + 2.0 * FADDS_TRSM(__side, (__m), (__n)) ); return flops; } +static inline double flops_dtrsm_unit( cham_side_t __side, double __m, double __n) { double flops = ( FMULS_TRSM_UNIT(__side, (__m), (__n)) + FADDS_TRSM(__side, (__m), (__n)) ); return flops; } +static inline double flops_strsm_unit( cham_side_t __side, double __m, double __n) { double flops = ( FMULS_TRSM_UNIT(__side, (__m), (__n)) + FADDS_TRSM(__side, (__m), (__n)) ); return flops; } + /* * Lapack */ @@ -347,10 +354,68 @@ static inline double flops_cgebrd( double __m, double __n) { double flops = (6. static inline double flops_dgebrd( double __m, double __n) { double flops = ( FMULS_GEBRD((__m), (__n)) + FADDS_GEBRD((__m), (__n)) ); return flops; } static inline double flops_sgebrd( double __m, double __n) { double flops = ( FMULS_GEBRD((__m), (__n)) + FADDS_GEBRD((__m), (__n)) ); return flops; } +static inline double flops_zscal( double __m ) { double flops = (6. * (double)(__m)); return flops; } +static inline double flops_cscal( double __m ) { double flops = (6. 
* (double)(__m)); return flops; } +static inline double flops_dscal( double __m ) { double flops = ( (double)(__m)); return flops; } +static inline double flops_sscal( double __m ) { double flops = ( (double)(__m)); return flops; } + /* * Norms */ #define FMULS_LANGE(__m, __n) ((double)(__m) * (double)(__n)) #define FADDS_LANGE(__m, __n) ((double)(__m) * (double)(__n)) +/* + * Getrf with partial pivoting + */ +#define FLOPS_GETRF_BLOCKED_OFFDIAG( _prec_ ) \ + static inline double flops_##_prec_##getrf_blocked_offdiag( int m, int n, int h, int ib ) \ + { \ + double flops = 0.; \ + int kk, nn; \ + if ( h == 0 ) { \ + return 0.; \ + } \ + /* scal */ \ + flops += flops_##_prec_##scal( m ); \ + /* blas 3 gemm */ \ + if ( h % ib == 0 ) { \ + kk = ib; \ + nn = n - h; \ + } \ + /* blas 2 geru */ \ + else { \ + kk = 1; \ + nn = ib - h % ib; \ + } \ + flops += flops_##_prec_##gemm( m, nn, kk ); \ + return flops; \ + } + +FLOPS_GETRF_BLOCKED_OFFDIAG( z ) +FLOPS_GETRF_BLOCKED_OFFDIAG( c ) +FLOPS_GETRF_BLOCKED_OFFDIAG( d ) +FLOPS_GETRF_BLOCKED_OFFDIAG( s ) + +/* +1 for the 1/pivot */ +static inline double flops_zgetrf_blocked_diag( int m, int n, int h, int ib ){ return flops_zgetrf_blocked_offdiag( m-h, n, h, ib ) + 1. * 6.; } +static inline double flops_cgetrf_blocked_diag( int m, int n, int h, int ib ){ return flops_cgetrf_blocked_offdiag( m-h, n, h, ib ) + 1. * 6.; } +static inline double flops_dgetrf_blocked_diag( int m, int n, int h, int ib ){ return flops_dgetrf_blocked_offdiag( m-h, n, h, ib ) + 1.; } +static inline double flops_sgetrf_blocked_diag( int m, int n, int h, int ib ){ return flops_sgetrf_blocked_offdiag( m-h, n, h, ib ) + 1.; } + +static inline double flops_zgetrf_percol_diag( int m, int n, int h ){ return flops_zgetrf_blocked_offdiag( m-h, n, h, n ) + 1. * 6.; } +static inline double flops_cgetrf_percol_diag( int m, int n, int h ){ return flops_cgetrf_blocked_offdiag( m-h, n, h, n ) + 1. 
* 6.; } +static inline double flops_dgetrf_percol_diag( int m, int n, int h ){ return flops_dgetrf_blocked_offdiag( m-h, n, h, n ) + 1.; } +static inline double flops_sgetrf_percol_diag( int m, int n, int h ){ return flops_sgetrf_blocked_offdiag( m-h, n, h, n ) + 1.; } + +static inline double flops_zgetrf_percol_offdiag( int m, int n, int h ){ return flops_zgetrf_blocked_offdiag( m, n, h, n ); } +static inline double flops_cgetrf_percol_offdiag( int m, int n, int h ){ return flops_cgetrf_blocked_offdiag( m, n, h, n ); } +static inline double flops_dgetrf_percol_offdiag( int m, int n, int h ){ return flops_dgetrf_blocked_offdiag( m, n, h, n ); } +static inline double flops_sgetrf_percol_offdiag( int m, int n, int h ){ return flops_sgetrf_blocked_offdiag( m, n, h, n ); } + +static inline double flops_zgetrf_trsm( int m, int n, int h, int ib ){ return ( n - h ) > 0 ? flops_ztrsm_unit( ChamLeft, ib, n-h ) : 0.; } +static inline double flops_cgetrf_trsm( int m, int n, int h, int ib ){ return ( n - h ) > 0 ? flops_ctrsm_unit( ChamLeft, ib, n-h ) : 0.; } +static inline double flops_dgetrf_trsm( int m, int n, int h, int ib ){ return ( n - h ) > 0 ? flops_dtrsm_unit( ChamLeft, ib, n-h ) : 0.; } +static inline double flops_sgetrf_trsm( int m, int n, int h, int ib ){ return ( n - h ) > 0 ? 
flops_strsm_unit( ChamLeft, ib, n-h ) : 0.; } + #endif /* _flops_h_ */ diff --git a/runtime/starpu/codelets/codelet_zgeadd.c b/runtime/starpu/codelets/codelet_zgeadd.c index aace97617b754005d6a12c3dd55641006cb94f5a..0cdc6cafaf4060132b0b770c4d532eab085c8e5f 100644 --- a/runtime/starpu/codelets/codelet_zgeadd.c +++ b/runtime/starpu/codelets/codelet_zgeadd.c @@ -78,6 +78,7 @@ cl_zgeadd_cuda_func( void *descr[], void *cl_arg ) CODELETS( zgeadd, cl_zgeadd_cpu_func, cl_zgeadd_cuda_func, STARPU_CUDA_ASYNC ) #if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, cham_trans_t trans, int m, int n, int nb, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, @@ -133,7 +134,7 @@ void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, (void)nb; } -#else +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, cham_trans_t trans, int m, int n, int nb, @@ -156,8 +157,8 @@ void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, * Set the data handles and initialize exchanges if needed */ starpu_cham_exchange_init_params( options, &params, B->get_rankof( B, Bm, Bn ) ); - starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R ); - starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, accessB ); + starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R ); + starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, accessB ); /* * Not involved, let's return @@ -209,4 +210,4 @@ void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, (void)nb; } -#endif +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c index de400f20da43773e00b960a6fb02cfa640726137..1fe76c9b782c0b52de548f1d851e17250c732dc5 100644 --- a/runtime/starpu/codelets/codelet_zgemm.c +++ 
b/runtime/starpu/codelets/codelet_zgemm.c @@ -127,6 +127,7 @@ CODELETS( zgemm, cl_zgemm_cpu_func, cl_zgemm_cuda_func, STARPU_CUDA_ASYNC ) #endif #if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options, cham_trans_t transA, cham_trans_t transB, int m, int n, int k, int nb, @@ -297,7 +298,7 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options, 0 ); } -#else +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ void __INSERT_TASK_zgemm( const RUNTIME_option_t *options, int xrank, int accessC, @@ -319,9 +320,9 @@ void __INSERT_TASK_zgemm( const RUNTIME_option_t *options, * Register the data handles and initialize exchanges if needed */ starpu_cham_exchange_init_params( options, &params, xrank ); - starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R ); - starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, STARPU_R ); - starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC ); + starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R ); + starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, STARPU_R ); + starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC ); /* * Not involved, let's return @@ -425,4 +426,5 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options, B, Bm, Bn, beta, C, Cm, Cn ); } -#endif + +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ diff --git a/runtime/starpu/codelets/codelet_zgetrf_batched.c b/runtime/starpu/codelets/codelet_zgetrf_batched.c index 4bb70d4b45da5d93cd6f0c1e168109e27d872b84..0ff4ed9854228109928e30ae4b34013338a32a5c 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_batched.c +++ b/runtime/starpu/codelets/codelet_zgetrf_batched.c @@ -24,7 +24,7 @@ #include "chameleon_starpu_internal.h" #include "runtime_codelet_z.h" -struct cl_getrf_batched_args_t { +struct 
cl_zgetrf_batched_args_s { const char *cl_name; int tasks_nbr; int diag; @@ -36,16 +36,39 @@ struct cl_getrf_batched_args_t { struct starpu_data_descr handle_mode[CHAMELEON_BATCH_SIZE]; }; +static inline double flops_zgetrf_percol_batched( int *m, int *n, int h, int t ) +{ + double flops = 0.; + int k; + for ( k = 0; k < t; k ++ ) { + flops += flops_zgetrf_percol_offdiag( m[k], n[k], h ); + } + return flops; +} + +static inline double flops_zgetrf_blocked_batched( int *m, int *n, int h, int ib, int d, int t ) +{ + double flops = 0.; + int k; + if ( d == 1 ) { + flops += flops_zgetrf_blocked_diag( m[0]-h, n[0], h, ib ); + } + for ( k = d; k < t; k ++ ) { + flops += flops_zgetrf_blocked_offdiag( m[k]-h, n[k], h, ib ); + } + return flops; +} + #if !defined(CHAMELEON_SIMULATION) static void cl_zgetrf_panel_offdiag_batched_cpu_func( void *descr[], void *cl_arg ) { - struct cl_getrf_batched_args_t *clargs = (struct cl_getrf_batched_args_t *) cl_arg; - cppi_interface_t *nextpiv = (cppi_interface_t*) descr[ clargs->tasks_nbr ]; - cppi_interface_t *prevpiv = (cppi_interface_t*) descr[ clargs->tasks_nbr + 1 ]; - int i, m, n, h, m0, lda; - CHAM_tile_t *tileA; + struct cl_zgetrf_batched_args_s *clargs = (struct cl_zgetrf_batched_args_s *) cl_arg; + cppi_interface_t *nextpiv = (cppi_interface_t*) descr[ clargs->tasks_nbr ]; + cppi_interface_t *prevpiv = (cppi_interface_t*) descr[ clargs->tasks_nbr + 1 ]; + int i, m, n, h, m0, lda; + CHAM_tile_t *tileA; nextpiv->h = clargs->h; nextpiv->has_diag = chameleon_max( -1, nextpiv->has_diag ); @@ -73,19 +96,18 @@ INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options, void **clargs_ptr, CHAM_ipiv_t *ipiv ) { - int task_num = 0; - int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size_blas2; - void (*callback)(void*) = NULL; - struct cl_getrf_batched_args_t *clargs = *clargs_ptr; - int rankA = A->get_rankof( A, Am, An ); - if ( rankA != A->myrank ) { - return; - } #if !defined(HAVE_STARPU_NONE_NONZERO) /* 
STARPU_NONE can't be equal to 0 */ - fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" ); + fprintf( stderr, "INSERT_TASK_zgetrf_percol_offdiag_batched: STARPU_NONE can not be equal to 0\n" ); assert( 0 ); #endif + int task_num = 0; + int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size_blas2; + struct cl_zgetrf_batched_args_s *clargs = *clargs_ptr; + int rankA = A->get_rankof( A, Am, An ); + if ( rankA != A->myrank ) { + return; + } /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; @@ -93,8 +115,8 @@ INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options, CHAMELEON_END_ACCESS_DECLARATION; if ( clargs == NULL ) { - clargs = malloc( sizeof( struct cl_getrf_batched_args_t ) ) ; - memset( clargs, 0, sizeof( struct cl_getrf_batched_args_t ) ); + clargs = malloc( sizeof( struct cl_zgetrf_batched_args_s ) ) ; + memset( clargs, 0, sizeof( struct cl_zgetrf_batched_args_s ) ); clargs->tasks_nbr = 0; clargs->h = h; clargs->cl_name = "zgetrf_panel_offdiag_batched"; @@ -114,39 +136,26 @@ INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options, A->get_blktile( A, Am, An ) ); if ( clargs->tasks_nbr == batch_size ) { - int access_npiv = ( h == ipiv->n ) ? STARPU_R : STARPU_REDUX; - int access_ppiv = ( h == 0 ) ? STARPU_NONE : STARPU_R; - rt_starpu_insert_task( - &cl_zgetrf_panel_offdiag_batched, - /* Task codelet arguments */ - STARPU_CL_ARGS, clargs, sizeof(struct cl_getrf_batched_args_t), - STARPU_DATA_MODE_ARRAY, clargs->handle_mode, clargs->tasks_nbr, - access_npiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ), - access_ppiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, - STARPU_EXECUTE_ON_WORKER, options->workerid, - 0 ); - - /* clargs is freed by starpu. 
*/ - *clargs_ptr = NULL; + INSERT_TASK_zgetrf_panel_offdiag_batched_flush( options, A, An, clargs_ptr, ipiv ); } } +#if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zgetrf_panel_offdiag_batched_flush( const RUNTIME_option_t *options, CHAM_desc_t *A, int An, void **clargs_ptr, CHAM_ipiv_t *ipiv ) { - void (*callback)(void*) = NULL; - struct cl_getrf_batched_args_t *clargs = *clargs_ptr; - int rankA = A->myrank; #if !defined(HAVE_STARPU_NONE_NONZERO) /* STARPU_NONE can't be equal to 0 */ - fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" ); + fprintf( stderr, "INSERT_TASK_zgetrf_percol_offdiag_batched: STARPU_NONE can not be equal to 0\n" ); assert( 0 ); #endif + void (*callback)(void*) = NULL; + struct cl_zgetrf_batched_args_s *clargs = *clargs_ptr; + int rankA = A->myrank; if ( clargs == NULL ) { return; @@ -157,7 +166,7 @@ INSERT_TASK_zgetrf_panel_offdiag_batched_flush( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zgetrf_panel_offdiag_batched, /* Task codelet arguments */ - STARPU_CL_ARGS, clargs, sizeof(struct cl_getrf_batched_args_t), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zgetrf_batched_args_s), STARPU_DATA_MODE_ARRAY, clargs->handle_mode, clargs->tasks_nbr, access_npiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h ), access_ppiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h-1 ), @@ -171,12 +180,75 @@ INSERT_TASK_zgetrf_panel_offdiag_batched_flush( const RUNTIME_option_t *options, *clargs_ptr = NULL; } +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ + +void +INSERT_TASK_zgetrf_panel_offdiag_batched_flush( const RUNTIME_option_t *options, + CHAM_desc_t *A, int An, + void **clargs_ptr, + CHAM_ipiv_t *ipiv ) +{ + struct cl_zgetrf_batched_args_s *myclargs = *clargs_ptr; + int rankA = A->myrank; + int k, ret, access_npiv, access_ppiv; + struct starpu_task *task; + + if ( myclargs == NULL ) { + return; + } + + INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zgetrf_panel_percol_offdiag_batched, 
zgetrf_panel_offdiag_batched, zgetrf_batched, myclargs->tasks_nbr + 2 ); + + access_npiv = ( myclargs->h == ipiv->n ) ? STARPU_R : STARPU_REDUX; + access_ppiv = ( myclargs->h == 0 ) ? STARPU_NONE : STARPU_R; + + /* + * Register the data handles, no exchange needed + */ + starpu_cham_exchange_init_params( options, &params, rankA ); + for ( k = 0; k < myclargs->tasks_nbr; k++ ) { + starpu_cham_register_descr( &nbdata, descrs, myclargs->handle_mode[ k ].handle, STARPU_RW ); + } + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, myclargs->h ), access_npiv ); + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, myclargs->h-1 ), access_ppiv ); + + task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + task->cl_arg = myclargs; + task->cl_arg_size = sizeof( struct cl_zgetrf_batched_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, NULL ); + + /* Flops */ + task->flops = flops_zgetrf_percol_batched( myclargs->m, myclargs->n, myclargs->h, myclargs->tasks_nbr ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zgetrf_percol_diag", "Failed to submit the task to StarPU" ); + return; + } + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); + + /* clargs is freed by starpu. 
*/ + *clargs_ptr = NULL; + (void)clargs; + (void)cl_name; +} + +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ + #if !defined(CHAMELEON_SIMULATION) static void cl_zgetrf_panel_blocked_batched_cpu_func( void *descr[], void *cl_arg ) { - struct cl_getrf_batched_args_t *clargs = ( struct cl_getrf_batched_args_t * ) cl_arg; + struct cl_zgetrf_batched_args_s *clargs = ( struct cl_zgetrf_batched_args_s * ) cl_arg; int *ipiv; cppi_interface_t *nextpiv = (cppi_interface_t*) descr[clargs->tasks_nbr ]; cppi_interface_t *prevpiv = (cppi_interface_t*) descr[clargs->tasks_nbr + 1]; @@ -241,21 +313,19 @@ INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options, void **clargs_ptr, CHAM_ipiv_t *ipiv ) { - struct chameleon_pzgetrf_s *tmp = (struct chameleon_pzgetrf_s *) ws; - int ib = tmp->ib; - int batch_size = ( (h % ib) != 0 ) ? tmp->batch_size_blas2 : tmp->batch_size_blas3; - int task_num = 0; - void (*callback)(void*) = NULL; - int accessU, access_npiv, access_ipiv, access_ppiv; - struct cl_getrf_batched_args_t *clargs = *clargs_ptr; - int rankA = A->get_rankof(A, Am, An); #if !defined(HAVE_STARPU_NONE_NONZERO) /* STARPU_NONE can't be equal to 0 */ - fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" ); + fprintf( stderr, "INSERT_TASK_zgetrf_panel_blocked_batched: STARPU_NONE can not be equal to 0\n" ); assert( 0 ); #endif + struct chameleon_pzgetrf_s *tmp = (struct chameleon_pzgetrf_s *) ws; + int ib = tmp->ib; + int batch_size = ( (h % ib) != 0 ) ? 
tmp->batch_size_blas2 : tmp->batch_size_blas3; + int task_num = 0; + struct cl_zgetrf_batched_args_s *clargs = *clargs_ptr; #if defined ( CHAMELEON_USE_MPI ) + int rankA = A->get_rankof(A, Am, An); if ( ( Am == An ) && ( h % ib == 0 ) && ( h > 0 ) ) { starpu_mpi_cache_flush( options->sequence->comm, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un) ); @@ -277,8 +347,8 @@ INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options, CHAMELEON_END_ACCESS_DECLARATION; if ( clargs == NULL ) { - clargs = malloc( sizeof( struct cl_getrf_batched_args_t ) ); - memset( clargs, 0, sizeof( struct cl_getrf_batched_args_t ) ); + clargs = malloc( sizeof( struct cl_zgetrf_batched_args_s ) ); + memset( clargs, 0, sizeof( struct cl_zgetrf_batched_args_s ) ); clargs->tasks_nbr = 0; clargs->diag = ( Am == An ); clargs->ib = ib; @@ -300,47 +370,12 @@ INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options, A->get_blktile( A, Am, An ) ); if ( clargs->tasks_nbr == batch_size ) { - access_npiv = ( clargs->h == ipiv->n ) ? STARPU_R : STARPU_REDUX; - access_ipiv = STARPU_RW; - access_ppiv = STARPU_R; - accessU = STARPU_RW; - if ( clargs->h == 0 ) { - access_ipiv = STARPU_W; - access_ppiv = STARPU_NONE; - accessU = STARPU_NONE; - } - else if ( clargs->h % clargs->ib == 0 ) { - accessU = STARPU_R; - } - else if ( clargs->h % clargs->ib == 1 ) { - accessU = STARPU_W; - } - /* If there isn't a diag task then use offdiag access */ - if ( clargs->diag == 0 ) { - accessU = ((h%ib == 0) && (h > 0)) ? 
STARPU_R : STARPU_NONE; - access_ipiv = STARPU_NONE; - } - - rt_starpu_insert_task( - &cl_zgetrf_panel_blocked_batched, - /* Task codelet arguments */ - STARPU_CL_ARGS, clargs, sizeof(struct cl_getrf_batched_args_t), - STARPU_DATA_MODE_ARRAY, clargs->handle_mode, clargs->tasks_nbr, - access_npiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ), - access_ppiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), - access_ipiv, RUNTIME_ipiv_getaddr( ipiv, An ), - accessU, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un ), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, - STARPU_EXECUTE_ON_WORKER, options->workerid, - STARPU_NAME, clargs->cl_name, - 0 ); - - /* clargs is freed by starpu. */ - *clargs_ptr = NULL; + INSERT_TASK_zgetrf_panel_blocked_batched_flush( options, A, An, U, Um, Un, clargs_ptr, ipiv ); } } +#if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options, CHAM_desc_t *A, int An, @@ -348,15 +383,15 @@ INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options, void **clargs_ptr, CHAM_ipiv_t *ipiv ) { - int accessU, access_npiv, access_ipiv, access_ppiv; - void (*callback)(void*) = NULL; - struct cl_getrf_batched_args_t *clargs = *clargs_ptr; - int rankA = A->myrank; #if !defined(HAVE_STARPU_NONE_NONZERO) /* STARPU_NONE can't be equal to 0 */ - fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" ); + fprintf( stderr, "INSERT_TASK_zgetrf_panel_blocked_batched: STARPU_NONE can not be equal to 0\n" ); assert( 0 ); #endif + int accessU, access_npiv, access_ipiv, access_ppiv; + void (*callback)(void*) = NULL; + struct cl_zgetrf_batched_args_s *clargs = *clargs_ptr; + int rankA = A->myrank; if ( clargs == NULL ) { return; @@ -386,7 +421,7 @@ INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zgetrf_panel_blocked_batched, /* Task codelet arguments */ - STARPU_CL_ARGS, clargs, 
sizeof(struct cl_getrf_batched_args_t), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zgetrf_batched_args_s), STARPU_DATA_MODE_ARRAY, clargs->handle_mode, clargs->tasks_nbr, access_npiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h ), access_ppiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h - 1 ), @@ -401,3 +436,91 @@ INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options, /* clargs is freed by starpu. */ *clargs_ptr = NULL; } + +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ + +void +INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options, + CHAM_desc_t *A, int An, + CHAM_desc_t *U, int Um, int Un, + void **clargs_ptr, + CHAM_ipiv_t *ipiv ) +{ + struct cl_zgetrf_batched_args_s *myclargs = *clargs_ptr; + int rankA = A->myrank; + int accessU, access_npiv, access_ipiv, access_ppiv, k; + int ret; + struct starpu_task *task; + + if ( myclargs == NULL ) { + return; + } + + INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zgetrf_panel_blocked_batched, zgetrf_panel_blocked_batched, zgetrf_batched, myclargs->tasks_nbr + 4 ); + + access_npiv = ( myclargs->h == ipiv->n ) ? STARPU_R : STARPU_REDUX; + access_ipiv = STARPU_RW; + access_ppiv = STARPU_R; + accessU = STARPU_RW; + if ( myclargs->h == 0 ) { + access_ipiv = STARPU_W; + access_ppiv = STARPU_NONE; + accessU = STARPU_NONE; + } + else if ( myclargs->h % myclargs->ib == 0 ) { + accessU = STARPU_R; + } + else if ( myclargs->h % myclargs->ib == 1 ) { + accessU = STARPU_W; + } + /* If there isn't a diag task then use offdiag access */ + if ( myclargs->diag == 0 ) { + accessU = ((myclargs->h%myclargs->ib == 0) && (myclargs->h > 0)) ? 
STARPU_R : STARPU_NONE; + access_ipiv = STARPU_NONE; + } + + /* + * Register the data handles, exchange needed only for U + */ + starpu_cham_exchange_init_params( options, &params, rankA ); + for ( k = 0; k < myclargs->tasks_nbr; k++ ) { + starpu_cham_register_descr( &nbdata, descrs, myclargs->handle_mode[ k ].handle, STARPU_RW ); + } + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, myclargs->h ), access_npiv ); + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, myclargs->h-1 ), access_ppiv ); + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_ipiv_getaddr( ipiv, An), access_ipiv ); + starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs, + RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un), + accessU ); + + task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + task->cl_arg = myclargs; + task->cl_arg_size = sizeof( struct cl_zgetrf_batched_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, NULL ); + + /* Flops */ + task->flops = flops_zgetrf_blocked_batched( myclargs->m, myclargs->n, myclargs->h, myclargs->ib, + myclargs->diag, myclargs->tasks_nbr ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zgetrf_panel_blocked_batched", "Failed to submit the task to StarPU" ); + return; + } + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); + + /* clargs is freed by starpu. 
*/ + *clargs_ptr = NULL; + (void)clargs; + (void)cl_name; +} + +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ diff --git a/runtime/starpu/codelets/codelet_zgetrf_blocked.c b/runtime/starpu/codelets/codelet_zgetrf_blocked.c index 6bbc5f0c4d00574bb53f59538aa179d98cbf4cf7..f1df48f3cc7c3b6d460f859bc064e841bd4f5dc7 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_blocked.c +++ b/runtime/starpu/codelets/codelet_zgetrf_blocked.c @@ -22,6 +22,16 @@ #include "chameleon_starpu_internal.h" #include "runtime_codelet_z.h" +struct cl_zgetrf_blocked_args_s { + int m; + int n; + int h; + int m0; + int ib; + RUNTIME_sequence_t *sequence; + RUNTIME_request_t *request; +}; + CHAMELEON_CL_CB( zgetrf_blocked_diag, cti_handle_get_m(task->handles[0]), 0, 0, M ) CHAMELEON_CL_CB( zgetrf_blocked_offdiag, cti_handle_get_m(task->handles[0]), 0, 0, M ) CHAMELEON_CL_CB( zgetrf_blocked_trsm, cti_handle_get_m(task->handles[0]), 0, 0, M ) @@ -29,9 +39,7 @@ CHAMELEON_CL_CB( zgetrf_blocked_trsm, cti_handle_get_m(task->handles[0]), 0, #if !defined(CHAMELEON_SIMULATION) static void cl_zgetrf_blocked_diag_cpu_func(void *descr[], void *cl_arg) { - int m, n, h, m0, ib; - RUNTIME_sequence_t *sequence; - RUNTIME_request_t *request; + struct cl_zgetrf_blocked_args_s *clargs = (struct cl_zgetrf_blocked_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileU; int *ipiv; @@ -40,9 +48,6 @@ static void cl_zgetrf_blocked_diag_cpu_func(void *descr[], void *cl_arg) CHAMELEON_Complex64_t *U = NULL; int ldu = -1;; - starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &m0, &ib, - &sequence, &request ); - tileA = cti_interface_get(descr[0]); ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[1]); nextpiv = (cppi_interface_t*) descr[2]; @@ -53,10 +58,10 @@ static void cl_zgetrf_blocked_diag_cpu_func(void *descr[], void *cl_arg) ldu = tileU->ld; } - if ( h > 0 ) { + if ( clargs->h > 0 ) { cppi_display_dbg( prevpiv, stderr, "Prevpiv before call: " ); } - if ( h < tileA->n ) { + if ( clargs->h < tileA->n ) { cppi_display_dbg( 
nextpiv, stderr, "Nextpiv before call: " ); } @@ -64,19 +69,19 @@ static void cl_zgetrf_blocked_diag_cpu_func(void *descr[], void *cl_arg) * Make sure the nextpiv interface store the right information about the * column and diagonal row for the reduction */ - nextpiv->h = h; + nextpiv->h = clargs->h; nextpiv->has_diag = 1; coreblas_kernel_trace( tileA ); - CORE_zgetrf_panel_diag( m, n, h, m0, ib, + CORE_zgetrf_panel_diag( clargs->m, clargs->n, clargs->h, clargs->m0, clargs->ib, CHAM_tile_get_ptr( tileA ), tileA->ld, U, ldu, ipiv, &(nextpiv->pivot), &(prevpiv->pivot) ); - if ( h > 0 ) { + if ( clargs->h > 0 ) { cppi_display_dbg( prevpiv, stderr, "Prevpiv after call: " ); } - if ( h < tileA->n ) { + if ( clargs->h < tileA->n ) { cppi_display_dbg( nextpiv, stderr, "Nextpiv after call: " ); } } @@ -87,21 +92,22 @@ static void cl_zgetrf_blocked_diag_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU( zgetrf_blocked_diag, cl_zgetrf_blocked_diag_cpu_func ) +#if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options, int m, int n, int h, int m0, int ib, CHAM_desc_t *A, int Am, int An, CHAM_desc_t *U, int Um, int Un, CHAM_ipiv_t *ipiv ) { - struct starpu_codelet *codelet = &cl_zgetrf_blocked_diag; - void (*callback)(void*) = options->profiling ? cl_zgetrf_blocked_diag_callback : NULL; - const char *cl_name = "zgetrf_blocked_diag"; - int rankA = A->get_rankof(A, Am, An); #if !defined(HAVE_STARPU_NONE_NONZERO) /* STARPU_NONE can't be equal to 0 */ - fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" ); + fprintf( stderr, "INSERT_TASK_zgetrf_blocked_diag: STARPU_NONE can not be equal to 0\n" ); assert( 0 ); #endif + void (*callback)(void*) = options->profiling ? 
cl_zgetrf_blocked_diag_callback : NULL; + const char *cl_name = "zgetrf_blocked_diag"; + int rankA = A->get_rankof(A, Am, An); #if defined ( CHAMELEON_USE_MPI ) if ( ( h % ib == 0 ) && ( h > 0 ) ) { @@ -113,6 +119,17 @@ void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options, } #endif + /* Set codelet parameters */ + struct cl_zgetrf_blocked_args_s *clargs; + clargs = malloc( sizeof( struct cl_zgetrf_blocked_args_s ) ); + clargs->m = m; + clargs->n = n; + clargs->h = h; + clargs->m0 = m0; + clargs->ib = ib; + clargs->sequence = options->sequence; + clargs->request = options->request; + int access_ipiv = ( h == 0 ) ? STARPU_W : STARPU_RW; int access_npiv = ( h == ipiv->n ) ? STARPU_R : STARPU_REDUX; int access_ppiv = ( h == 0 ) ? STARPU_NONE : STARPU_R; @@ -139,19 +156,18 @@ void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options, A->get_blktile( A, Am, An ) ); rt_starpu_insert_task( - codelet, - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &h, sizeof(int), - STARPU_VALUE, &m0, sizeof(int), - STARPU_VALUE, &ib, sizeof(int), - STARPU_VALUE, &(options->sequence), sizeof(RUNTIME_sequence_t*), - STARPU_VALUE, &(options->request), sizeof(RUNTIME_request_t*), + &cl_zgetrf_blocked_diag, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_zgetrf_blocked_args_s), + + /* Task handles */ STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), access_ipiv, RUNTIME_ipiv_getaddr( ipiv, An ), access_npiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ), access_ppiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), accessU, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un), + + /* Common task arguments */ STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, @@ -159,12 +175,97 @@ void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options, 0 ); } +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ + +void INSERT_TASK_zgetrf_blocked_diag( const 
RUNTIME_option_t *options, + int m, int n, int h, int m0, int ib, + CHAM_desc_t *A, int Am, int An, + CHAM_desc_t *U, int Um, int Un, + CHAM_ipiv_t *ipiv ) +{ + int ret, access_ipiv, access_npiv, access_ppiv, accessU; + struct starpu_task *task; + int rankA = A->get_rankof(A, Am, An); + +#if defined ( CHAMELEON_USE_MPI ) + if ( ( h % ib == 0 ) && ( h > 0 ) ) { + starpu_mpi_cache_flush( options->sequence->comm, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un) ); + } + + if ( rankA != A->myrank ) { + return; + } +#endif + + INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zgetrf_blocked_diag, zgetrf_blocked_diag, zgetrf_blocked, 5 ); + + access_ipiv = ( h == 0 ) ? STARPU_W : STARPU_RW; + access_npiv = ( h == ipiv->n ) ? STARPU_R : STARPU_REDUX; + access_ppiv = ( h == 0 ) ? STARPU_NONE : STARPU_R; + accessU = STARPU_RW; + if ( h == 0 ) { + accessU = STARPU_NONE; + /* U can be set after ppiv because they are both none together, so it won't shift the buffers */ + } + else if ( h%ib == 0 ) { + accessU = STARPU_R; + } + else if ( ( h%ib == 1 ) || ( ib == 1 ) ) { + accessU = STARPU_W; + } + + /* + * Register the data handles, no exchange needed + */ + starpu_cham_exchange_init_params( options, &params, rankA ); + starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( A, ChamComplexDouble, Am, An ), STARPU_RW ); + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_ipiv_getaddr( ipiv, An), access_ipiv ); + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ), access_npiv ); + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), access_ppiv ); + starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( U, ChamComplexDouble, Um, Un ), accessU ); + + task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_zgetrf_blocked_args_s ) ); + clargs->m = m; + clargs->n = n; + clargs->h = h; + clargs->m0 = m0; + clargs->ib = ib; + clargs->sequence = options->sequence; + 
clargs->request = options->request; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_zgetrf_blocked_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgetrf_blocked_diag_callback ); + + /* Flops */ + task->flops = flops_zgetrf_blocked_diag( m, n, h, ib ); + + /* Refine name */ + task->name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zgetrf_blocked_diag", "Failed to submit the task to StarPU" ); + return; + } + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); +} + +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ + #if !defined(CHAMELEON_SIMULATION) static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg) { - int m, n, h, k, m0, ib; - RUNTIME_sequence_t *sequence; - RUNTIME_request_t *request; + struct cl_zgetrf_blocked_args_s *clargs = (struct cl_zgetrf_blocked_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileU; cppi_interface_t *nextpiv; @@ -172,8 +273,6 @@ static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg) CHAMELEON_Complex64_t *U = NULL; int ldu = -1; - starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &k, &m0, &ib, &sequence, &request ); - tileA = cti_interface_get(descr[0]); nextpiv = (cppi_interface_t*) descr[1]; prevpiv = (cppi_interface_t*) descr[2]; @@ -183,26 +282,26 @@ static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg) ldu = tileU->ld; } - if ( h > 0 ) { + if ( clargs->h > 0 ) { cppi_display_dbg( prevpiv, stderr, "Prevpiv offdiag before call: " ); } - if ( h < tileA->n ) { + if ( clargs->h < tileA->n ) { cppi_display_dbg( nextpiv, stderr, "Nextpiv offdiag before call: " ); } - nextpiv->h = h; /* Initialize in case it uses a copy */ + nextpiv->h = clargs->h; /* Initialize in case 
it uses a copy */ nextpiv->has_diag = chameleon_max( -1, nextpiv->has_diag); coreblas_kernel_trace( tileA ); - CORE_zgetrf_panel_offdiag( m, n, h, m0, ib, + CORE_zgetrf_panel_offdiag( clargs->m, clargs->n, clargs->h, clargs->m0, clargs->ib, CHAM_tile_get_ptr(tileA), tileA->ld, U, ldu, &(nextpiv->pivot), &(prevpiv->pivot) ); - if ( h > 0 ) { + if ( clargs->h > 0 ) { cppi_display_dbg( prevpiv, stderr, "Prevpiv offdiag after call: " ); } - if ( h < tileA->n ) { + if ( clargs->h < tileA->n ) { cppi_display_dbg( nextpiv, stderr, "Nextpiv offdiag after call: " ); } } @@ -213,23 +312,23 @@ static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU(zgetrf_blocked_offdiag, cl_zgetrf_blocked_offdiag_cpu_func) +#if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options, int m, int n, int h, int m0, int ib, CHAM_desc_t *A, int Am, int An, CHAM_desc_t *U, int Um, int Un, CHAM_ipiv_t *ipiv ) { - struct starpu_codelet *codelet = &cl_zgetrf_blocked_offdiag; - - int access_npiv = ( h == ipiv->n ) ? STARPU_R : STARPU_REDUX; - int access_ppiv = ( h == 0 ) ? STARPU_NONE : STARPU_R; - int accessU = ((h%ib == 0) && (h > 0)) ? STARPU_R : STARPU_NONE; - int rankA = A->get_rankof(A, Am, An); #if !defined(HAVE_STARPU_NONE_NONZERO) /* STARPU_NONE can't be equal to 0 */ - fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" ); + fprintf( stderr, "INSERT_TASK_zgetrf_blocked_diag: STARPU_NONE can not be equal to 0\n" ); assert( 0 ); #endif + int access_npiv = ( h == ipiv->n ) ? STARPU_R : STARPU_REDUX; + int access_ppiv = ( h == 0 ) ? STARPU_NONE : STARPU_R; + int accessU = ((h%ib == 0) && (h > 0)) ? 
STARPU_R : STARPU_NONE; + int rankA = A->get_rankof(A, Am, An); #if defined ( CHAMELEON_USE_MPI ) if ( rankA != A->myrank ) { @@ -244,6 +343,17 @@ void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options, } #endif + /* Set codelet parameters */ + struct cl_zgetrf_blocked_args_s *clargs; + clargs = malloc( sizeof( struct cl_zgetrf_blocked_args_s ) ); + clargs->m = m; + clargs->n = n; + clargs->h = h; + clargs->m0 = m0; + clargs->ib = ib; + clargs->sequence = options->sequence; + clargs->request = options->request; + void (*callback)(void*) = options->profiling ? cl_zgetrf_blocked_offdiag_callback : NULL; const char *cl_name = "zgetrf_blocked_offdiag"; @@ -260,19 +370,17 @@ void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options, A->get_blktile( A, Am, An ) ); rt_starpu_insert_task( - codelet, - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &h, sizeof(int), - STARPU_VALUE, &An, sizeof(int), - STARPU_VALUE, &m0, sizeof(int), - STARPU_VALUE, &ib, sizeof(int), - STARPU_VALUE, &(options->sequence), sizeof(RUNTIME_sequence_t *), - STARPU_VALUE, &(options->request), sizeof(RUNTIME_request_t *), + &cl_zgetrf_blocked_offdiag, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_zgetrf_blocked_args_s), + + /* Task handles */ STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), access_npiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ), access_ppiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), accessU, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un), + + /* Common task arguments */ STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, @@ -280,19 +388,96 @@ void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options, 0 ); } +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ + +void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options, + int m, int n, int h, int m0, int ib, + CHAM_desc_t *A, int Am, int An, + 
CHAM_desc_t *U, int Um, int Un, + CHAM_ipiv_t *ipiv ) +{ + int ret; + struct starpu_task *task; + int rankA = A->get_rankof(A, Am, An); + int access_npiv = ( h == ipiv->n ) ? STARPU_R : STARPU_REDUX; + int access_ppiv = ( h == 0 ) ? STARPU_NONE : STARPU_R; + int accessU = ((h%ib == 0) && (h > 0)) ? STARPU_R : STARPU_NONE; + +#if defined ( CHAMELEON_USE_MPI ) + if ( rankA != A->myrank ) { + if ( ( accessU != STARPU_NONE ) && + ( A->myrank == A->get_rankof( A, An, An ) ) ) + { + starpu_mpi_get_data_on_node_detached( options->sequence->comm, + RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un), + rankA, NULL, NULL ); + } + return; + } +#endif + + INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zgetrf_blocked_offdiag, zgetrf_blocked_offdiag, zgetrf_blocked, 4 ); + + /* + * Register the data handles, exchange needed only for U + */ + starpu_cham_exchange_init_params( options, &params, rankA ); + starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( A, ChamComplexDouble, Am, An ), STARPU_RW ); + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ), access_npiv ); + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), access_ppiv ); + starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs, + RTBLKADDR( U, ChamComplexDouble, Um, Un ), + accessU ); + + task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_zgetrf_blocked_args_s ) ); + clargs->m = m; + clargs->n = n; + clargs->h = h; + clargs->m0 = m0; + clargs->ib = ib; + clargs->sequence = options->sequence; + clargs->request = options->request; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_zgetrf_blocked_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgetrf_blocked_offdiag_callback ); + + /* Flops */ + task->flops = flops_zgetrf_blocked_offdiag( m, n, h, ib ); + + /* Refine name 
*/ + task->name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zgetrf_blocked_offdiag", "Failed to submit the task to StarPU" ); + return; + } + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); +} + +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ + #if !defined(CHAMELEON_SIMULATION) static const CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0; static void cl_zgetrf_blocked_trsm_cpu_func(void *descr[], void *cl_arg) { - int m, n, h, ib; + struct cl_zgetrf_blocked_args_s *clargs = (struct cl_zgetrf_blocked_args_s *)cl_arg; CHAM_tile_t *tileU; cppi_interface_t *prevpiv; CHAMELEON_Complex64_t *U; int ldu; - starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &ib ); - tileU = cti_interface_get(descr[0]); prevpiv = (cppi_interface_t*) descr[1]; U = CHAM_tile_get_ptr( tileU ); @@ -301,16 +486,16 @@ static void cl_zgetrf_blocked_trsm_cpu_func(void *descr[], void *cl_arg) coreblas_kernel_trace( tileU ); /* Copy the final max line of the block and solve */ - cblas_zcopy( n, prevpiv->pivot.pivrow, 1, - U + m - 1, ldu ); + cblas_zcopy( clargs->n, prevpiv->pivot.pivrow, 1, + U + clargs->m - 1, ldu ); - if ( (n-h) > 0 ) { + if ( ( clargs->n - clargs->h ) > 0 ) { cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, - ib, n - h, - CBLAS_SADDR(zone), U + (h-ib) * ldu, ldu, - U + h * ldu, ldu ); + clargs->ib, clargs->n - clargs->h, + CBLAS_SADDR(zone), U + (clargs->h-clargs->ib) * ldu, ldu, + U + clargs->h * ldu, ldu ); } } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -320,13 +505,13 @@ static void cl_zgetrf_blocked_trsm_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU(zgetrf_blocked_trsm, cl_zgetrf_blocked_trsm_cpu_func) +#if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options, int m, int 
n, int h, int ib, CHAM_desc_t *U, int Um, int Un, CHAM_ipiv_t *ipiv ) { - struct starpu_codelet *codelet = &cl_zgetrf_blocked_trsm; - void (*callback)(void*) = options->profiling ? cl_zgetrf_blocked_trsm_callback : NULL; const char *cl_name = "zgetrf_blocked_trsm"; int rankU = U->get_rankof(U, Um, Un); @@ -344,17 +529,86 @@ void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options, return; } + /* Set codelet parameters */ + struct cl_zgetrf_blocked_args_s *clargs; + clargs = malloc( sizeof( struct cl_zgetrf_blocked_args_s ) ); + clargs->m = m; + clargs->n = n; + clargs->h = h; + clargs->ib = ib; + rt_starpu_insert_task( - codelet, - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &h, sizeof(int), - STARPU_VALUE, &ib, sizeof(int), + &cl_zgetrf_blocked_trsm, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_zgetrf_blocked_args_s), + + /* Task handles */ STARPU_RW, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un), STARPU_R, RUNTIME_pivot_getaddr( ipiv, rankU, Un, h-1 ), + + /* Common task arguments */ STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, STARPU_NAME, cl_name, 0 ); } + +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ + +void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options, + int m, int n, int h, int ib, + CHAM_desc_t *U, int Um, int Un, + CHAM_ipiv_t *ipiv ) +{ + int ret; + struct starpu_task *task; + int rankU = U->get_rankof(U, Um, Un); + + if ( U->myrank != rankU ) { + return; + } + + INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zgetrf_blocked_trsm, zgetrf_blocked_trsm, zgetrf_blocked, 2 ); + + /* + * Register the data handles, no exchange needed + */ + starpu_cham_exchange_init_params( options, &params, rankU ); + starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( U, ChamComplexDouble, Um, Un ), STARPU_RW ); + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankU, Un, h-1 ), STARPU_R ); + + 
task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_zgetrf_blocked_args_s ) ); + clargs->m = m; + clargs->n = n; + clargs->h = h; + clargs->ib = ib; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_zgetrf_blocked_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgetrf_blocked_trsm_callback ); + + /* Flops */ + task->flops = flops_zgetrf_trsm( m, n, h, ib ); + + /* Refine name */ + task->name = chameleon_codelet_name( cl_name, 1, U->get_blktile( U, Um, Un ) ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zgetrf_blocked_diag", "Failed to submit the task to StarPU" ); + return; + } + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); +} + +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ diff --git a/runtime/starpu/codelets/codelet_zgetrf_nopiv.c b/runtime/starpu/codelets/codelet_zgetrf_nopiv.c index 28f93240c795ddf781910bcc70dbc5fdd531dc22..bedb371858a13eec4cd30511310469ae037db9a4 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_nopiv.c +++ b/runtime/starpu/codelets/codelet_zgetrf_nopiv.c @@ -26,28 +26,31 @@ #include "chameleon_starpu_internal.h" #include "runtime_codelet_z.h" +struct cl_zgetrf_nopiv_args_s { + int m; + int n; + int ib; + int iinfo; + RUNTIME_sequence_t *sequence; + RUNTIME_request_t *request; +}; + /* * Codelet CPU */ #if !defined(CHAMELEON_SIMULATION) static void cl_zgetrf_nopiv_cpu_func(void *descr[], void *cl_arg) { - int m; - int n; - int ib; + struct cl_zgetrf_nopiv_args_s *clargs = (struct cl_zgetrf_nopiv_args_s *)cl_arg; CHAM_tile_t *tileA; - int iinfo; - RUNTIME_sequence_t *sequence; - RUNTIME_request_t *request; int info = 0; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &iinfo, &sequence, &request); 
- TCORE_zgetrf_nopiv(m, n, ib, tileA, &info); + TCORE_zgetrf_nopiv( clargs->m, clargs->n, clargs->ib, tileA, &info ); - if ( (sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) { - RUNTIME_sequence_flush( NULL, sequence, request, iinfo+info ); + if ( (clargs->sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) { + RUNTIME_sequence_flush( NULL, clargs->sequence, clargs->request, clargs->iinfo+info ); } } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -57,30 +60,121 @@ static void cl_zgetrf_nopiv_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU(zgetrf_nopiv, cl_zgetrf_nopiv_cpu_func) +#if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zgetrf_nopiv(const RUNTIME_option_t *options, int m, int n, int ib, int nb, const CHAM_desc_t *A, int Am, int An, int iinfo) { - (void)nb; - struct starpu_codelet *codelet = &cl_zgetrf_nopiv; - void (*callback)(void*) = options->profiling ? cl_zgetrf_nopiv_callback : NULL; + void (*callback)(void*); + struct cl_zgetrf_nopiv_args_s *clargs = NULL; + int exec = 0; + const char *cl_name = "zgetrf_nopiv"; + /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_RW(A, Am, An); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + /* Set codelet parameters */ + if ( exec ) { + clargs = malloc( sizeof( struct cl_zgetrf_nopiv_args_s ) ); + clargs->m = m; + clargs->n = n; + clargs->ib = ib; + clargs->iinfo = iinfo; + clargs->sequence = options->sequence; + clargs->request = options->request; + } + + /* Callback for profiling information */ + callback = options->profiling ? 
cl_zgetrf_nopiv_callback : NULL; + + /* Refine name */ + cl_name = chameleon_codelet_name( cl_name, 1, + A->get_blktile( A, Am, An ) ); + rt_starpu_insert_task( - codelet, - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &ib, sizeof(int), - STARPU_RW, RTBLKADDR(A, ChamComplexDouble, Am, An), - STARPU_VALUE, &iinfo, sizeof(int), - STARPU_VALUE, &(options->sequence), sizeof(RUNTIME_sequence_t*), - STARPU_VALUE, &(options->request), sizeof(RUNTIME_request_t*), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + &cl_zgetrf_nopiv, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_zgetrf_nopiv_args_s), + STARPU_RW, RTBLKADDR(A, ChamComplexDouble, Am, An), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, + STARPU_NAME, cl_name, 0 ); + + (void)nb; } + +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ + +void INSERT_TASK_zgetrf_nopiv(const RUNTIME_option_t *options, + int m, int n, int ib, int nb, + const CHAM_desc_t *A, int Am, int An, + int iinfo) +{ + INSERT_TASK_COMMON_PARAMETERS( zgetrf_nopiv, 1 ); + + /* + * Register the data handles and initialize exchanges if needed + */ + starpu_cham_exchange_init_params( options, &params, A->get_rankof( A, Am, An ) ); + starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_RW ); + + /* + * Not involved, let's return + */ + if ( nbdata == 0 ) { + return; + } + + if ( params.do_execute ) + { + int ret; + struct starpu_task *task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_zgetrf_nopiv_args_s ) ); + clargs->m = m; + clargs->n = n; + clargs->ib = ib; + clargs->iinfo = iinfo; + clargs->sequence = options->sequence; + clargs->request = options->request; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_zgetrf_nopiv_args_s ); + task->cl_arg_free = 1; + + 
/* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgetrf_nopiv_callback ); + + /* Flops */ + task->flops = flops_zgetrf( m, n ); + + /* Refine name */ + task->name = chameleon_codelet_name( cl_name, 1, + A->get_blktile( A, Am, An ) ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zgetrf_nopiv", "Failed to submit the task to StarPU" ); + return; + } + } + + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); + + (void)nb; +} + +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ diff --git a/runtime/starpu/codelets/codelet_zgetrf_percol.c b/runtime/starpu/codelets/codelet_zgetrf_percol.c index 1ee38e8a12f0bb095ca65636b7569f1ee98c20af..9a0ec048b78b68569974267edb5c62aa97ce65d2 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_percol.c +++ b/runtime/starpu/codelets/codelet_zgetrf_percol.c @@ -22,31 +22,36 @@ #include "chameleon_starpu_internal.h" #include "runtime_codelet_z.h" +struct cl_zgetrf_percol_args_s { + int m; + int n; + int h; + int m0; + RUNTIME_sequence_t *sequence; + RUNTIME_request_t *request; +}; + CHAMELEON_CL_CB( zgetrf_percol_diag, cti_handle_get_m(task->handles[0]), 0, 0, M ) CHAMELEON_CL_CB( zgetrf_percol_offdiag, cti_handle_get_m(task->handles[0]), 0, 0, M ) #if !defined(CHAMELEON_SIMULATION) static void cl_zgetrf_percol_diag_cpu_func(void *descr[], void *cl_arg) { - int m, n, h, m0; - RUNTIME_sequence_t *sequence; - RUNTIME_request_t *request; + struct cl_zgetrf_percol_args_s *clargs = (struct cl_zgetrf_percol_args_s *)cl_arg; CHAM_tile_t *tileA; int *ipiv; cppi_interface_t *nextpiv; cppi_interface_t *prevpiv; - starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &m0, &sequence, &request ); - tileA = cti_interface_get(descr[0]); ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[1]); nextpiv = (cppi_interface_t*) descr[2]; prevpiv = (cppi_interface_t*) descr[3]; - if ( h > 0 ) { + if ( 
clargs->h > 0 ) { cppi_display_dbg( prevpiv, stderr, "Prevpiv before call: " ); } - if ( h < tileA->n ) { + if ( clargs->h < tileA->n ) { cppi_display_dbg( nextpiv, stderr, "Nextpiv before call: " ); } @@ -54,18 +59,18 @@ static void cl_zgetrf_percol_diag_cpu_func(void *descr[], void *cl_arg) * Make sure the nextpiv interface store the right information about the * column and diagonal row for the reduction */ - nextpiv->h = h; + nextpiv->h = clargs->h; nextpiv->has_diag = 1; - CORE_zgetrf_panel_diag( m, n, h, m0, tileA->n, + CORE_zgetrf_panel_diag( clargs->m, clargs->n, clargs->h, clargs->m0, tileA->n, CHAM_tile_get_ptr( tileA ), tileA->ld, NULL, -1, ipiv, &(nextpiv->pivot), &(prevpiv->pivot) ); - if ( h > 0 ) { + if ( clargs->h > 0 ) { cppi_display_dbg( prevpiv, stderr, "Prevpiv after call: " ); } - if ( h < n ) { + if ( clargs->h < clargs->n ) { cppi_display_dbg( nextpiv, stderr, "Nextpiv after call: " ); } } @@ -76,12 +81,13 @@ static void cl_zgetrf_percol_diag_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU( zgetrf_percol_diag, cl_zgetrf_percol_diag_cpu_func ) +#if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zgetrf_percol_diag( const RUNTIME_option_t *options, int m, int n, int h, int m0, CHAM_desc_t *A, int Am, int An, CHAM_ipiv_t *ipiv ) { - struct starpu_codelet *codelet = &cl_zgetrf_percol_diag; void (*callback)(void*) = options->profiling ? 
cl_zgetrf_percol_diag_callback : NULL; const char *cl_name = "zgetrf_percol_diag"; int rankA = A->get_rankof(A, Am, An); @@ -105,21 +111,31 @@ void INSERT_TASK_zgetrf_percol_diag( const RUNTIME_option_t *options, CHAMELEON_ACCESS_RW( A, Am, An ); CHAMELEON_END_ACCESS_DECLARATION; + /* Set codelet parameters */ + struct cl_zgetrf_percol_args_s *clargs; + clargs = malloc( sizeof( struct cl_zgetrf_percol_args_s ) ); + clargs->m = m; + clargs->n = n; + clargs->h = h; + clargs->m0 = m0; + clargs->sequence = options->sequence; + clargs->request = options->request; + /* Refine name */ cl_name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) ); rt_starpu_insert_task( - codelet, - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &h, sizeof(int), - STARPU_VALUE, &m0, sizeof(int), - STARPU_VALUE, &(options->sequence), sizeof(RUNTIME_sequence_t*), - STARPU_VALUE, &(options->request), sizeof(RUNTIME_request_t*), + &cl_zgetrf_percol_diag, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_zgetrf_percol_args_s), + + /* Task handles */ STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), access_ipiv, RUNTIME_ipiv_getaddr( ipiv, An ), access_npiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ), access_ppiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), + + /* Common task arguments */ STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, @@ -127,26 +143,89 @@ void INSERT_TASK_zgetrf_percol_diag( const RUNTIME_option_t *options, 0 ); } +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ + +void INSERT_TASK_zgetrf_percol_diag( const RUNTIME_option_t *options, + int m, int n, int h, int m0, + CHAM_desc_t *A, int Am, int An, + CHAM_ipiv_t *ipiv ) +{ + int ret, access_ipiv, access_npiv, access_ppiv; + struct starpu_task *task; + int rankA = A->get_rankof(A, Am, An); + + if ( rankA != A->myrank ) { + return; + } + + INSERT_TASK_COMMON_PARAMETERS_EXTENDED( 
zgetrf_percol_diag, zgetrf_percol_diag, zgetrf_percol, 4 ); + + access_ipiv = ( h == 0 ) ? STARPU_W : STARPU_RW; + access_npiv = ( h == ipiv->n ) ? STARPU_R : STARPU_REDUX; + access_ppiv = ( h == 0 ) ? STARPU_NONE : STARPU_R; + + /* + * Register the data handles, no exchange needed + */ + starpu_cham_exchange_init_params( options, &params, rankA ); + starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( A, ChamComplexDouble, Am, An ), STARPU_RW ); + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_ipiv_getaddr( ipiv, An), access_ipiv ); + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ), access_npiv ); + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), access_ppiv ); + + task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_zgetrf_percol_args_s ) ); + clargs->m = m; + clargs->n = n; + clargs->h = h; + clargs->m0 = m0; + clargs->sequence = options->sequence; + clargs->request = options->request; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_zgetrf_percol_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgetrf_percol_diag_callback ); + + /* Flops */ + task->flops = flops_zgetrf_percol_diag( m, n, h ); + + /* Refine name */ + task->name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zgetrf_percol_diag", "Failed to submit the task to StarPU" ); + return; + } + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); +} + +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ + #if !defined(CHAMELEON_SIMULATION) static void cl_zgetrf_percol_offdiag_cpu_func(void *descr[], void *cl_arg) { - int m, n, h, m0; - RUNTIME_sequence_t 
*sequence; - RUNTIME_request_t *request; + struct cl_zgetrf_percol_args_s *clargs = (struct cl_zgetrf_percol_args_s *)cl_arg; CHAM_tile_t *tileA; cppi_interface_t *nextpiv; cppi_interface_t *prevpiv; - starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &m0, &sequence, &request ); - tileA = cti_interface_get(descr[0]); nextpiv = (cppi_interface_t*) descr[1]; prevpiv = (cppi_interface_t*) descr[2]; - nextpiv->h = h; /* Initialize in case it uses a copy */ + nextpiv->h = clargs->h; /* Initialize in case it uses a copy */ nextpiv->has_diag = chameleon_max( -1, nextpiv->has_diag); - CORE_zgetrf_panel_offdiag( m, n, h, m0, tileA->n, + CORE_zgetrf_panel_offdiag( clargs->m, clargs->n, clargs->h, clargs->m0, tileA->n, CHAM_tile_get_ptr(tileA), tileA->ld, NULL, -1, &(nextpiv->pivot), &(prevpiv->pivot) ); @@ -158,13 +237,13 @@ static void cl_zgetrf_percol_offdiag_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU(zgetrf_percol_offdiag, cl_zgetrf_percol_offdiag_cpu_func) +#if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zgetrf_percol_offdiag( const RUNTIME_option_t *options, int m, int n, int h, int m0, CHAM_desc_t *A, int Am, int An, CHAM_ipiv_t *ipiv ) { - struct starpu_codelet *codelet = &cl_zgetrf_percol_offdiag; - void (*callback)(void*) = options->profiling ? cl_zgetrf_percol_offdiag_callback : NULL; const char *cl_name = "zgetrf_percol_offdiag"; int access_npiv = ( h == ipiv->n ) ? 
STARPU_R : STARPU_REDUX; @@ -185,23 +264,98 @@ void INSERT_TASK_zgetrf_percol_offdiag( const RUNTIME_option_t *options, CHAMELEON_ACCESS_RW( A, Am, An ); CHAMELEON_END_ACCESS_DECLARATION; + /* Set codelet parameters */ + struct cl_zgetrf_percol_args_s *clargs; + clargs = malloc( sizeof( struct cl_zgetrf_percol_args_s ) ); + clargs->m = m; + clargs->n = n; + clargs->h = h; + clargs->m0 = m0; + clargs->sequence = options->sequence; + clargs->request = options->request; + /* Refine name */ cl_name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) ); rt_starpu_insert_task( - codelet, - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &h, sizeof(int), - STARPU_VALUE, &m0, sizeof(int), - STARPU_VALUE, &(options->sequence), sizeof(RUNTIME_sequence_t *), - STARPU_VALUE, &(options->request), sizeof(RUNTIME_request_t *), + &cl_zgetrf_percol_offdiag, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_zgetrf_percol_args_s), + + /* Task handles */ STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), access_npiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ), access_ppiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), + + /* Common task arguments */ STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, STARPU_NAME, cl_name, 0 ); } + +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ + +void INSERT_TASK_zgetrf_percol_offdiag( const RUNTIME_option_t *options, + int m, int n, int h, int m0, + CHAM_desc_t *A, int Am, int An, + CHAM_ipiv_t *ipiv ) +{ + int ret, access_npiv, access_ppiv; + struct starpu_task *task; + int rankA = A->get_rankof(A, Am, An); + + if ( rankA != A->myrank ) { + return; + } + + INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zgetrf_percol_offdiag, zgetrf_percol_offdiag, zgetrf_percol, 3 ); + + access_npiv = ( h == ipiv->n ) ? STARPU_R : STARPU_REDUX; + access_ppiv = ( h == 0 ) ? 
STARPU_NONE : STARPU_R; + + /* + * Register the data handles, no exchange needed + */ + starpu_cham_exchange_init_params( options, &params, rankA ); + starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( A, ChamComplexDouble, Am, An ), STARPU_RW ); + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ), access_npiv ); + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), access_ppiv ); + + task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_zgetrf_percol_args_s ) ); + clargs->m = m; + clargs->n = n; + clargs->h = h; + clargs->m0 = m0; + clargs->sequence = options->sequence; + clargs->request = options->request; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_zgetrf_percol_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgetrf_percol_offdiag_callback ); + + /* Flops */ + task->flops = flops_zgetrf_percol_offdiag( m, n, h ); + + /* Refine name */ + task->name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zgetrf_percol_diag", "Failed to submit the task to StarPU" ); + return; + } + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); +} + +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c index 2ce1c3a5a5fe2501683f53856e632b5bd974c098..6584684a40f43cb40417458dc69d4591aa8730f4 100644 --- a/runtime/starpu/codelets/codelet_zhemm.c +++ b/runtime/starpu/codelets/codelet_zhemm.c @@ -126,6 +126,7 @@ CODELETS( zhemm, cl_zhemm_cpu_func, cl_zhemm_cuda_func, STARPU_CUDA_ASYNC ) #endif #if defined(CHAMELEON_STARPU_USE_INSERT) + void 
INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, int m, int n, int nb, @@ -274,7 +275,7 @@ void INSERT_TASK_zhemm( const RUNTIME_option_t *options, 0 ); } -#else +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ void __INSERT_TASK_zhemm( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, @@ -295,9 +296,9 @@ void __INSERT_TASK_zhemm( const RUNTIME_option_t *options, * Set the data handles and initialize exchanges if needed */ starpu_cham_exchange_init_params( options, &params, xrank ); - starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R ); - starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, STARPU_R ); - starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC ); + starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R ); + starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, STARPU_R ); + starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC ); /* * Not involved, let's return @@ -391,4 +392,5 @@ void INSERT_TASK_zhemm( const RUNTIME_option_t *options, B, Bm, Bn, beta, C, Cm, Cn ); } -#endif + +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ diff --git a/runtime/starpu/codelets/codelet_zher2k.c b/runtime/starpu/codelets/codelet_zher2k.c index 04c5354996e62e3f70b2ed908124ae0a88eeec69..455427bc90ad26d5a52a322d0617256e80964824 100644 --- a/runtime/starpu/codelets/codelet_zher2k.c +++ b/runtime/starpu/codelets/codelet_zher2k.c @@ -108,6 +108,7 @@ CODELETS( zher2k, cl_zher2k_cpu_func, cl_zher2k_cuda_func, STARPU_CUDA_ASYNC ) #endif #if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zher2k( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, @@ -175,7 +176,7 @@ void INSERT_TASK_zher2k( const RUNTIME_option_t *options, (void)nb; } -#else 
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ void INSERT_TASK_zher2k( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, @@ -200,9 +201,9 @@ void INSERT_TASK_zher2k( const RUNTIME_option_t *options, * Set the data handles and initialize exchanges if needed */ starpu_cham_exchange_init_params( options, &params, C->get_rankof( C, Cm, Cn ) ); - starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R ); - starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, STARPU_R ); - starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC ); + starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R ); + starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, STARPU_R ); + starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC ); /* * Not involved, let's return @@ -256,4 +257,4 @@ void INSERT_TASK_zher2k( const RUNTIME_option_t *options, (void)nb; } -#endif +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c index 7ea5448cad03886644367b3b641f2ffaa9b665e8..beba72bf4cb5cfd5c6feda024c71fd9dbed7db58 100644 --- a/runtime/starpu/codelets/codelet_zherk.c +++ b/runtime/starpu/codelets/codelet_zherk.c @@ -110,6 +110,7 @@ CODELETS( zherk, cl_zherk_cpu_func, cl_zherk_cuda_func, STARPU_CUDA_ASYNC ) #endif #if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zherk( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, @@ -175,7 +176,7 @@ void INSERT_TASK_zherk( const RUNTIME_option_t *options, (void)nb; } -#else +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ void INSERT_TASK_zherk( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, @@ -199,8 +200,8 @@ void INSERT_TASK_zherk( const RUNTIME_option_t 
*options, * Set the data handles and initialize exchanges if needed */ starpu_cham_exchange_init_params( options, &params, C->get_rankof( C, Cm, Cn ) ); - starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R ); - starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC ); + starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R ); + starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC ); /* * Not involved, let's return @@ -253,4 +254,4 @@ void INSERT_TASK_zherk( const RUNTIME_option_t *options, (void)nb; } -#endif +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ diff --git a/runtime/starpu/codelets/codelet_zipiv_allreduce.c b/runtime/starpu/codelets/codelet_zipiv_allreduce.c index a3d4d21441da1c09f9eb2f6b6983deb95550538c..e79a1841d491524d67ffd3c398bba8efe5706690 100644 --- a/runtime/starpu/codelets/codelet_zipiv_allreduce.c +++ b/runtime/starpu/codelets/codelet_zipiv_allreduce.c @@ -20,7 +20,7 @@ #include "runtime_codelet_z.h" #if defined(CHAMELEON_USE_MPI) -struct cl_redux_args_t { +struct cl_redux_args_s { int h; int n; }; @@ -55,7 +55,11 @@ zipiv_allreduce_cpu_func( cppi_interface_t *cppi_me, ( cppi_me->has_diag == -1 ) ) { cblas_zcopy( n, nextpiv_src->diagrow, 1, nextpiv_me->diagrow, 1 ); - assert( cppi_src->arraysize == sizeof(CHAMELEON_Complex64_t) * n ); + /* + * The interface of the pivot is registered once in starpu so + * the arraysize is not always correct + */ + assert( cppi_src->arraysize >= sizeof(CHAMELEON_Complex64_t) * n ); cppi_me->has_diag = 1; } @@ -65,7 +69,7 @@ zipiv_allreduce_cpu_func( cppi_interface_t *cppi_me, static void cl_zipiv_allreduce_cpu_func( void *descr[], void *cl_arg ) { - struct cl_redux_args_t *clargs = (struct cl_redux_args_t *) cl_arg; + struct cl_redux_args_s *clargs = (struct cl_redux_args_s *) cl_arg; cppi_interface_t *cppi_me = ((cppi_interface_t *) descr[0]); 
cppi_interface_t *cppi_src = ((cppi_interface_t *) descr[1]); zipiv_allreduce_cpu_func( cppi_me, cppi_src, clargs->h, clargs->n ); @@ -73,6 +77,8 @@ cl_zipiv_allreduce_cpu_func( void *descr[], void *cl_arg ) CODELETS_CPU( zipiv_allreduce, cl_zipiv_allreduce_cpu_func ) +#if defined(CHAMELEON_STARPU_USE_INSERT) /* defined(CHAMELEON_STARPU_USE_INSERT) */ + static void INSERT_TASK_zipiv_allreduce_send( const RUNTIME_option_t *options, CHAM_ipiv_t *ipiv, @@ -98,14 +104,14 @@ INSERT_TASK_zipiv_allreduce_recv( const RUNTIME_option_t *options, int h, int n ) { - struct cl_redux_args_t *clargs; - clargs = malloc( sizeof( struct cl_redux_args_t ) ); + struct cl_redux_args_s *clargs; + clargs = malloc( sizeof( struct cl_redux_args_s ) ); clargs->h = h; clargs->n = n; rt_starpu_insert_task( &cl_zipiv_allreduce, - STARPU_CL_ARGS, clargs, sizeof(struct cl_redux_args_t), + STARPU_CL_ARGS, clargs, sizeof(struct cl_redux_args_s), STARPU_RW, RUNTIME_pivot_getaddr( ipiv, me, k, h ), STARPU_R, RUNTIME_pivot_getaddr( ipiv, src, k, h ), STARPU_EXECUTE_ON_NODE, me, @@ -115,6 +121,83 @@ INSERT_TASK_zipiv_allreduce_recv( const RUNTIME_option_t *options, starpu_mpi_cache_flush( options->sequence->comm, RUNTIME_pivot_getaddr( ipiv, src, k, h ) ); } +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ + +static void +INSERT_TASK_zipiv_allreduce_send( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int me, + int dst, + int k, + int h ) +{ + INSERT_TASK_COMMON_PARAMETERS_CLNULL( zipiv_allreduce_send, 1 ) + + starpu_cham_exchange_init_params( options, &params, dst ); + starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs, + RUNTIME_pivot_getaddr( ipiv, me, k, h ), + STARPU_R ); + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); + (void)cl; + (void)cl_name; +} + +static void +INSERT_TASK_zipiv_allreduce_recv( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int me, + int src, + int k, + int h, + int n ) +{ + int ret; + struct 
starpu_task *task; + INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zipiv_allreduce_recv, zipiv_allreduce, redux, 2 ) + + starpu_cham_exchange_init_params( options, &params, me ); + starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs, + RUNTIME_pivot_getaddr( ipiv, me, k, h ), + STARPU_RW ); + starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs, + RUNTIME_pivot_getaddr( ipiv, src, k, h ), + STARPU_R ); + + task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_redux_args_s ) ); + clargs->h = h; + clargs->n = n; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_redux_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, NULL ); + + /* Flops */ + task->flops = 0.; + + /* Refine name */ + task->name = cl_name; + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zipiv_allreduce", "Failed to submit the task to StarPU" ); + return; + } + + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); + starpu_mpi_cache_flush( options->sequence->comm, RUNTIME_pivot_getaddr( ipiv, src, k, h ) ); +} + +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ + static void zipiv_allreduce_chameleon_starpu_task( const RUNTIME_option_t *options, CHAM_desc_t *A, diff --git a/runtime/starpu/codelets/codelet_zlascal.c b/runtime/starpu/codelets/codelet_zlascal.c index 7478942fe3fe48b1bceeb0e9474243ac016cce8f..b9ddb9a8c83659dea3a4bddc6abea31180a57cf6 100644 --- a/runtime/starpu/codelets/codelet_zlascal.c +++ b/runtime/starpu/codelets/codelet_zlascal.c @@ -50,6 +50,7 @@ cl_zlascal_cpu_func( void *descr[], void *cl_arg ) CODELETS_CPU( zlascal, cl_zlascal_cpu_func ) #if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zlascal( const RUNTIME_option_t *options, cham_uplo_t uplo, int 
m, int n, int nb, @@ -104,7 +105,7 @@ void INSERT_TASK_zlascal( const RUNTIME_option_t *options, (void)nb; } -#else +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ void INSERT_TASK_zlascal( const RUNTIME_option_t *options, cham_uplo_t uplo, @@ -127,7 +128,7 @@ void INSERT_TASK_zlascal( const RUNTIME_option_t *options, * Set the data handles and initialize exchanges if needed */ starpu_cham_exchange_init_params( options, ¶ms, A->get_rankof( A, Am, An ) ); - starpu_cham_exchange_data_before_execution( options, ¶ms, &nbdata, descrs, A, Am, An, STARPU_RW ); + starpu_cham_exchange_tile_before_execution( options, ¶ms, &nbdata, descrs, A, Am, An, STARPU_RW ); /* * Not involved, let's return @@ -177,4 +178,4 @@ void INSERT_TASK_zlascal( const RUNTIME_option_t *options, (void)nb; } -#endif +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ diff --git a/runtime/starpu/codelets/codelet_zlaswp.c b/runtime/starpu/codelets/codelet_zlaswp.c index 2c77f07f3d3e41d0df3402aff447d0ca55e06f2d..81c28d92f05d6c23e85e743b8402b79db31815b1 100644 --- a/runtime/starpu/codelets/codelet_zlaswp.c +++ b/runtime/starpu/codelets/codelet_zlaswp.c @@ -20,19 +20,23 @@ #include "chameleon_starpu_internal.h" #include "runtime_codelet_z.h" +struct cl_zlaswp_args_s { + int m0; + int k; +}; + #if !defined(CHAMELEON_SIMULATION) static void cl_zlaswp_get_cpu_func( void *descr[], void *cl_arg ) { - int m0, k, *perm; + struct cl_zlaswp_args_s *clargs = (struct cl_zlaswp_args_s *)cl_arg; + int *perm; CHAM_tile_t *A, *B; - starpu_codelet_unpack_args( cl_arg, &m0, &k ); - perm = (int *)STARPU_VECTOR_GET_PTR( descr[0] ); A = (CHAM_tile_t *) cti_interface_get( descr[1] ); B = (CHAM_tile_t *) cti_interface_get( descr[2] ); - TCORE_zlaswp_get( m0, A->m, A->n, k, A, B, perm ); + TCORE_zlaswp_get( clargs->m0, A->m, A->n, clargs->k, A, B, perm ); } #endif @@ -41,6 +45,8 @@ static void cl_zlaswp_get_cpu_func( void *descr[], void *cl_arg ) */ CODELETS_CPU( zlaswp_get, cl_zlaswp_get_cpu_func ) +#if 
defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, int m0, int k, const CHAM_ipiv_t *ipiv, int ipivk, @@ -52,12 +58,16 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, return; } + struct cl_zlaswp_args_s *clargs; + clargs = malloc( sizeof( struct cl_zlaswp_args_s ) ); + clargs->m0 = m0; + clargs->k = k; + //void (*callback)(void*) = options->profiling ? cl_zlaswp_get_callback : NULL; rt_starpu_insert_task( codelet, - STARPU_VALUE, &m0, sizeof(int), - STARPU_VALUE, &k, sizeof(int), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zlaswp_args_s), STARPU_R, RUNTIME_perm_getaddr( ipiv, ipivk ), STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), STARPU_RW | STARPU_COMMUTE, RTBLKADDR(U, ChamComplexDouble, Um, Un), @@ -67,19 +77,77 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, 0 ); } +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ + +void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, + int m0, int k, + const CHAM_ipiv_t *ipiv, int ipivk, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *U, int Um, int Un ) +{ + int ret; + struct starpu_task *task; + + if ( A->get_rankof( A, Am, An) != A->myrank ) { + return; + } + + INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zlaswp_get, zlaswp_get, zlaswp, 3); + + /* + * Register the data handles, might need to receive perm and invp + */ + starpu_cham_exchange_init_params( options, &params, U->get_rankof( U, Um, Un ) ); + starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs, + RUNTIME_perm_getaddr( ipiv, ipivk ), + STARPU_R ); + starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( A, ChamComplexDouble, Am, An ), STARPU_R ); + starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( U, ChamComplexDouble, Um, Un ), + STARPU_RW | STARPU_COMMUTE ); + + task = starpu_task_create(); + task->cl = cl; + + clargs = malloc( sizeof( struct cl_zlaswp_args_s ) ); + clargs->m0 = m0; + clargs->k = k; + + task->cl_arg = clargs; + 
task->cl_arg_size = sizeof( struct cl_zlaswp_args_s ); + task->cl_arg_free = 1; + + starpu_cham_task_set_options( options, task, nbdata, descrs, NULL ); + + /* Flops */ + task->flops = 0.; + + /* Refine name */ + task->name = cl_name; + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zlaswp_get", "Failed to submit the task to StarPU" ); + return; + } + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); +} + +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ + #if !defined(CHAMELEON_SIMULATION) static void cl_zlaswp_set_cpu_func( void *descr[], void *cl_arg ) { - int m0, k, *invp; + struct cl_zlaswp_args_s *clargs = (struct cl_zlaswp_args_s *)cl_arg; + int *invp; CHAM_tile_t *A, *B; - starpu_codelet_unpack_args( cl_arg, &m0, &k ); - invp = (int *)STARPU_VECTOR_GET_PTR( descr[0] ); A = (CHAM_tile_t *) cti_interface_get( descr[1] ); B = (CHAM_tile_t *) cti_interface_get( descr[2] ); - TCORE_zlaswp_set( m0, B->m, B->n, k, A, B, invp ); + TCORE_zlaswp_set( clargs->m0, B->m, B->n, clargs->k, A, B, invp ); } #endif @@ -88,6 +156,7 @@ static void cl_zlaswp_set_cpu_func( void *descr[], void *cl_arg ) */ CODELETS_CPU( zlaswp_set, cl_zlaswp_set_cpu_func ) +#if defined(CHAMELEON_STARPU_USE_INSERT) void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, int m0, int k, const CHAM_ipiv_t *ipiv, int ipivk, @@ -95,16 +164,20 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, const CHAM_desc_t *B, int Bm, int Bn ) { struct starpu_codelet *codelet = &cl_zlaswp_set; - if ( A->get_rankof( B, Bm, Bn) != A->myrank ) { + if ( B->get_rankof( B, Bm, Bn) != A->myrank ) { return; } + struct cl_zlaswp_args_s *clargs; + clargs = malloc( sizeof( struct cl_zlaswp_args_s ) ); + clargs->m0 = m0; + clargs->k = k; + //void (*callback)(void*) = options->profiling ? 
cl_zlaswp_set_callback : NULL; rt_starpu_insert_task( codelet, - STARPU_VALUE, &m0, sizeof(int), - STARPU_VALUE, &k, sizeof(int), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zlaswp_args_s), STARPU_R, RUNTIME_invp_getaddr( ipiv, ipivk ), STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), STARPU_RW, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), @@ -113,3 +186,60 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, STARPU_EXECUTE_ON_WORKER, options->workerid, 0 ); } +#else +void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, + int m0, int k, + const CHAM_ipiv_t *ipiv, int ipivk, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + int ret; + struct starpu_task *task; + + if ( B->get_rankof( B, Bm, Bn) != A->myrank ) { + return; + } + + INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zlaswp_set, zlaswp_set, zlaswp, 3); + + /* + * Register the data handles, might need to receive perm and invp + */ + starpu_cham_exchange_init_params( options, &params, B->get_rankof( B, Bm, Bn ) ); + starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs, + RUNTIME_invp_getaddr( ipiv, ipivk ), + STARPU_R ); + starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( A, ChamComplexDouble, Am, An ), STARPU_R ); + starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( B, ChamComplexDouble, Bm, Bn ), STARPU_RW ); + + task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_zlaswp_args_s ) ); + clargs->m0 = m0; + clargs->k = k; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_zlaswp_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, NULL ); + + /* Flops */ + task->flops = 0.; + + /* Refine name */ + task->name = cl_name; + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zlaswp_set", 
"Failed to submit the task to StarPU" ); + return; + } + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); +} +#endif diff --git a/runtime/starpu/codelets/codelet_zlaswp_batched.c b/runtime/starpu/codelets/codelet_zlaswp_batched.c index f43a68947a9798d15377bb460a54998f208da898..8cc2a3adc593c698f3d79163781f44bd59b92d6e 100644 --- a/runtime/starpu/codelets/codelet_zlaswp_batched.c +++ b/runtime/starpu/codelets/codelet_zlaswp_batched.c @@ -18,7 +18,7 @@ #include "chameleon_starpu_internal.h" #include "runtime_codelet_z.h" -struct cl_laswp_batched_args_t { +struct cl_zlaswp_batched_args_s { int tasks_nbr; int minmn; int m0[CHAMELEON_BATCH_SIZE]; @@ -32,7 +32,7 @@ cl_zlaswp_batched_cpu_func( void *descr[], { int i, m0, minmn, *perm, *invp; CHAM_tile_t *A, *U, *B; - struct cl_laswp_batched_args_t *clargs = ( struct cl_laswp_batched_args_t * ) cl_arg; + struct cl_zlaswp_batched_args_s *clargs = ( struct cl_zlaswp_batched_args_s * ) cl_arg; minmn = clargs->minmn; perm = (int *)STARPU_VECTOR_GET_PTR( descr[0] ); @@ -73,14 +73,13 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, { int task_num = 0; int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size_swap; - int nhandles; - struct cl_laswp_batched_args_t *clargs = *clargs_ptr; + struct cl_zlaswp_batched_args_s *clargs = *clargs_ptr; if ( Am->get_rankof( Am, Amm, Amn) != Am->myrank ) { return; } if( clargs == NULL ) { - clargs = malloc( sizeof( struct cl_laswp_batched_args_t ) ) ; + clargs = malloc( sizeof( struct cl_zlaswp_batched_args_s ) ) ; clargs->tasks_nbr = 0; clargs->minmn = minmn; *clargs_ptr = clargs; @@ -93,24 +92,12 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, clargs->tasks_nbr ++; if ( clargs->tasks_nbr == batch_size ) { - nhandles = clargs->tasks_nbr; - rt_starpu_insert_task( - &cl_zlaswp_batched, - STARPU_CL_ARGS, clargs, sizeof(struct cl_laswp_batched_args_t), - STARPU_R, RUNTIME_perm_getaddr( ipiv, ipivk ), - STARPU_R, 
RUNTIME_invp_getaddr( ipiv, ipivk ), - STARPU_RW | STARPU_COMMUTE, RTBLKADDR(U, ChamComplexDouble, Um, Un), - STARPU_R, RTBLKADDR(Ak, ChamComplexDouble, Akm, Akn), - STARPU_DATA_MODE_ARRAY, clargs->handle_mode, nhandles, - STARPU_PRIORITY, options->priority, - STARPU_EXECUTE_ON_WORKER, options->workerid, - 0 ); - - /* clargs is freed by starpu. */ - *clargs_ptr = NULL; + INSERT_TASK_zlaswp_batched_flush( options, ipiv, ipivk, Ak, Akm, Akn, U, Um, Un, clargs_ptr ); } } +#if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options, const CHAM_ipiv_t *ipiv, int ipivk, @@ -122,7 +109,7 @@ void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options, int Un, void **clargs_ptr ) { - struct cl_laswp_batched_args_t *clargs = *clargs_ptr; + struct cl_zlaswp_batched_args_s *clargs = *clargs_ptr; int nhandles; if( clargs == NULL ) { @@ -132,7 +119,7 @@ void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options, nhandles = clargs->tasks_nbr; rt_starpu_insert_task( &cl_zlaswp_batched, - STARPU_CL_ARGS, clargs, sizeof(struct cl_laswp_batched_args_t), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zlaswp_batched_args_s), STARPU_R, RUNTIME_perm_getaddr( ipiv, ipivk ), STARPU_R, RUNTIME_invp_getaddr( ipiv, ipivk ), STARPU_RW | STARPU_COMMUTE, RTBLKADDR(U, ChamComplexDouble, Um, Un), @@ -145,3 +132,74 @@ void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options, /* clargs is freed by starpu. 
*/ *clargs_ptr = NULL; } + +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ + +void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options, + const CHAM_ipiv_t *ipiv, + int ipivk, + const CHAM_desc_t *Ak, + int Akm, + int Akn, + const CHAM_desc_t *U, + int Um, + int Un, + void **clargs_ptr ) +{ + int ret, k; + struct starpu_task *task; + struct cl_zlaswp_batched_args_s *myclargs = *clargs_ptr; + + if( myclargs == NULL ) { + return; + } + + INSERT_TASK_COMMON_PARAMETERS( zlaswp_batched, myclargs->tasks_nbr + 4 ); + + /* + * Register the data handles, might need to receive perm and invp + */ + starpu_cham_exchange_init_params( options, &params, Ak->myrank ); + starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs, + RUNTIME_perm_getaddr( ipiv, ipivk ), + STARPU_R ); + starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs, + RUNTIME_invp_getaddr( ipiv, ipivk ), + STARPU_R ); + starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( U, ChamComplexDouble, Um, Un ), + STARPU_RW | STARPU_COMMUTE ); + starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( Ak, ChamComplexDouble, Akm, Akn ), STARPU_R ); + for ( k = 0; k < myclargs->tasks_nbr; k++ ) { + starpu_cham_register_descr( &nbdata, descrs, myclargs->handle_mode[ k ].handle, STARPU_RW ); + } + + task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + task->cl_arg = myclargs; + task->cl_arg_size = sizeof( struct cl_zlaswp_batched_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, NULL ); + + /* Flops */ + task->flops = 0.; + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zlaswp_batched", "Failed to submit the task to StarPU" ); + return; + } + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); + + /* clargs is freed by starpu. 
*/ + *clargs_ptr = NULL; + (void)clargs; + (void)cl_name; +} + +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ diff --git a/runtime/starpu/codelets/codelet_zperm_allreduce.c b/runtime/starpu/codelets/codelet_zperm_allreduce.c index 93bd984ab215224b751a325f775ec85e422df1a1..e32b7ad9c46a2303eb1c4c6a18d442935fca6d3a 100644 --- a/runtime/starpu/codelets/codelet_zperm_allreduce.c +++ b/runtime/starpu/codelets/codelet_zperm_allreduce.c @@ -21,8 +21,9 @@ #include <coreblas/cblas_wrapper.h> #if defined(CHAMELEON_USE_MPI) -struct cl_redux_args_t { +struct cl_redux_args_s { int tempmm; + int mb; int n; int p; int q; @@ -35,7 +36,7 @@ struct cl_redux_args_t { static void cl_zperm_allreduce_cpu_func( void *descr[], void *cl_arg ) { - struct cl_redux_args_t *clargs = (struct cl_redux_args_t *) cl_arg; + struct cl_redux_args_s *clargs = (struct cl_redux_args_s *) cl_arg; const CHAM_tile_t *tileUinout = cti_interface_get( descr[0] ); const CHAM_tile_t *tileUin = cti_interface_get( descr[1] ); const int *perm = (int *)STARPU_VECTOR_GET_PTR( descr[2] ); @@ -43,6 +44,7 @@ cl_zperm_allreduce_cpu_func( void *descr[], void *cl_arg ) const CHAMELEON_Complex64_t *Uin = CHAM_tile_get_ptr( tileUin ); int tempmm = clargs->tempmm; + int mb = clargs->mb; int n = clargs->n; int p = clargs->p; int q = clargs->q; @@ -51,7 +53,6 @@ cl_zperm_allreduce_cpu_func( void *descr[], void *cl_arg ) int np = clargs->np_inv; int me = ( p <= np ) ? 
clargs->me / q : ( ( clargs->me / q ) - p_first + p ) % p; int nb = tileUinout->n; - int mb = tileUinout->m; int first = me - 2 * shift + 1; int last = me - shift; int i, m, ownerp; @@ -71,6 +72,8 @@ cl_zperm_allreduce_cpu_func( void *descr[], void *cl_arg ) CODELETS_CPU( zperm_allreduce, cl_zperm_allreduce_cpu_func ) +#if defined(CHAMELEON_STARPU_USE_INSERT) + static void INSERT_TASK_zperm_allreduce_send( const RUNTIME_option_t *options, CHAM_desc_t *U, @@ -101,9 +104,10 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, int np, int p_first ) { - struct cl_redux_args_t *clargs; - clargs = malloc( sizeof( struct cl_redux_args_t ) ); + struct cl_redux_args_s *clargs; + clargs = malloc( sizeof( struct cl_redux_args_s ) ); clargs->tempmm = tempmm; + clargs->mb = U->mb; clargs->n = n; clargs->p = p; clargs->q = q; @@ -114,7 +118,7 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zperm_allreduce, - STARPU_CL_ARGS, clargs, sizeof(struct cl_redux_args_t), + STARPU_CL_ARGS, clargs, sizeof(struct cl_redux_args_s), STARPU_RW, RTBLKADDR(U, CHAMELEON_Complex64_t, me, n), STARPU_R, RTBLKADDR(U, CHAMELEON_Complex64_t, src, n), STARPU_R, RUNTIME_perm_getaddr( ipiv, ipivk ), @@ -125,6 +129,96 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, starpu_mpi_cache_flush( options->sequence->comm, RTBLKADDR(U, CHAMELEON_Complex64_t, src, n) ); } +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ + +static void +INSERT_TASK_zperm_allreduce_send( const RUNTIME_option_t *options, + CHAM_desc_t *U, + int me, + int dst, + int n ) +{ + INSERT_TASK_COMMON_PARAMETERS_CLNULL( zperm_allreduce_send, 1 ); + + starpu_cham_exchange_init_params( options, &params, dst ); + starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs, + RTBLKADDR( U, ChamComplexDouble, me, n ), + STARPU_R ); + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); + (void)cl; + (void)cl_name; +} + 
+static void +INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, + CHAM_desc_t *U, + CHAM_ipiv_t *ipiv, + int ipivk, + int me, + int src, + int n, + int tempmm, + int p, + int q, + int shift, + int np, + int p_first ) +{ + int ret; + struct starpu_task *task; + + INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zperm_allreduce_send, zperm_allreduce, redux, 3 ); + + starpu_cham_exchange_init_params( options, &params, me ); + starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs, + RTBLKADDR( U, ChamComplexDouble, me, n ), + STARPU_RW ); + starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs, + RTBLKADDR( U, ChamComplexDouble, src, n ), + STARPU_R ); + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_perm_getaddr( ipiv, ipivk ), STARPU_R ); + + task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_redux_args_s ) ); + clargs->tempmm = tempmm; + clargs->mb = U->mb; + clargs->n = n; + clargs->p = p; + clargs->q = q; + clargs->p_first = p_first; + clargs->me = me; + clargs->shift = shift; + clargs->np_inv = np; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_redux_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, NULL ); + + /* Flops */ + task->flops = 0.; + + /* Refine name */ + task->name = cl_name; + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zperm_allreduce", "Failed to submit the task to StarPU" ); + return; + } + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); + starpu_mpi_cache_flush( options->sequence->comm, RTBLKADDR(U, CHAMELEON_Complex64_t, src, n) ); +} + +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ + static void zperm_allreduce_chameleon_starpu_task( const RUNTIME_option_t *options, const CHAM_desc_t *A, 
diff --git a/runtime/starpu/codelets/codelet_zpotrf.c b/runtime/starpu/codelets/codelet_zpotrf.c index fb83eccc0d65e348efa665eb4f84df5563e246a4..80be1490512d9cc63ddfc6a2f9d98cd6d9f92585 100644 --- a/runtime/starpu/codelets/codelet_zpotrf.c +++ b/runtime/starpu/codelets/codelet_zpotrf.c @@ -66,6 +66,7 @@ CODELETS_CPU( zpotrf, cl_zpotrf_cpu_func ) #endif #if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zpotrf( const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int nb, const CHAM_desc_t *A, int Am, int An, @@ -116,7 +117,7 @@ void INSERT_TASK_zpotrf( const RUNTIME_option_t *options, (void)nb; } -#else +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ void INSERT_TASK_zpotrf( const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int nb, @@ -129,7 +130,7 @@ void INSERT_TASK_zpotrf( const RUNTIME_option_t *options, * Set the data handles and initialize exchanges if needed */ starpu_cham_exchange_init_params( options, &params, A->get_rankof( A, Am, An ) ); - starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_RW ); + starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_RW ); /* * Not involved, let's return @@ -180,4 +181,4 @@ void INSERT_TASK_zpotrf( const RUNTIME_option_t *options, (void)nb; } -#endif +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ diff --git a/runtime/starpu/codelets/codelet_zsymm.c b/runtime/starpu/codelets/codelet_zsymm.c index 4298cd8dd3716c1d112949eaa6689cbacad68a37..c715baff60f576206889892a9d6e07263041c9a1 100644 --- a/runtime/starpu/codelets/codelet_zsymm.c +++ b/runtime/starpu/codelets/codelet_zsymm.c @@ -127,6 +127,7 @@ CODELETS( zsymm, cl_zsymm_cpu_func, cl_zsymm_cuda_func, STARPU_CUDA_ASYNC ) #endif #if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, int m, int n, int nb, @@ -275,7 +276,7 @@ void INSERT_TASK_zsymm( const RUNTIME_option_t *options, 0 ); } 
-#else +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ void __INSERT_TASK_zsymm( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, @@ -296,9 +297,9 @@ void __INSERT_TASK_zsymm( const RUNTIME_option_t *options, * Set the data handles and initialize exchanges if needed */ starpu_cham_exchange_init_params( options, ¶ms, xrank ); - starpu_cham_exchange_data_before_execution( options, ¶ms, &nbdata, descrs, A, Am, An, STARPU_R ); - starpu_cham_exchange_data_before_execution( options, ¶ms, &nbdata, descrs, B, Bm, Bn, STARPU_R ); - starpu_cham_exchange_data_before_execution( options, ¶ms, &nbdata, descrs, C, Cm, Cn, accessC ); + starpu_cham_exchange_tile_before_execution( options, ¶ms, &nbdata, descrs, A, Am, An, STARPU_R ); + starpu_cham_exchange_tile_before_execution( options, ¶ms, &nbdata, descrs, B, Bm, Bn, STARPU_R ); + starpu_cham_exchange_tile_before_execution( options, ¶ms, &nbdata, descrs, C, Cm, Cn, accessC ); /* * Not involved, let's return @@ -392,4 +393,5 @@ void INSERT_TASK_zsymm( const RUNTIME_option_t *options, B, Bm, Bn, beta, C, Cm, Cn ); } -#endif + +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ diff --git a/runtime/starpu/codelets/codelet_zsyr2k.c b/runtime/starpu/codelets/codelet_zsyr2k.c index 6ab60fbe5831fc58ccb18c8b1c3d8346ef49ca47..145b926466b2758ada8efd859bc1924bff52ad8e 100644 --- a/runtime/starpu/codelets/codelet_zsyr2k.c +++ b/runtime/starpu/codelets/codelet_zsyr2k.c @@ -109,6 +109,7 @@ CODELETS( zsyr2k, cl_zsyr2k_cpu_func, cl_zsyr2k_cuda_func, STARPU_CUDA_ASYNC ) #endif #if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, @@ -176,7 +177,7 @@ void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options, (void)nb; } -#else +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, @@ -201,9 +202,9 @@ void INSERT_TASK_zsyr2k( const 
RUNTIME_option_t *options, * Set the data handles and initialize exchanges if needed */ starpu_cham_exchange_init_params( options, ¶ms, C->get_rankof( C, Cm, Cn ) ); - starpu_cham_exchange_data_before_execution( options, ¶ms, &nbdata, descrs, A, Am, An, STARPU_R ); - starpu_cham_exchange_data_before_execution( options, ¶ms, &nbdata, descrs, B, Bm, Bn, STARPU_R ); - starpu_cham_exchange_data_before_execution( options, ¶ms, &nbdata, descrs, C, Cm, Cn, accessC ); + starpu_cham_exchange_tile_before_execution( options, ¶ms, &nbdata, descrs, A, Am, An, STARPU_R ); + starpu_cham_exchange_tile_before_execution( options, ¶ms, &nbdata, descrs, B, Bm, Bn, STARPU_R ); + starpu_cham_exchange_tile_before_execution( options, ¶ms, &nbdata, descrs, C, Cm, Cn, accessC ); /* * Not involved, let's return @@ -257,4 +258,4 @@ void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options, (void)nb; } -#endif +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c index d2ea231706b2409f5c440986f0af2175dcb40390..722aa51729022207759258c583bc6c9363851dcd 100644 --- a/runtime/starpu/codelets/codelet_zsyrk.c +++ b/runtime/starpu/codelets/codelet_zsyrk.c @@ -111,6 +111,7 @@ CODELETS( zsyrk, cl_zsyrk_cpu_func, cl_zsyrk_cuda_func, STARPU_CUDA_ASYNC ) #endif #if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zsyrk( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, @@ -176,7 +177,7 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options, (void)nb; } -#else +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, @@ -200,8 +201,8 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options, * Set the data handles and initialize exchanges if needed */ starpu_cham_exchange_init_params( options, ¶ms, C->get_rankof( C, Cm, Cn ) ); - starpu_cham_exchange_data_before_execution( 
options, ¶ms, &nbdata, descrs, A, Am, An, STARPU_R ); - starpu_cham_exchange_data_before_execution( options, ¶ms, &nbdata, descrs, C, Cm, Cn, accessC ); + starpu_cham_exchange_tile_before_execution( options, ¶ms, &nbdata, descrs, A, Am, An, STARPU_R ); + starpu_cham_exchange_tile_before_execution( options, ¶ms, &nbdata, descrs, C, Cm, Cn, accessC ); /* * Not involved, let's return @@ -254,4 +255,4 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options, (void)nb; } -#endif +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ diff --git a/runtime/starpu/codelets/codelet_ztradd.c b/runtime/starpu/codelets/codelet_ztradd.c index 3c9786734efe00dd355871336ab954922a4e0fcb..f88fdae60663dd49d12a53e4d5a822d466e603b2 100644 --- a/runtime/starpu/codelets/codelet_ztradd.c +++ b/runtime/starpu/codelets/codelet_ztradd.c @@ -54,6 +54,7 @@ cl_ztradd_cpu_func(void *descr[], void *cl_arg) CODELETS_CPU( ztradd, cl_ztradd_cpu_func ) #if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_ztradd( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int m, int n, int nb, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, @@ -112,7 +113,7 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options, (void)nb; } -#else +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ void INSERT_TASK_ztradd( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int m, int n, int nb, @@ -136,8 +137,8 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options, * Set the data handles and initialize exchanges if needed */ starpu_cham_exchange_init_params( options, ¶ms, B->get_rankof( B, Bm, Bn ) ); - starpu_cham_exchange_data_before_execution( options, ¶ms, &nbdata, descrs, A, Am, An, STARPU_R ); - starpu_cham_exchange_data_before_execution( options, ¶ms, &nbdata, descrs, B, Bm, Bn, accessB ); + starpu_cham_exchange_tile_before_execution( options, ¶ms, &nbdata, descrs, A, Am, An, STARPU_R ); + starpu_cham_exchange_tile_before_execution( options, ¶ms, 
&nbdata, descrs, B, Bm, Bn, accessB ); /* * Not involved, let's return @@ -190,4 +191,4 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options, (void)nb; } -#endif +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c index bafef35505b4bed91e0ef7f268102589ed4a3fec..c44697a83b9c562d722df59c6328bffd53ef797a 100644 --- a/runtime/starpu/codelets/codelet_ztrmm.c +++ b/runtime/starpu/codelets/codelet_ztrmm.c @@ -107,6 +107,7 @@ CODELETS( ztrmm, cl_ztrmm_cpu_func, cl_ztrmm_cuda_func, STARPU_CUDA_ASYNC ) #endif #if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_ztrmm( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, int m, int n, int nb, @@ -162,7 +163,7 @@ void INSERT_TASK_ztrmm( const RUNTIME_option_t *options, (void)nb; } -#else +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ void INSERT_TASK_ztrmm( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, @@ -176,8 +177,8 @@ void INSERT_TASK_ztrmm( const RUNTIME_option_t *options, * Set the data handles and initialize exchanges if needed */ starpu_cham_exchange_init_params( options, ¶ms, B->get_rankof( B, Bm, Bn ) ); - starpu_cham_exchange_data_before_execution( options, ¶ms, &nbdata, descrs, A, Am, An, STARPU_R ); - starpu_cham_exchange_data_before_execution( options, ¶ms, &nbdata, descrs, B, Bm, Bn, STARPU_RW ); + starpu_cham_exchange_tile_before_execution( options, ¶ms, &nbdata, descrs, A, Am, An, STARPU_R ); + starpu_cham_exchange_tile_before_execution( options, ¶ms, &nbdata, descrs, B, Bm, Bn, STARPU_RW ); /* * Not involved, let's return @@ -231,4 +232,4 @@ void INSERT_TASK_ztrmm( const RUNTIME_option_t *options, (void)nb; } -#endif +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c index 
70fdf057fa3e087bfe9e0d4df61b8727c7aae492..719ce05c3369c4433976554e105ea5a18da73872 100644 --- a/runtime/starpu/codelets/codelet_ztrsm.c +++ b/runtime/starpu/codelets/codelet_ztrsm.c @@ -114,6 +114,7 @@ CODELETS( ztrsm, cl_ztrsm_cpu_func, cl_ztrsm_cuda_func, STARPU_CUDA_ASYNC ) #endif #if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_ztrsm( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, int m, int n, int nb, @@ -170,7 +171,7 @@ void INSERT_TASK_ztrsm( const RUNTIME_option_t *options, (void)nb; } -#else +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ void INSERT_TASK_ztrsm( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, @@ -184,8 +185,8 @@ void INSERT_TASK_ztrsm( const RUNTIME_option_t *options, * Set the data handles and initialize exchanges if needed */ starpu_cham_exchange_init_params( options, ¶ms, B->get_rankof( B, Bm, Bn ) ); - starpu_cham_exchange_data_before_execution( options, ¶ms, &nbdata, descrs, A, Am, An, STARPU_R ); - starpu_cham_exchange_data_before_execution( options, ¶ms, &nbdata, descrs, B, Bm, Bn, STARPU_RW ); + starpu_cham_exchange_tile_before_execution( options, ¶ms, &nbdata, descrs, A, Am, An, STARPU_R ); + starpu_cham_exchange_tile_before_execution( options, ¶ms, &nbdata, descrs, B, Bm, Bn, STARPU_RW ); /* * Not involved, let's return @@ -239,4 +240,4 @@ void INSERT_TASK_ztrsm( const RUNTIME_option_t *options, (void)nb; } -#endif +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ diff --git a/runtime/starpu/include/chameleon_starpu_internal.h b/runtime/starpu/include/chameleon_starpu_internal.h index c202955a80a5454fdb7c64bbb1f9db5ccd29db17..1100baeeb5a5065d5565697187162aa974c727ac 100644 --- a/runtime/starpu/include/chameleon_starpu_internal.h +++ b/runtime/starpu/include/chameleon_starpu_internal.h @@ -226,11 +226,65 @@ chameleon_starpu_data_iscached(const CHAM_desc_t *A, int m, int n) const char *cl_name = 
#_name_; \ int nbdata = 0; +#define INSERT_TASK_COMMON_PARAMETERS_EXTENDED( _name_task_, _name_cl_, _name_arg_, _nbuffer_ ) \ + struct starpu_data_descr descrs[_nbuffer_]; \ + struct starpu_mpi_task_exchange_params params; \ + struct cl_##_name_arg_##_args_s *clargs = NULL; \ + struct starpu_codelet *cl = &cl_##_name_cl_; \ + const char *cl_name = #_name_task_; \ + int nbdata = 0; + +#define INSERT_TASK_COMMON_PARAMETERS_CLNULL( _name_, _nbuffer_ ) \ + struct starpu_data_descr descrs[_nbuffer_]; \ + struct starpu_mpi_task_exchange_params params; \ + struct starpu_codelet *cl = NULL; \ + const char *cl_name = #_name_; \ + int nbdata = 0; + /** * This section defines the codelet functions to manage MPI cache and data * echanges before and after submitting tasks */ #if !defined(CHAMELEON_STARPU_USE_INSERT) + +/** + * @brief Internal function to register a data handle and its access mode in the StarPU descriptors array. + * + * @param[in,out] nbdata + * On entry the number of data already registered in descrs. On exit, + * the counter is updated if the next handle is registered in the + * structure. + * + * @param[in,out] descrs + * The array of starpu data descriptors (handle + mode). On entry, it + * is allocated to the maximum number of data for the task, and + * contains the already registered nbdata handles and their associated + * modes. On exit, it is updated with the new handle if needed. 
+ * + * @param[in] handle + * The data handle + * + * @param[in] mode + * The access mode + * + */ +static inline void +starpu_cham_register_descr( int *nbdata, + struct starpu_data_descr *descrs, + starpu_data_handle_t handle, + enum starpu_data_access_mode mode ) +{ + if ( mode & STARPU_NONE ) { + return; + } + + descrs[*nbdata].handle = handle; + descrs[*nbdata].mode = mode; + + (*nbdata)++; + return; +} + #if !defined(CHAMELEON_USE_MPI) /** @@ -251,7 +305,22 @@ starpu_cham_exchange_init_params( const RUNTIME_option_t *option } static inline void -starpu_cham_exchange_data_before_execution( const RUNTIME_option_t *options, +starpu_cham_exchange_handle_before_execution( const RUNTIME_option_t *options, + struct starpu_mpi_task_exchange_params *params, + int *nbdata, + struct starpu_data_descr *descrs, + starpu_data_handle_t handle, + enum starpu_data_access_mode mode ) +{ + starpu_cham_register_descr( nbdata, descrs, handle, mode ); + + (void)options; + (void)params; + return; +} + +static inline void +starpu_cham_exchange_tile_before_execution( const RUNTIME_option_t *options, struct starpu_mpi_task_exchange_params *params, int *nbdata, struct starpu_data_descr *descrs, @@ -260,9 +329,7 @@ starpu_cham_exchange_data_before_execution( const RUNTIME_option_t int An, enum starpu_data_access_mode mode ) { - descrs[*nbdata].handle = RTBLKADDR( A, ChamComplexDouble, Am, An ); - descrs[*nbdata].mode = mode; - (*nbdata)++; + starpu_cham_register_descr( nbdata, descrs, RTBLKADDR( A, ChamComplexDouble, Am, An ), mode ); (void)options; (void)params; @@ -301,6 +368,56 @@ starpu_cham_exchange_init_params( const RUNTIME_option_t *option params->exchange_needed = 0; } +/** + * @brief Internal wrapper to starpu_mpi_exchange_data_before_execution(), + * that also registers the data handle and its access mode in the array of + * StarPU data descriptors of the task. 
+ * + * @param[in] options + * The options to parameterize the task + * + * @param[in] params + * The starpu parameters for the exchange functions. Needs to be + * initialized by starpu_cham_exchange_init_params() function. + * + * @param[in,out] nbdata + * On entry the number of data already registered in descrs. On exit, + * the counter is updated if the next handle is registered in the + * structure. + * + * @param[in,out] descrs + * The array of starpu data descriptors (handle + mode). On entry, it + * is allocated to the maximum number of data for the task, and + * contains the already registered nbdata handles and their associated + * modes. On exit, it is updated with the new handle if needed. + * + * @param[in] handle + * The data handle + * + * @param[in] mode + * The access mode + * + */ +static inline void +starpu_cham_exchange_handle_before_execution( const RUNTIME_option_t *options, + struct starpu_mpi_task_exchange_params *params, + int *nbdata, + struct starpu_data_descr *descrs, + starpu_data_handle_t handle, + enum starpu_data_access_mode mode ) +{ + if ( mode & STARPU_NONE ) { + return; + } + + starpu_cham_register_descr( nbdata, descrs, handle, mode ); + + starpu_mpi_exchange_data_before_execution( options->sequence->comm, + handle, mode, params ); + + return; +} + /** * @brief Internal wrapper to starpu_mpi_task_exchange_data_before_execution(), * that also perform the cache operation done in the CAHMELEON_ACCESS_X() macros @@ -338,7 +455,7 @@ starpu_cham_exchange_init_params( const RUNTIME_option_t *option * */ static inline void -starpu_cham_exchange_data_before_execution( const RUNTIME_option_t *options, +starpu_cham_exchange_tile_before_execution( const RUNTIME_option_t *options, struct starpu_mpi_task_exchange_params *params, int *nbdata, struct starpu_data_descr *descrs, @@ -377,13 +494,8 @@ starpu_cham_exchange_data_before_execution( const RUNTIME_option_t * If we need to submit, let's create the data handle and ask StarPU to perform * 
the necessary communications */ - descrs[*nbdata].handle = RTBLKADDR( A, ChamComplexDouble, Am, An ); - descrs[*nbdata].mode = mode; - - starpu_mpi_exchange_data_before_execution( - options->sequence->comm, descrs[*nbdata].handle, mode, params ); - - (*nbdata)++; + starpu_cham_exchange_handle_before_execution( options, params, nbdata, descrs, + RTBLKADDR( A, ChamComplexDouble, Am, An ), mode ); return; } @@ -447,6 +559,7 @@ starpu_cham_task_set_options( const RUNTIME_option_t *options, struct starpu_data_descr *descrs, callback_fct_t callback ) { + int allocated_buffers = 0; int i; task->priority = options->priority; @@ -473,6 +586,10 @@ starpu_cham_task_set_options( const RUNTIME_option_t *options, // task->where; /* Do restriction here */ task->nbuffers = nbdata; + + /* Dynamic handles */ + starpu_task_insert_data_make_room( task->cl, task, &allocated_buffers, 0, task->nbuffers ); + for ( i = 0; i < task->nbuffers; i++ ) { enum starpu_data_access_mode mode = descrs[i].mode; assert( descrs[i].handle ); diff --git a/testing/CTestLists.cmake b/testing/CTestLists.cmake index c8d012141de283ea45c4ebe3a4ae3f9270d7e435..39b7e89e04daf060dcacc87d17b851ee83e3191d 100644 --- a/testing/CTestLists.cmake +++ b/testing/CTestLists.cmake @@ -94,26 +94,28 @@ if (NOT CHAMELEON_SIMULATION) endforeach() if ( CHAMELEON_SCHED_STARPU ) - add_test( test_${cat}_${prec}getrf_nopivpercol ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 --diag=ChamUnit -f input/getrf_nopiv.in ) - set_tests_properties( test_${cat}_${prec}getrf_nopivpercol - PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=nopivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=0" ) - if ( HAVE_STARPU_NONE_NONZERO ) - add_test( test_${cat}_${prec}getrf_ppivpercol ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf_nopiv.in ) + add_test( test_${cat}_${prec}getrf_ppivpercol ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf.in ) set_tests_properties( test_${cat}_${prec}getrf_ppivpercol - PROPERTIES ENVIRONMENT 
"CHAMELEON_GETRF_ALGO=ppivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=0" ) + PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=0" ) - add_test( test_${cat}_${prec}getrf_ppivpercol_batch ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf_nopiv.in ) + add_test( test_${cat}_${prec}getrf_ppivpercol_batch ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf.in ) set_tests_properties( test_${cat}_${prec}getrf_ppivpercol_batch - PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=6" ) + PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=3" ) + + add_test( test_${cat}_${prec}getrf_ppivblocked ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf.in ) + set_tests_properties( test_${cat}_${prec}getrf_ppivblocked + PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=0" ) - add_test( test_${cat}_${prec}getrf_ppiv ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf.in ) - set_tests_properties( test_${cat}_${prec}getrf_ppiv - PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=0" ) + add_test( test_${cat}_${prec}getrf_ppivblocked_batch ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf.in ) + set_tests_properties( test_${cat}_${prec}getrf_ppivblocked_batch + PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=3" ) - add_test( test_${cat}_${prec}getrf_ppiv_batch ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf.in ) - set_tests_properties( test_${cat}_${prec}getrf_ppiv_batch - PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=6" ) + if ( ${cat} STREQUAL "mpi" ) + add_test( test_${cat}_${prec}getrf_ppiv_comm_with_task ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P ${NP} -f input/getrf.in ) + set_tests_properties( test_${cat}_${prec}getrf_ppiv_comm_with_task + PROPERTIES ENVIRONMENT 
"CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=0;CHAMELEON_GETRF_ALL_REDUCE=cham_spu_tasks" ) + endif() endif() endif() diff --git a/testing/input/getrf.in b/testing/input/getrf.in index c0c99c52c69b54daa2f915d3fb636ad805b88b84..fec8a04f08f6b357e3e609b7f4421ff83281be47 100644 --- a/testing/input/getrf.in +++ b/testing/input/getrf.in @@ -11,6 +11,6 @@ op = getrf nb = 16, 17 ib = 16, 5 -m = 13, 17, 35 -n = 15, 19, 33 -lda = 41 +m = 13, 17, 35, 130 +n = 15, 19, 33, 115 +lda = 131