diff --git a/runtime/starpu/codelets/codelet_zgetrf_blocked.c b/runtime/starpu/codelets/codelet_zgetrf_blocked.c index 6bbc5f0c4d00574bb53f59538aa179d98cbf4cf7..63ccf116a5ac1b1871d8a9c7aad78dc98af3bca3 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_blocked.c +++ b/runtime/starpu/codelets/codelet_zgetrf_blocked.c @@ -22,6 +22,16 @@ #include "chameleon_starpu_internal.h" #include "runtime_codelet_z.h" +struct cl_zgetrf_blocked_args_s { + int m; + int n; + int h; + int m0; + int ib; + RUNTIME_sequence_t *sequence; + RUNTIME_request_t *request; +}; + CHAMELEON_CL_CB( zgetrf_blocked_diag, cti_handle_get_m(task->handles[0]), 0, 0, M ) CHAMELEON_CL_CB( zgetrf_blocked_offdiag, cti_handle_get_m(task->handles[0]), 0, 0, M ) CHAMELEON_CL_CB( zgetrf_blocked_trsm, cti_handle_get_m(task->handles[0]), 0, 0, M ) @@ -29,9 +39,7 @@ CHAMELEON_CL_CB( zgetrf_blocked_trsm, cti_handle_get_m(task->handles[0]), 0, #if !defined(CHAMELEON_SIMULATION) static void cl_zgetrf_blocked_diag_cpu_func(void *descr[], void *cl_arg) { - int m, n, h, m0, ib; - RUNTIME_sequence_t *sequence; - RUNTIME_request_t *request; + struct cl_zgetrf_blocked_args_s *clargs = (struct cl_zgetrf_blocked_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileU; int *ipiv; @@ -40,9 +48,6 @@ static void cl_zgetrf_blocked_diag_cpu_func(void *descr[], void *cl_arg) CHAMELEON_Complex64_t *U = NULL; int ldu = -1;; - starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &m0, &ib, - &sequence, &request ); - tileA = cti_interface_get(descr[0]); ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[1]); nextpiv = (cppi_interface_t*) descr[2]; @@ -53,10 +58,10 @@ static void cl_zgetrf_blocked_diag_cpu_func(void *descr[], void *cl_arg) ldu = tileU->ld; } - if ( h > 0 ) { + if ( clargs->h > 0 ) { cppi_display_dbg( prevpiv, stderr, "Prevpiv before call: " ); } - if ( h < tileA->n ) { + if ( clargs->h < tileA->n ) { cppi_display_dbg( nextpiv, stderr, "Nextpiv before call: " ); } @@ -64,19 +69,19 @@ static void cl_zgetrf_blocked_diag_cpu_func(void 
*descr[], void *cl_arg) * Make sure the nextpiv interface store the right information about the * column and diagonal row for the reduction */ - nextpiv->h = h; + nextpiv->h = clargs->h; nextpiv->has_diag = 1; coreblas_kernel_trace( tileA ); - CORE_zgetrf_panel_diag( m, n, h, m0, ib, + CORE_zgetrf_panel_diag( clargs->m, clargs->n, clargs->h, clargs->m0, clargs->ib, CHAM_tile_get_ptr( tileA ), tileA->ld, U, ldu, ipiv, &(nextpiv->pivot), &(prevpiv->pivot) ); - if ( h > 0 ) { + if ( clargs->h > 0 ) { cppi_display_dbg( prevpiv, stderr, "Prevpiv after call: " ); } - if ( h < tileA->n ) { + if ( clargs->h < tileA->n ) { cppi_display_dbg( nextpiv, stderr, "Nextpiv after call: " ); } } @@ -87,21 +92,22 @@ static void cl_zgetrf_blocked_diag_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU( zgetrf_blocked_diag, cl_zgetrf_blocked_diag_cpu_func ) +#if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options, int m, int n, int h, int m0, int ib, CHAM_desc_t *A, int Am, int An, CHAM_desc_t *U, int Um, int Un, CHAM_ipiv_t *ipiv ) { - struct starpu_codelet *codelet = &cl_zgetrf_blocked_diag; - void (*callback)(void*) = options->profiling ? cl_zgetrf_blocked_diag_callback : NULL; - const char *cl_name = "zgetrf_blocked_diag"; - int rankA = A->get_rankof(A, Am, An); #if !defined(HAVE_STARPU_NONE_NONZERO) /* STARPU_NONE can't be equal to 0 */ - fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" ); + fprintf( stderr, "INSERT_TASK_zgetrf_blocked_diag: STARPU_NONE can not be equal to 0\n" ); assert( 0 ); #endif + void (*callback)(void*) = options->profiling ? 
cl_zgetrf_blocked_diag_callback : NULL; + const char *cl_name = "zgetrf_blocked_diag"; + int rankA = A->get_rankof(A, Am, An); #if defined ( CHAMELEON_USE_MPI ) if ( ( h % ib == 0 ) && ( h > 0 ) ) { @@ -113,6 +119,17 @@ void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options, } #endif + /* Set codelet parameters */ + struct cl_zgetrf_blocked_args_s *clargs; + clargs = malloc( sizeof( struct cl_zgetrf_blocked_args_s ) ); + clargs->m = m; + clargs->n = n; + clargs->h = h; + clargs->m0 = m0; + clargs->ib = ib; + clargs->sequence = options->sequence; + clargs->request = options->request; + int access_ipiv = ( h == 0 ) ? STARPU_W : STARPU_RW; int access_npiv = ( h == ipiv->n ) ? STARPU_R : STARPU_REDUX; int access_ppiv = ( h == 0 ) ? STARPU_NONE : STARPU_R; @@ -139,19 +156,18 @@ void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options, A->get_blktile( A, Am, An ) ); rt_starpu_insert_task( - codelet, - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &h, sizeof(int), - STARPU_VALUE, &m0, sizeof(int), - STARPU_VALUE, &ib, sizeof(int), - STARPU_VALUE, &(options->sequence), sizeof(RUNTIME_sequence_t*), - STARPU_VALUE, &(options->request), sizeof(RUNTIME_request_t*), + &cl_zgetrf_blocked_diag, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_zgetrf_blocked_args_s), + + /* Task handles */ STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), access_ipiv, RUNTIME_ipiv_getaddr( ipiv, An ), access_npiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ), access_ppiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), accessU, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un), + + /* Common task arguments */ STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, @@ -159,12 +175,97 @@ void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options, 0 ); } +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ + +void INSERT_TASK_zgetrf_blocked_diag( const 
RUNTIME_option_t *options, + int m, int n, int h, int m0, int ib, + CHAM_desc_t *A, int Am, int An, + CHAM_desc_t *U, int Um, int Un, + CHAM_ipiv_t *ipiv ) +{ + int ret, access_ipiv, access_npiv, access_ppiv, accessU; + struct starpu_task *task; + int rankA = A->get_rankof(A, Am, An); + +#if defined ( CHAMELEON_USE_MPI ) + if ( ( h % ib == 0 ) && ( h > 0 ) ) { + starpu_mpi_cache_flush( options->sequence->comm, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un) ); + } + + if ( rankA != A->myrank ) { + return; + } +#endif + + INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zgetrf_blocked_diag, zgetrf_blocked_diag, zgetrf_blocked, 5 ); + + access_ipiv = ( h == 0 ) ? STARPU_W : STARPU_RW; + access_npiv = ( h == ipiv->n ) ? STARPU_R : STARPU_REDUX; + access_ppiv = ( h == 0 ) ? STARPU_NONE : STARPU_R; + accessU = STARPU_RW; + if ( h == 0 ) { + accessU = STARPU_NONE; + /* U can be set after ppiv because they are both none together, so it won't shift the buffers */ + } + else if ( h%ib == 0 ) { + accessU = STARPU_R; + } + else if ( ( h%ib == 1 ) || ( ib == 1 ) ) { + accessU = STARPU_W; + } + + /* + * Register the data handles, no exchange needed + */ + starpu_cham_exchange_init_params( options, &params, rankA ); + starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( A, ChamComplexDouble, Am, An ), STARPU_RW ); + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_ipiv_getaddr( ipiv, An), access_ipiv ); + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ), access_npiv ); + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), access_ppiv ); + starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( U, ChamComplexDouble, Um, Un ), accessU ); + + task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_zgetrf_blocked_args_s ) ); + clargs->m = m; + clargs->n = n; + clargs->h = h; + clargs->m0 = m0; + clargs->ib = ib; + clargs->sequence = options->sequence; + 
clargs->request = options->request; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_zgetrf_blocked_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgetrf_blocked_diag_callback ); + + /* Flops */ + // task->flops = TODO; + + /* Refine name */ + task->name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zgetrf_blocked_diag", "Failed to submit the task to StarPU" ); + return; + } + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); +} + +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ + #if !defined(CHAMELEON_SIMULATION) static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg) { - int m, n, h, k, m0, ib; - RUNTIME_sequence_t *sequence; - RUNTIME_request_t *request; + struct cl_zgetrf_blocked_args_s *clargs = (struct cl_zgetrf_blocked_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileU; cppi_interface_t *nextpiv; @@ -172,8 +273,6 @@ static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg) CHAMELEON_Complex64_t *U = NULL; int ldu = -1; - starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &k, &m0, &ib, &sequence, &request ); - tileA = cti_interface_get(descr[0]); nextpiv = (cppi_interface_t*) descr[1]; prevpiv = (cppi_interface_t*) descr[2]; @@ -183,26 +282,26 @@ static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg) ldu = tileU->ld; } - if ( h > 0 ) { + if ( clargs->h > 0 ) { cppi_display_dbg( prevpiv, stderr, "Prevpiv offdiag before call: " ); } - if ( h < tileA->n ) { + if ( clargs->h < tileA->n ) { cppi_display_dbg( nextpiv, stderr, "Nextpiv offdiag before call: " ); } - nextpiv->h = h; /* Initialize in case it uses a copy */ + nextpiv->h = clargs->h; /* Initialize in case it uses a copy */ 
nextpiv->has_diag = chameleon_max( -1, nextpiv->has_diag); coreblas_kernel_trace( tileA ); - CORE_zgetrf_panel_offdiag( m, n, h, m0, ib, + CORE_zgetrf_panel_offdiag( clargs->m, clargs->n, clargs->h, clargs->m0, clargs->ib, CHAM_tile_get_ptr(tileA), tileA->ld, U, ldu, &(nextpiv->pivot), &(prevpiv->pivot) ); - if ( h > 0 ) { + if ( clargs->h > 0 ) { cppi_display_dbg( prevpiv, stderr, "Prevpiv offdiag after call: " ); } - if ( h < tileA->n ) { + if ( clargs->h < tileA->n ) { cppi_display_dbg( nextpiv, stderr, "Nextpiv offdiag after call: " ); } } @@ -213,23 +312,23 @@ static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU(zgetrf_blocked_offdiag, cl_zgetrf_blocked_offdiag_cpu_func) +#if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options, int m, int n, int h, int m0, int ib, CHAM_desc_t *A, int Am, int An, CHAM_desc_t *U, int Um, int Un, CHAM_ipiv_t *ipiv ) { - struct starpu_codelet *codelet = &cl_zgetrf_blocked_offdiag; - - int access_npiv = ( h == ipiv->n ) ? STARPU_R : STARPU_REDUX; - int access_ppiv = ( h == 0 ) ? STARPU_NONE : STARPU_R; - int accessU = ((h%ib == 0) && (h > 0)) ? STARPU_R : STARPU_NONE; - int rankA = A->get_rankof(A, Am, An); #if !defined(HAVE_STARPU_NONE_NONZERO) /* STARPU_NONE can't be equal to 0 */ - fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" ); + fprintf( stderr, "INSERT_TASK_zgetrf_blocked_diag: STARPU_NONE can not be equal to 0\n" ); assert( 0 ); #endif + int access_npiv = ( h == ipiv->n ) ? STARPU_R : STARPU_REDUX; + int access_ppiv = ( h == 0 ) ? STARPU_NONE : STARPU_R; + int accessU = ((h%ib == 0) && (h > 0)) ? 
STARPU_R : STARPU_NONE; + int rankA = A->get_rankof(A, Am, An); #if defined ( CHAMELEON_USE_MPI ) if ( rankA != A->myrank ) { @@ -244,6 +343,17 @@ void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options, } #endif + /* Set codelet parameters */ + struct cl_zgetrf_blocked_args_s *clargs; + clargs = malloc( sizeof( struct cl_zgetrf_blocked_args_s ) ); + clargs->m = m; + clargs->n = n; + clargs->h = h; + clargs->m0 = m0; + clargs->ib = ib; + clargs->sequence = options->sequence; + clargs->request = options->request; + void (*callback)(void*) = options->profiling ? cl_zgetrf_blocked_offdiag_callback : NULL; const char *cl_name = "zgetrf_blocked_offdiag"; @@ -260,19 +370,17 @@ void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options, A->get_blktile( A, Am, An ) ); rt_starpu_insert_task( - codelet, - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &h, sizeof(int), - STARPU_VALUE, &An, sizeof(int), - STARPU_VALUE, &m0, sizeof(int), - STARPU_VALUE, &ib, sizeof(int), - STARPU_VALUE, &(options->sequence), sizeof(RUNTIME_sequence_t *), - STARPU_VALUE, &(options->request), sizeof(RUNTIME_request_t *), + &cl_zgetrf_blocked_offdiag, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_zgetrf_blocked_args_s), + + /* Task handles */ STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), access_npiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ), access_ppiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), accessU, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un), + + /* Common task arguments */ STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, @@ -280,19 +388,96 @@ void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options, 0 ); } +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ + +void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options, + int m, int n, int h, int m0, int ib, + CHAM_desc_t *A, int Am, int An, + 
CHAM_desc_t *U, int Um, int Un, + CHAM_ipiv_t *ipiv ) +{ + int ret; + struct starpu_task *task; + int rankA = A->get_rankof(A, Am, An); + int access_npiv = ( h == ipiv->n ) ? STARPU_R : STARPU_REDUX; + int access_ppiv = ( h == 0 ) ? STARPU_NONE : STARPU_R; + int accessU = ((h%ib == 0) && (h > 0)) ? STARPU_R : STARPU_NONE; + +#if defined ( CHAMELEON_USE_MPI ) + if ( rankA != A->myrank ) { + if ( ( accessU != STARPU_NONE ) && + ( A->myrank == A->get_rankof( A, An, An ) ) ) + { + starpu_mpi_get_data_on_node_detached( options->sequence->comm, + RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un), + rankA, NULL, NULL ); + } + return; + } +#endif + + INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zgetrf_blocked_offdiag, zgetrf_blocked_offdiag, zgetrf_blocked, 4 ); + + /* + * Register the data handles, exchange needed only for U + */ + starpu_cham_exchange_init_params( options, &params, rankA ); + starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( A, ChamComplexDouble, Am, An ), STARPU_RW ); + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ), access_npiv ); + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), access_ppiv ); + starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs, + RTBLKADDR( U, ChamComplexDouble, Um, Un ), + accessU ); + + task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_zgetrf_blocked_args_s ) ); + clargs->m = m; + clargs->n = n; + clargs->h = h; + clargs->m0 = m0; + clargs->ib = ib; + clargs->sequence = options->sequence; + clargs->request = options->request; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_zgetrf_blocked_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgetrf_blocked_offdiag_callback ); + + /* Flops */ + // task->flops = TODO; + + /* Refine name */ + task->name = 
chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zgetrf_blocked_offdiag", "Failed to submit the task to StarPU" ); + return; + } + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); +} + +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ + #if !defined(CHAMELEON_SIMULATION) static const CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0; static void cl_zgetrf_blocked_trsm_cpu_func(void *descr[], void *cl_arg) { - int m, n, h, ib; + struct cl_zgetrf_blocked_args_s *clargs = (struct cl_zgetrf_blocked_args_s *)cl_arg; CHAM_tile_t *tileU; cppi_interface_t *prevpiv; CHAMELEON_Complex64_t *U; int ldu; - starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &ib ); - tileU = cti_interface_get(descr[0]); prevpiv = (cppi_interface_t*) descr[1]; U = CHAM_tile_get_ptr( tileU ); @@ -301,16 +486,16 @@ static void cl_zgetrf_blocked_trsm_cpu_func(void *descr[], void *cl_arg) coreblas_kernel_trace( tileU ); /* Copy the final max line of the block and solve */ - cblas_zcopy( n, prevpiv->pivot.pivrow, 1, - U + m - 1, ldu ); + cblas_zcopy( clargs->n, prevpiv->pivot.pivrow, 1, + U + clargs->m - 1, ldu ); - if ( (n-h) > 0 ) { + if ( ( clargs->n - clargs->h ) > 0 ) { cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, - ib, n - h, - CBLAS_SADDR(zone), U + (h-ib) * ldu, ldu, - U + h * ldu, ldu ); + clargs->ib, clargs->n - clargs->h, + CBLAS_SADDR(zone), U + (clargs->h-clargs->ib) * ldu, ldu, + U + clargs->h * ldu, ldu ); } } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -320,13 +505,13 @@ static void cl_zgetrf_blocked_trsm_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU(zgetrf_blocked_trsm, cl_zgetrf_blocked_trsm_cpu_func) +#if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options, int m, int n, int h, int ib, 
CHAM_desc_t *U, int Um, int Un, CHAM_ipiv_t *ipiv ) { - struct starpu_codelet *codelet = &cl_zgetrf_blocked_trsm; - void (*callback)(void*) = options->profiling ? cl_zgetrf_blocked_trsm_callback : NULL; const char *cl_name = "zgetrf_blocked_trsm"; int rankU = U->get_rankof(U, Um, Un); @@ -344,17 +529,86 @@ void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options, return; } + /* Set codelet parameters */ + struct cl_zgetrf_blocked_args_s *clargs; + clargs = malloc( sizeof( struct cl_zgetrf_blocked_args_s ) ); + clargs->m = m; + clargs->n = n; + clargs->h = h; + clargs->ib = ib; + rt_starpu_insert_task( - codelet, - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &h, sizeof(int), - STARPU_VALUE, &ib, sizeof(int), + &cl_zgetrf_blocked_trsm, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_zgetrf_blocked_args_s), + + /* Task handles */ STARPU_RW, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un), STARPU_R, RUNTIME_pivot_getaddr( ipiv, rankU, Un, h-1 ), + + /* Common task arguments */ STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, STARPU_NAME, cl_name, 0 ); } + +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ + +void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options, + int m, int n, int h, int ib, + CHAM_desc_t *U, int Um, int Un, + CHAM_ipiv_t *ipiv ) +{ + int ret; + struct starpu_task *task; + int rankU = U->get_rankof(U, Um, Un); + + if ( U->myrank != rankU ) { + return; + } + + INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zgetrf_blocked_trsm, zgetrf_blocked_trsm, zgetrf_blocked, 2 ); + + /* + * Register the data handles, no exchange needed + */ + starpu_cham_exchange_init_params( options, &params, rankU ); + starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( U, ChamComplexDouble, Um, Un ), STARPU_RW ); + starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankU, Un, h-1 ), STARPU_R ); + + task = 
starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_zgetrf_blocked_args_s ) ); + clargs->m = m; + clargs->n = n; + clargs->h = h; + clargs->ib = ib; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_zgetrf_blocked_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgetrf_blocked_trsm_callback ); + + /* Flops */ + // task->flops = TODO; + + /* Refine name */ + task->name = chameleon_codelet_name( cl_name, 1, U->get_blktile( U, Um, Un ) ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zgetrf_blocked_diag", "Failed to submit the task to StarPU" ); + return; + } + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); +} + +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */