Newer
Older
/**
*
* @file starpu/codelet_zgetrf_batched.c
*
* @copyright 2009-2014 The University of Tennessee and The University of
* Tennessee Research Foundation. All rights reserved.
* @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon zpanel batched StarPU codelets
*
* @version 1.2.0
* @comment Codelets to perform batched panel factorization with partial pivoting
*
* @author Matthieu Kuhn
* @author Alycia Lisito
* @author Philippe Swartvagher
* @date 2024-01-11
* @precisions normal z -> c d s
*
*/
#include "chameleon_starpu.h"
#include "runtime_codelet_z.h"
#include <coreblas/cblas_wrapper.h>
struct cl_getrf_batched_args_t {
int tasks_nbr;
int diag;
int h;
int ib;
int m[CHAMELEON_BATCH_SIZE];
int n[CHAMELEON_BATCH_SIZE];
int m0[CHAMELEON_BATCH_SIZE];
struct starpu_data_descr handle_mode[CHAMELEON_BATCH_SIZE];
};
#if !defined(CHAMELEON_SIMULATION)
static void
cl_zgetrf_panel_offdiag_batched_cpu_func( void *descr[],
void *cl_arg )
{
struct cl_getrf_batched_args_t *clargs = (struct cl_getrf_batched_args_t *) cl_arg;
cppi_interface_t *nextpiv = (cppi_interface_t*) descr[ clargs->tasks_nbr ];
cppi_interface_t *prevpiv = (cppi_interface_t*) descr[ clargs->tasks_nbr + 1 ];
int i, m, n, h, m0, lda;
CHAM_tile_t *tileA;
nextpiv->h = clargs->h;
nextpiv->has_diag = chameleon_max( -1, nextpiv->has_diag );
for ( i = 0; i < clargs->tasks_nbr; i++ ) {
tileA = cti_interface_get( descr[ i ] );
lda = tileA->ld;
m = clargs->m[ i ];
n = clargs->n[ i ];
h = clargs->h;
m0 = clargs->m0[ i ];
CORE_zgetrf_panel_offdiag( m, n, h, m0, n, CHAM_tile_get_ptr(tileA), lda,
NULL, -1, &( nextpiv->pivot ), &( prevpiv->pivot ) );
}
}
#endif /* !defined(CHAMELEON_SIMULATION) */
CODELETS_CPU( zgetrf_panel_offdiag_batched, cl_zgetrf_panel_offdiag_batched_cpu_func )
void
INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options,
int m, int n, int h, int m0,
void *ws,
CHAM_desc_t *A, int Am, int An,
void **clargs_ptr,
CHAM_ipiv_t *ipiv )
{
int task_num = 0;
int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size;
void (*callback)(void*) = NULL;
struct cl_getrf_batched_args_t *clargs = *clargs_ptr;
int rankA = A->get_rankof( A, Am, An );
if ( rankA != A->myrank ) {
return;
}
#if !defined ( HAVE_STARPU_NONE_NONZERO )
/* STARPU_NONE can't be equal to 0 */
fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" );
assert( 0 );
#endif
/* Handle cache */
CHAMELEON_BEGIN_ACCESS_DECLARATION;
CHAMELEON_ACCESS_RW(A, Am, An);
CHAMELEON_END_ACCESS_DECLARATION;
if ( clargs == NULL ) {
clargs = malloc( sizeof( struct cl_getrf_batched_args_t ) ) ;
memset( clargs, 0, sizeof( struct cl_getrf_batched_args_t ) );
clargs->tasks_nbr = 0;
clargs->h = h;
clargs->cl_name = "zgetrf_panel_offdiag_batched";
*clargs_ptr = clargs;
}
task_num = clargs->tasks_nbr;
clargs->m[ task_num ] = m;
clargs->n[ task_num ] = n;
clargs->m0[ task_num ] = m0;
clargs->handle_mode[ task_num ].handle = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
clargs->handle_mode[ task_num ].mode = STARPU_RW;
clargs->tasks_nbr ++;
/* Refine name */
clargs->cl_name = chameleon_codelet_name( clargs->cl_name, 1,
A->get_blktile( A, Am, An ) );
if ( clargs->tasks_nbr == batch_size ) {
int access_npiv = ( h == ipiv->n ) ? STARPU_R : STARPU_REDUX;
int access_ppiv = ( h == 0 ) ? STARPU_NONE : STARPU_R;
rt_starpu_insert_task(
&cl_zgetrf_panel_offdiag_batched,
/* Task codelet arguments */
STARPU_CL_ARGS, clargs, sizeof(struct cl_getrf_batched_args_t),
STARPU_DATA_MODE_ARRAY, clargs->handle_mode, clargs->tasks_nbr,
access_npiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ),
access_ppiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ),
STARPU_PRIORITY, options->priority,
STARPU_CALLBACK, callback,
STARPU_EXECUTE_ON_WORKER, options->workerid,
#if defined(CHAMELEON_CODELETS_HAVE_NAME)
STARPU_NAME, clargs->cl_name,
#endif
0);
/* clargs is freed by starpu. */
*clargs_ptr = NULL;
}
}
void
INSERT_TASK_zgetrf_panel_offdiag_batched_flush( const RUNTIME_option_t *options,
CHAM_desc_t *A, int An,
void **clargs_ptr,
CHAM_ipiv_t *ipiv )
{
void (*callback)(void*) = NULL;
struct cl_getrf_batched_args_t *clargs = *clargs_ptr;
int rankA = A->myrank;
#if !defined ( HAVE_STARPU_NONE_NONZERO )
/* STARPU_NONE can't be equal to 0 */
fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" );
assert( 0 );
#endif
if ( clargs == NULL ) {
return;
}
int access_npiv = ( clargs->h == ipiv->n ) ? STARPU_R : STARPU_REDUX;
int access_ppiv = ( clargs->h == 0 ) ? STARPU_NONE : STARPU_R;
rt_starpu_insert_task(
&cl_zgetrf_panel_offdiag_batched,
/* Task codelet arguments */
STARPU_CL_ARGS, clargs, sizeof(struct cl_getrf_batched_args_t),
STARPU_DATA_MODE_ARRAY, clargs->handle_mode, clargs->tasks_nbr,
access_npiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h ),
access_ppiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h-1 ),
STARPU_PRIORITY, options->priority,
STARPU_CALLBACK, callback,
STARPU_EXECUTE_ON_WORKER, options->workerid,
#if defined(CHAMELEON_CODELETS_HAVE_NAME)
STARPU_NAME, clargs->cl_name,
#endif
0);
/* clargs is freed by starpu. */
*clargs_ptr = NULL;
}
#if !defined(CHAMELEON_SIMULATION)
static void
cl_zgetrf_panel_blocked_batched_cpu_func( void *descr[],
void *cl_arg )
{
struct cl_getrf_batched_args_t *clargs = ( struct cl_getrf_batched_args_t * ) cl_arg;
int *ipiv;
cppi_interface_t *nextpiv = (cppi_interface_t*) descr[clargs->tasks_nbr ];
cppi_interface_t *prevpiv = (cppi_interface_t*) descr[clargs->tasks_nbr + 1];
int i, h, ib;
CHAM_tile_t *tileA, *tileU;
CHAMELEON_Complex64_t *U = NULL;
int ldu = -1;
nextpiv->h = clargs->h;
nextpiv->has_diag = chameleon_max( -1, nextpiv->has_diag);
h = clargs->h;
ib = clargs->ib;
i = 0;
if ( clargs->diag ) {
if ( h == 0 ) {
ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[clargs->tasks_nbr + 1]);
}
else {
ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[clargs->tasks_nbr + 2]);
}
if ( h != 0 ) {
tileU = cti_interface_get( descr[ clargs->tasks_nbr + 3 ] );
U = CHAM_tile_get_ptr( tileU );
ldu = tileU->ld;
}
tileA = cti_interface_get( descr[ 0 ] );
nextpiv->has_diag = 1;
CORE_zgetrf_panel_diag( clargs->m[i], clargs->n[i], h, clargs->m0[i], ib,
CHAM_tile_get_ptr( tileA ), tileA->ld,
U, ldu,
ipiv, &(nextpiv->pivot), &(prevpiv->pivot) );
i++;
}
if ( ( h%ib == 0 ) && ( h > 0 ) ) {
tileU = cti_interface_get( descr[ clargs->tasks_nbr + 2 + clargs->diag ] );
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
U = CHAM_tile_get_ptr( tileU );
ldu = tileU->ld;
}
else {
U = NULL;
ldu = -1;
}
for ( ; i < clargs->tasks_nbr; i++ ) {
tileA = cti_interface_get( descr[ i ] );
CORE_zgetrf_panel_offdiag( clargs->m[i], clargs->n[i], h, clargs->m0[i], ib,
CHAM_tile_get_ptr(tileA), tileA->ld,
U, ldu,
&( nextpiv->pivot ), &( prevpiv->pivot ) );
}
}
#endif /* !defined(CHAMELEON_SIMULATION) */
CODELETS_CPU( zgetrf_panel_blocked_batched, cl_zgetrf_panel_blocked_batched_cpu_func )
void
INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options,
int m, int n, int h, int m0,
void *ws,
CHAM_desc_t *A, int Am, int An,
CHAM_desc_t *U, int Um, int Un,
void **clargs_ptr,
CHAM_ipiv_t *ipiv )
{
int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size;
int ib = ((struct chameleon_pzgetrf_s *)ws)->ib;
int task_num = 0;
void (*callback)(void*) = NULL;
int accessU, access_npiv, access_ipiv, access_ppiv;
struct cl_getrf_batched_args_t *clargs = *clargs_ptr;
int rankA = A->get_rankof(A, Am, An);
#if !defined ( HAVE_STARPU_NONE_NONZERO )
/* STARPU_NONE can't be equal to 0 */
fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" );
assert( 0 );
#endif
#if defined ( CHAMELEON_USE_MPI )
if ( ( Am == An ) && ( h % ib == 0 ) && ( h > 0 ) ) {
starpu_mpi_cache_flush( options->sequence->comm,
RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un) );
}
if ( rankA != A->myrank ) {
if ( ( h % ib == 0 ) && ( h > 0 ) && ( A->myrank == A->get_rankof( A, An, An ) ) ) {
starpu_mpi_get_data_on_node_detached( options->sequence->comm,
RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
rankA, NULL, NULL );
}
return;
}
#endif
/* Handle cache */
CHAMELEON_BEGIN_ACCESS_DECLARATION;
CHAMELEON_ACCESS_RW(A, Am, An);
CHAMELEON_END_ACCESS_DECLARATION;
if ( clargs == NULL ) {
clargs = malloc( sizeof( struct cl_getrf_batched_args_t ) );
memset( clargs, 0, sizeof( struct cl_getrf_batched_args_t ) );
clargs->tasks_nbr = 0;
clargs->diag = ( Am == An );
clargs->ib = ib;
clargs->h = h;
clargs->cl_name = "zgetrf_panel_blocked_batched";
*clargs_ptr = clargs;
}
task_num = clargs->tasks_nbr;
clargs->m[ task_num ] = m;
clargs->n[ task_num ] = n;
clargs->m0[ task_num ] = m0;
clargs->handle_mode[ task_num ].handle = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
clargs->handle_mode[ task_num ].mode = STARPU_RW;
clargs->tasks_nbr ++;
/* Refine name */
clargs->cl_name = chameleon_codelet_name( clargs->cl_name, 1,
A->get_blktile( A, Am, An ) );
if ( clargs->tasks_nbr == batch_size ) {
access_npiv = ( clargs->h == ipiv->n ) ? STARPU_R : STARPU_REDUX;
access_ipiv = STARPU_RW;
access_ppiv = STARPU_R;
accessU = STARPU_RW;
if ( clargs->h == 0 ) {
access_ipiv = STARPU_W;
access_ppiv = STARPU_NONE;
accessU = STARPU_NONE;
}
else if ( clargs->h % clargs->ib == 0 ) {
accessU = STARPU_R;
}
else if ( clargs->h % clargs->ib == 1 ) {
accessU = STARPU_W;
}
/* If there isn't a diag task then use offdiag access */
if ( clargs->diag == 0 ) {
accessU = ((h%ib == 0) && (h > 0)) ? STARPU_R : STARPU_NONE;
access_ipiv = STARPU_NONE;
}
rt_starpu_insert_task(
&cl_zgetrf_panel_blocked_batched,
/* Task codelet arguments */
STARPU_CL_ARGS, clargs, sizeof(struct cl_getrf_batched_args_t),
STARPU_DATA_MODE_ARRAY, clargs->handle_mode, clargs->tasks_nbr,
access_npiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ),
access_ppiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ),
access_ipiv, RUNTIME_ipiv_getaddr( ipiv, An ),
accessU, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un ),
STARPU_PRIORITY, options->priority,
STARPU_CALLBACK, callback,
STARPU_EXECUTE_ON_WORKER, options->workerid,
#if defined(CHAMELEON_CODELETS_HAVE_NAME)
STARPU_NAME, clargs->cl_name,
#endif
0);
/* clargs is freed by starpu. */
*clargs_ptr = NULL;
}
}
void
INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options,
CHAM_desc_t *A, int An,
CHAM_desc_t *U, int Um, int Un,
void **clargs_ptr,
CHAM_ipiv_t *ipiv )
{
int accessU, access_npiv, access_ipiv, access_ppiv;
void (*callback)(void*) = NULL;
struct cl_getrf_batched_args_t *clargs = *clargs_ptr;
int rankA = A->myrank;
#if !defined ( HAVE_STARPU_NONE_NONZERO )
/* STARPU_NONE can't be equal to 0 */
fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" );
assert( 0 );
#endif
if ( clargs == NULL ) {
return;
}
access_npiv = ( clargs->h == ipiv->n ) ? STARPU_R : STARPU_REDUX;
access_ipiv = STARPU_RW;
access_ppiv = STARPU_R;
accessU = STARPU_RW;
if ( clargs->h == 0 ) {
access_ipiv = STARPU_W;
access_ppiv = STARPU_NONE;
accessU = STARPU_NONE;
}
else if ( clargs->h % clargs->ib == 0 ) {
accessU = STARPU_R;
}
else if ( clargs->h % clargs->ib == 1 ) {
accessU = STARPU_W;
}
/* If there isn't a diag task then use offdiag access */
if ( clargs->diag == 0 ) {
accessU = ((clargs->h%clargs->ib == 0) && (clargs->h > 0)) ? STARPU_R : STARPU_NONE;
access_ipiv = STARPU_NONE;
}
rt_starpu_insert_task(
&cl_zgetrf_panel_blocked_batched,
/* Task codelet arguments */
STARPU_CL_ARGS, clargs, sizeof(struct cl_getrf_batched_args_t),
STARPU_DATA_MODE_ARRAY, clargs->handle_mode, clargs->tasks_nbr,
access_npiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h ),
access_ppiv, RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h - 1 ),
access_ipiv, RUNTIME_ipiv_getaddr( ipiv, An ),
accessU, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un ),
STARPU_PRIORITY, options->priority,
STARPU_CALLBACK, callback,
STARPU_EXECUTE_ON_WORKER, options->workerid,
#if defined(CHAMELEON_CODELETS_HAVE_NAME)
STARPU_NAME, clargs->cl_name,
#endif
0);
/* clargs is freed by starpu. */
*clargs_ptr = NULL;
}