/**
 *
 * @file starpu/codelet_zlacpy.c
 *
 * @copyright 2009-2014 The University of Tennessee and The University of
 *                      Tennessee Research Foundation. All rights reserved.
 * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
 *                      Univ. Bordeaux. All rights reserved.
 *
 ***
 *
 * @brief Chameleon zlacpy StarPU codelet
 *
 * @version 1.3.0
 * @author Julien Langou
 * @author Henricus Bouwmeester
 * @author Mathieu Faverge
 * @author Emmanuel Agullo
 * @author Cedric Castagnede
 * @author Lucas Barros de Assis
 * @author Florent Pruvost
 * @author Samuel Thibault
 * @author Alycia Lisito
 * @date 2024-10-17
 * @precisions normal z -> c d s
 *
 */
#include "chameleon_starpu_internal.h"
#include "runtime_codelet_z.h"

/**
 * @brief Argument bundle shared by the zlacpy and zlacpyx codelets.
 *
 * Filled by INSERT_TASK_zlacpy() / INSERT_TASK_zlacpyx() and read back by the
 * codelet functions of this file through their cl_arg pointer.
 */
struct cl_zlacpy_args_s {
    cham_uplo_t uplo; /* Forwarded as first argument to TCORE_zlacpy() / TCORE_zlacpyx() */
    int m;            /* Forwarded to the kernel; presumably the number of rows to copy -- confirm */
    int n;            /* Forwarded to the kernel; presumably the number of columns to copy -- confirm */
    int displA;       /* Displacement applied on the A side by TCORE_zlacpyx(); asserted to be 0 by the zlacpy codelets */
    int displB;       /* Displacement applied on the B side by TCORE_zlacpyx(); asserted to be 0 by the zlacpy codelets */
    int lda;          /* Passed with tile A to TCORE_zlacpyx(); not read by the zlacpy codelets */
    int ldb;          /* Passed with tile B to TCORE_zlacpyx(); not read by the zlacpy codelets */
};

#if !defined(CHAMELEON_SIMULATION)
/**
 * @brief Codelet function performing a whole-tile copy through the
 *        any_to_any copy method of the Chameleon tile interface.
 *
 * Both ends of the copy use the memory node of the calling worker.
 *
 * @param[in,out] descr
 *          descr[0] is handed over as the source interface and descr[1] as
 *          the destination interface.
 *
 * @param[in] cl_arg
 *          Pointer to a struct cl_zlacpy_args_s. It is only inspected by
 *          assert() to check that both displacements are null.
 */
static void cl_zlacpy_starpu_func(void *descr[], void *cl_arg)
{
    const struct starpu_data_copy_methods *methods = starpu_interface_cham_tile_ops.copy_methods;
    struct cl_zlacpy_args_s               *args    = cl_arg;
    unsigned                               node;
    int                                    status;

    /* args is referenced by assert() only: avoid a warning when NDEBUG is set */
    (void)args;

    /* Memory node of the worker running this task */
    node = starpu_worker_get_memory_node( starpu_worker_get_id_check() );

    assert( args->displA == 0 );
    assert( args->displB == 0 );

    /* Source and destination both live on the worker memory node */
    status = methods->any_to_any( descr[0], node, descr[1], node, NULL );
    assert( status == 0 );

    /* status is referenced by assert() only: avoid a warning when NDEBUG is set */
    (void)status;
}

static void
cl_zlacpy_cpu_func(void *descr[], void *cl_arg)
{
    struct cl_zlacpy_args_s *clargs = (struct cl_zlacpy_args_s *)cl_arg;
    CHAM_tile_t *tileA;
    CHAM_tile_t *tileB;

    tileA = cti_interface_get(descr[0]);
    tileB = cti_interface_get(descr[1]);

    assert( clargs->displA == 0 );
    assert( clargs->displB == 0 );

    TCORE_zlacpy( clargs->uplo, clargs->m, clargs->n, tileA, tileB );
}

static void
cl_zlacpyx_cpu_func(void *descr[], void *cl_arg)
{
    struct cl_zlacpy_args_s *clargs = (struct cl_zlacpy_args_s *)cl_arg;
    CHAM_tile_t *tileA;
    CHAM_tile_t *tileB;

    tileA = cti_interface_get(descr[0]);
    tileB = cti_interface_get(descr[1]);

    TCORE_zlacpyx( clargs->uplo, clargs->m, clargs->n, clargs->displA,
                   tileA, clargs->lda, clargs->displB, tileB, clargs->ldb );
}
#endif /* !defined(CHAMELEON_SIMULATION) */

/*
 * Codelet definitions
 *
 * The task insertion functions of this file use cl_zlacpy, cl_zlacpyx,
 * cl_zlacpy_callback and cl_zlacpyx_callback, none of which is defined
 * explicitly in this file: they are presumably generated by the CODELETS_CPU()
 * invocations below (macros defined outside of this file -- confirm).
 *
 * NOTE(review): cl_zlacpy_starpu_func is given twice to CODELETS(), presumably
 * once as the CPU and once as the CUDA implementation, the latter flagged with
 * STARPU_CUDA_ASYNC -- confirm against the macro definition.
 */
CODELETS_CPU( zlacpy,  cl_zlacpy_cpu_func  )
CODELETS_CPU( zlacpyx, cl_zlacpyx_cpu_func )
CODELETS( zlacpy_starpu, cl_zlacpy_starpu_func, cl_zlacpy_starpu_func, STARPU_CUDA_ASYNC )

/**
 * @brief Submit a whole-handle copy with starpu_data_cpy_priority().
 *
 * handleB is given as the first handle and handleA as the second one, i.e.
 * destination and source in the StarPU API. The copy is requested
 * synchronously when CHAMELEON_RUNTIME_SYNC is defined, asynchronously
 * otherwise.
 *
 * @param[in] options
 *          Runtime options; profiling selects the callback and priority is
 *          forwarded to StarPU.
 *
 * @param[in] handleA
 *          Handle passed as the source of the copy.
 *
 * @param[in] handleB
 *          Handle passed as the destination of the copy.
 */
static inline void
insert_task_zlacpy_on_local_node( const RUNTIME_option_t *options,
                                  starpu_data_handle_t handleA,
                                  starpu_data_handle_t handleB )
{
#if defined(CHAMELEON_RUNTIME_SYNC)
    const int asynchronous = 0;
#else
    const int asynchronous = 1;
#endif
    void (*profiling_cb)(void*) = NULL;

    /* The callback is only attached when profiling is enabled */
    if ( options->profiling ) {
        profiling_cb = cl_zlacpy_callback;
    }

    starpu_data_cpy_priority( handleB, handleA, asynchronous,
                              profiling_cb, NULL, options->priority );
}

#if defined(CHAMELEON_USE_MPI)
/**
 * @brief Submit a whole-handle copy with starpu_mpi_data_cpy_priority().
 *
 * handleB is given as the first handle and handleA as the second one, i.e.
 * destination and source in the StarPU-MPI API. The communicator is taken
 * from the sequence attached to the options. The copy is requested
 * synchronously when CHAMELEON_RUNTIME_SYNC is defined, asynchronously
 * otherwise.
 *
 * @param[in] options
 *          Runtime options; profiling selects the callback, sequence->comm
 *          and priority are forwarded to StarPU-MPI.
 *
 * @param[in] handleA
 *          Handle passed as the source of the copy.
 *
 * @param[in] handleB
 *          Handle passed as the destination of the copy.
 */
static inline void
insert_task_zlacpy_on_remote_node( const RUNTIME_option_t *options,
                                   starpu_data_handle_t handleA,
                                   starpu_data_handle_t handleB )
{
#if defined(CHAMELEON_RUNTIME_SYNC)
    const int asynchronous = 0;
#else
    const int asynchronous = 1;
#endif
    void (*profiling_cb)(void*) = NULL;

    /* The callback is only attached when profiling is enabled */
    if ( options->profiling ) {
        profiling_cb = cl_zlacpy_callback;
    }

    starpu_mpi_data_cpy_priority( handleB, handleA, options->sequence->comm,
                                  asynchronous, profiling_cb, NULL,
                                  options->priority );
}
#endif

/**
 * @brief Submit the copy of tile A(Am, An) into tile B(Bm, Bn), with optional
 *        displacements and explicit leading dimensions.
 *
 * Two submission paths exist:
 *   - when options->withlacpy is not set, uplo is ChamUpperLower, both tiles
 *     have exactly m rows and n columns and both displacements are null, the
 *     copy is delegated to the StarPU data copy helpers of this file (this
 *     path is compiled only without MPI, or with MPI when
 *     HAVE_STARPU_MPI_DATA_CPY_PRIORITY is defined);
 *   - otherwise a task running the cl_zlacpyx codelet is inserted.
 *
 * @param[in] options
 *          Runtime options (withlacpy, profiling, priority and workerid are
 *          read here).
 * @param[in] uplo, m, n
 *          Stored in the codelet arguments for the kernel.
 * @param[in] displA, lda
 *          Displacement and leading dimension stored for the A side.
 * @param[in] A, Am, An
 *          Descriptor and tile indices of the tile declared as read.
 * @param[in] displB, ldb
 *          Displacement and leading dimension stored for the B side.
 * @param[in] B, Bm, Bn
 *          Descriptor and tile indices of the tile declared as written.
 */
void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, int m, int n,
                          int displA, const CHAM_desc_t *A, int Am, int An, int lda,
                          int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb )
{
    int          exec    = 0;
    char        *cl_name = "zlacpyx";
    CHAM_tile_t *tileA   = A->get_blktile( A, Am, An );
    CHAM_tile_t *tileB   = B->get_blktile( B, Bm, Bn );

    /*
     * Handle cache: declare A(Am, An) as read and B(Bm, Bn) as written.
     * __chameleon_need_exec comes from the access declaration macros;
     * presumably it tells whether this process executes the task -- confirm.
     */
    CHAMELEON_BEGIN_ACCESS_DECLARATION;
    CHAMELEON_ACCESS_R( A, Am, An );
    CHAMELEON_ACCESS_W( B, Bm, Bn );
    exec = __chameleon_need_exec;
    CHAMELEON_END_ACCESS_DECLARATION;

#if !defined(CHAMELEON_USE_MPI) || defined(HAVE_STARPU_MPI_DATA_CPY_PRIORITY)
    /* Whole-tile copy without displacement: rely on the StarPU data copy */
    if ( (!options->withlacpy) &&
         (uplo == ChamUpperLower) &&
         (tileA->m == m) && (tileA->n == n) &&
         (tileB->m == m) && (tileB->n == n) &&
         (displA == 0) && (displB == 0) )
    {
#if defined(CHAMELEON_USE_MPI)
        insert_task_zlacpy_on_remote_node( options,
                                           RTBLKADDR(A, ChamComplexDouble, Am, An),
                                           RTBLKADDR(B, ChamComplexDouble, Bm, Bn) );
#else
        insert_task_zlacpy_on_local_node( options,
                                          RTBLKADDR(A, ChamComplexDouble, Am, An),
                                          RTBLKADDR(B, ChamComplexDouble, Bm, Bn) );
#endif
    }
    else
#endif
    {
        /* General case: insert a task running the zlacpyx codelet */
        struct cl_zlacpy_args_s *clargs = NULL;
        void (*callback)(void*);

        /* The arguments are only allocated when exec is set; clargs stays NULL otherwise */
        if ( exec ) {
            /* NOTE(review): the result of malloc() is dereferenced without a NULL check */
            clargs = malloc( sizeof( struct cl_zlacpy_args_s ) );
            clargs->uplo   = uplo;
            clargs->m      = m;
            clargs->n      = n;
            clargs->displA = displA;
            clargs->displB = displB;
            clargs->lda    = lda;
            clargs->ldb    = ldb;
        }

        /* Callback for profiling information */
        callback = options->profiling ? cl_zlacpyx_callback : NULL;

        /* Insert the task */
        rt_starpu_insert_task(
            &cl_zlacpyx,
            /* Task codelet arguments (clargs is never freed in this file; presumably StarPU takes ownership -- confirm) */
            STARPU_CL_ARGS, clargs, sizeof(struct cl_zlacpy_args_s),
            STARPU_R,      RTBLKADDR(A, ChamComplexDouble, Am, An),
            STARPU_W,      RTBLKADDR(B, ChamComplexDouble, Bm, Bn),

            /* Common task arguments */
            STARPU_PRIORITY,          options->priority,
            STARPU_CALLBACK,          callback,
            STARPU_EXECUTE_ON_WORKER, options->workerid,
#if defined(CHAMELEON_CODELETS_HAVE_NAME)
            STARPU_NAME,              cl_name,
#endif

            0 );
    }

    /* The tiles are unused when the data copy path is compiled out */
    (void)tileA;
    (void)tileB;
}

/**
 * @brief Submit the copy of tile A(Am, An) into tile B(Bm, Bn).
 *
 * Two submission paths exist:
 *   - when options->withlacpy is not set, uplo is ChamUpperLower and both
 *     tiles have exactly m rows and n columns, the copy is delegated to the
 *     StarPU data copy helpers of this file (this path is compiled only
 *     without MPI, or with MPI when HAVE_STARPU_MPI_DATA_CPY_PRIORITY is
 *     defined);
 *   - otherwise a task running the cl_zlacpy codelet is inserted.
 *
 * @param[in] options
 *          Runtime options (withlacpy, profiling, priority and workerid are
 *          read here).
 * @param[in] uplo, m, n
 *          Stored in the codelet arguments for the kernel.
 * @param[in] A, Am, An
 *          Descriptor and tile indices of the tile declared as read.
 * @param[in] B, Bm, Bn
 *          Descriptor and tile indices of the tile declared as written.
 */
void INSERT_TASK_zlacpy( const RUNTIME_option_t *options,
                         cham_uplo_t uplo, int m, int n,
                         const CHAM_desc_t *A, int Am, int An,
                         const CHAM_desc_t *B, int Bm, int Bn )
{
    int          exec    = 0;
    char        *cl_name = "zlacpy";
    CHAM_tile_t *tileA   = A->get_blktile( A, Am, An );
    CHAM_tile_t *tileB   = B->get_blktile( B, Bm, Bn );

        /* Handle cache: declare A(Am, An) as read and B(Bm, Bn) as written. */
        /* __chameleon_need_exec comes from the access declaration macros;   */
        /* presumably it tells whether this process executes the task -- confirm. */
    CHAMELEON_BEGIN_ACCESS_DECLARATION;
    CHAMELEON_ACCESS_R(A, Am, An);
    CHAMELEON_ACCESS_W(B, Bm, Bn);
    exec = __chameleon_need_exec;
    CHAMELEON_END_ACCESS_DECLARATION;

#if !defined(CHAMELEON_USE_MPI) || defined(HAVE_STARPU_MPI_DATA_CPY_PRIORITY)
    /* Whole-tile copy: rely on the StarPU data copy */
    if ( (!options->withlacpy) &&
         (uplo == ChamUpperLower) &&
         (tileA->m == m) && (tileA->n == n) &&
         (tileB->m == m) && (tileB->n == n) )
    {
#if defined(CHAMELEON_USE_MPI)
        insert_task_zlacpy_on_remote_node( options,
                                           RTBLKADDR(A, ChamComplexDouble, Am, An),
                                           RTBLKADDR(B, ChamComplexDouble, Bm, Bn) );
#else
        insert_task_zlacpy_on_local_node( options,
                                          RTBLKADDR(A, ChamComplexDouble, Am, An),
                                          RTBLKADDR(B, ChamComplexDouble, Bm, Bn) );
#endif
    }
    else
#endif
    {
        /* General case: insert a task running the zlacpy codelet */
        struct cl_zlacpy_args_s *clargs = NULL;
        void (*callback)(void*);

        /* The arguments are only allocated when exec is set; clargs stays NULL otherwise */
        if ( exec ) {
            /* NOTE(review): the result of malloc() is dereferenced without a NULL check */
            clargs = malloc( sizeof( struct cl_zlacpy_args_s ) );
            clargs->uplo   = uplo;
            clargs->m      = m;
            clargs->n      = n;
            /* The zlacpy codelets assert that both displacements are null */
            clargs->displA = 0;
            clargs->displB = 0;
            /* Stored for completeness: cl_zlacpy_cpu_func() does not read lda/ldb */
            clargs->lda    = tileA->ld;
            clargs->ldb    = tileB->ld;
        }

        /* Callback for profiling information */
        callback = options->profiling ? cl_zlacpy_callback : NULL;

        /* Insert the task */
        rt_starpu_insert_task(
            &cl_zlacpy,
            /* Task codelet arguments (clargs is never freed in this file; presumably StarPU takes ownership -- confirm) */
            STARPU_CL_ARGS, clargs, sizeof(struct cl_zlacpy_args_s),
            STARPU_R,      RTBLKADDR(A, ChamComplexDouble, Am, An),
            STARPU_W,      RTBLKADDR(B, ChamComplexDouble, Bm, Bn),

            /* Common task arguments */
            STARPU_PRIORITY,          options->priority,
            STARPU_CALLBACK,          callback,
            STARPU_EXECUTE_ON_WORKER, options->workerid,
#if defined(CHAMELEON_CODELETS_HAVE_NAME)
            STARPU_NAME,              cl_name,
#endif

            0 );
    }
}