diff --git a/cmake_modules/local_subs.py b/cmake_modules/local_subs.py index a4cb1fc264233227fa012b5ce8f38907bdfb0a53..d6908a87c2081353336d2615bbc040a0cddff030 100644 --- a/cmake_modules/local_subs.py +++ b/cmake_modules/local_subs.py @@ -20,6 +20,7 @@ _extra_blas = [ ('', 'slaran', 'dlaran', 'slaran', 'dlaran' ), ('', 'slatm1', 'dlatm1', 'clatm1', 'zlatm1' ), ('', 'slatm1', 'dlatm1', 'slatm1', 'dlatm1' ), + ('', 'slag2c_fake', 'dlag2z_fake', 'slag2c', 'dlag2z' ), ] _extra_BLAS = [ [ x.upper() for x in row ] for row in _extra_blas ] diff --git a/coreblas/compute/CMakeLists.txt b/coreblas/compute/CMakeLists.txt index b774d7dad0cef277d2d6029963a7974c61089dda..dd70cd7c7cffd468d94b575ae166efe08903693b 100644 --- a/coreblas/compute/CMakeLists.txt +++ b/coreblas/compute/CMakeLists.txt @@ -30,6 +30,7 @@ set(COREBLAS_SRCS_GENERATED "") set(ZSRC + core_dlag2z.c core_dzasum.c core_zaxpy.c core_zgeadd.c diff --git a/coreblas/compute/core_dlag2z.c b/coreblas/compute/core_dlag2z.c new file mode 100644 index 0000000000000000000000000000000000000000..498a73c5fbb6f23cca1b74ebe6f3fc8604238e3d --- /dev/null +++ b/coreblas/compute/core_dlag2z.c @@ -0,0 +1,98 @@ +/** + * + * @file core_dlag2z.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon core_dlag2z CPU kernel + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2020-10-05 + * @precisions normal z -> c + * + */ +#include "coreblas.h" + +/** + * + * @ingroup CORE_CHAMELEON_Complex64_t + * + * @brief Converts a m-by-n matrix A from double real to double complex. + * + ******************************************************************************* + * @param[in] uplo Part of A to convert: ChamUpperLower (whole matrix), ChamUpper or ChamLower (triangle including the diagonal). + * @param[in] M + * The number of rows of the matrix A. + * m >= 0. + * + * @param[in] N + * The number of columns of the matrix A. + * n >= 0.
+ * + * @param[in] A + * The lda-by-n matrix in double real precision to convert. + * + * @param[in] LDA + * The leading dimension of the matrix A. + * lda >= max(1,m). + * + * @param[out] B + * On exit, the converted LDB-by-n matrix in double complex precision. + * + * @param[in] LDB + * The leading dimension of the matrix B. + * ldb >= max(1,m). + * + */ +void +CORE_dlag2z( cham_uplo_t uplo, int M, int N, + const double *A, int lda, + CHAMELEON_Complex64_t *B, int ldb ) +{ + const double *Aptr; + CHAMELEON_Complex64_t *Bptr; + int i, j; + + if ( (uplo != ChamUpperLower) && + (uplo != ChamUpper) && + (uplo != ChamLower)) + { + coreblas_error(1, "illegal value of uplo"); + return; + } + + if (M < 0) { + coreblas_error(2, "Illegal value of m"); + return; + } + if (N < 0) { + coreblas_error(3, "Illegal value of n"); + return; + } + if ( (lda < chameleon_max(1,M)) && (M > 0) ) { + coreblas_error(5, "Illegal value of lda"); + return; + } + if ( (ldb < chameleon_max(1,M)) && (M > 0) ) { + coreblas_error(7, "Illegal value of ldb"); + return; + } + + for(j=0; j<N; j++) { + int mmin = ( uplo == ChamLower ) ? j : 0; /* lower part: column j starts at its diagonal entry */ + int mmax = ( uplo == ChamUpper ) ?
chameleon_min(j+1, M) : M; /* upper part: column j ends with its diagonal entry, capped at M rows */ + + Aptr = A + lda * j + mmin; + Bptr = B + ldb * j + mmin; + + for(i=mmin; i<mmax; i++, Aptr++, Bptr++) { + *Bptr = (CHAMELEON_Complex64_t)(*Aptr); + } + } +} diff --git a/coreblas/compute/core_ztile.c b/coreblas/compute/core_ztile.c index 173855a3b5bd858e80c0aaec4c823a692f2e3345..377c47253aa1a162acbf32f58185aa2ba7ec1167 100644 --- a/coreblas/compute/core_ztile.c +++ b/coreblas/compute/core_ztile.c @@ -18,6 +18,18 @@ #include "coreblas.h" #include "coreblas/coreblas_ztile.h" +#if defined( PRECISION_z ) || defined( PRECISION_c ) +void +TCORE_dlag2z( cham_uplo_t uplo, int M, int N, + const CHAM_tile_t *A, + CHAM_tile_t *B ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( B->format & CHAMELEON_TILE_FULLRANK ); + CORE_dlag2z( uplo, M, N, A->mat, A->ld, B->mat, B->ld ); /* hand the tiles' mat/ld fields to the array kernel */ +} +#endif + void TCORE_dzasum( cham_store_t storev, cham_uplo_t uplo, diff --git a/coreblas/include/coreblas/coreblas_z.h b/coreblas/include/coreblas/coreblas_z.h index 111e0b544259f045ddd7785379d7f8127ae483f8..806337af2bd47532a6ffe8fb89afc656dab6acae 100644 --- a/coreblas/include/coreblas/coreblas_z.h +++ b/coreblas/include/coreblas/coreblas_z.h @@ -30,6 +30,9 @@ /** * Declarations of serial kernels - alphabetical order */ +void CORE_dlag2z( cham_uplo_t uplo, int M, int N, + const double *A, int lda, + CHAMELEON_Complex64_t *B, int ldb ); void CORE_dzasum(cham_store_t storev, cham_uplo_t uplo, int M, int N, const CHAMELEON_Complex64_t *A, int lda, double *work); int CORE_zaxpy(int M, CHAMELEON_Complex64_t alpha, diff --git a/coreblas/include/coreblas/coreblas_ztile.h b/coreblas/include/coreblas/coreblas_ztile.h index b5de9f7667d56514c583919f94f52e6a2798ff3a..ccb6b83b2617c0486f914980516d437dd1747473 100644 --- a/coreblas/include/coreblas/coreblas_ztile.h +++ b/coreblas/include/coreblas/coreblas_ztile.h @@ -16,6 +16,9 @@ #ifndef _coreblas_ztile_h_ #define _coreblas_ztile_h_ +#if defined(PRECISION_z) || defined(PRECISION_c) +void TCORE_dlag2z( cham_uplo_t uplo,
int M, int N, const CHAM_tile_t *A, CHAM_tile_t *B ); +#endif void TCORE_dzasum( cham_store_t storev, cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, double *work ); int TCORE_zaxpy( int M, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, int incA, CHAM_tile_t *B, int incB ); int TCORE_zgeadd( cham_trans_t trans, int M, int N, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, CHAMELEON_Complex64_t beta, CHAM_tile_t *B ); @@ -28,18 +31,18 @@ int TCORE_zgetrf( int M, int N, CHAM_tile_t *A, int *IPIV, int *INFO ); int TCORE_zgetrf_incpiv( int M, int N, int IB, CHAM_tile_t *A, int *IPIV, int *INFO ); int TCORE_zgetrf_nopiv( int M, int N, int IB, CHAM_tile_t *A, int *INFO ); void TCORE_zhe2ge( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *B ); -#if defined(PRECISION_z ) || defined(PRECISION_c) +#if defined(PRECISION_z) || defined(PRECISION_c) void TCORE_zhemm( cham_side_t side, cham_uplo_t uplo, int M, int N, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, const CHAM_tile_t *B, CHAMELEON_Complex64_t beta, CHAM_tile_t *C ); void TCORE_zherk( cham_uplo_t uplo, cham_trans_t trans, int N, int K, double alpha, const CHAM_tile_t *A, double beta, CHAM_tile_t *C ); void TCORE_zher2k( cham_uplo_t uplo, cham_trans_t trans, int N, int K, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, const CHAM_tile_t *B, double beta, CHAM_tile_t *C ); #endif int TCORE_zherfb( cham_uplo_t uplo, int N, int K, int IB, int NB, const CHAM_tile_t *A, const CHAM_tile_t *T, CHAM_tile_t *C, CHAMELEON_Complex64_t *WORK, int ldwork ); -#if defined(PRECISION_z ) || defined(PRECISION_c) +#if defined(PRECISION_z) || defined(PRECISION_c) int TCORE_zhessq( cham_store_t storev, cham_uplo_t uplo, int N, const CHAM_tile_t *A, CHAM_tile_t *sclssq ); #endif void TCORE_zlacpy( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *B ); void TCORE_zlange( cham_normtype_t norm, int M, int N, const CHAM_tile_t *A, double *work, double *normA ); -#if defined(PRECISION_z ) 
|| defined(PRECISION_c) +#if defined(PRECISION_z) || defined(PRECISION_c) void TCORE_zlanhe( cham_normtype_t norm, cham_uplo_t uplo, int N, const CHAM_tile_t *A, double *work, double *normA ); #endif void TCORE_zlansy( cham_normtype_t norm, cham_uplo_t uplo, int N, const CHAM_tile_t *A, double *work, double *normA ); @@ -49,7 +52,7 @@ void TCORE_zlaset( cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha void TCORE_zlaset2( cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, CHAM_tile_t *A ); int TCORE_zlatro( cham_uplo_t uplo, cham_trans_t trans, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *B ); void TCORE_zlauum( cham_uplo_t uplo, int N, CHAM_tile_t *A ); -#if defined(PRECISION_z ) || defined(PRECISION_c) +#if defined(PRECISION_z) || defined(PRECISION_c) void TCORE_zplghe( double bump, int m, int n, CHAM_tile_t *tileA, int bigM, int m0, int n0, unsigned long long int seed ); #endif void TCORE_zplgsy( CHAMELEON_Complex64_t bump, int m, int n, CHAM_tile_t *tileA, int bigM, int m0, int n0, unsigned long long int seed ); diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index 6df71da4c29ef3c015f39feeace6e403738b064b..3cb7c5bc289978d2816ad1b374e8cad3b984663b 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -30,6 +30,10 @@ /** * Declarations of QUARK wrappers (called by CHAMELEON) - alphabetical order */ +void INSERT_TASK_dlag2z( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_dzasum( const RUNTIME_option_t *options, cham_store_t storev, cham_uplo_t uplo, int M, int N, const CHAM_desc_t *A, int Am, int An, diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index 71724813e89887a857b5d54aa5f8c82146097361..76a0bce724412de90b0e0e1c094782ebebb518b3 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -28,6 +28,7 @@ # List of codelets required by all runtimes # 
----------------------------------------- set(CODELETS_ZSRC + codelets/codelet_dlag2z.c codelets/codelet_dzasum.c ################## # BLAS 1 diff --git a/runtime/openmp/codelets/codelet_dlag2z.c b/runtime/openmp/codelets/codelet_dlag2z.c new file mode 100644 index 0000000000000000000000000000000000000000..077f541fef83a22f7e4d1b163002475df254db3a --- /dev/null +++ b/runtime/openmp/codelets/codelet_dlag2z.c @@ -0,0 +1,34 @@ +/** + * + * @file openmp/codelet_dlag2z.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon dlag2z OpenMP codelet + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2020-10-05 + * @precisions normal z -> c + * + */ +#include "chameleon_openmp.h" +#include "chameleon/tasks_z.h" +#include "coreblas/coreblas_ztile.h" + +void INSERT_TASK_dlag2z( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); /* input tile, listed in depend( in ) below */ + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); /* output tile, listed in depend( out ) below */ + +#pragma omp task firstprivate( uplo, m, n, tileA, tileB ) depend( in:tileA[0] ) depend( out:tileB[0] ) + TCORE_dlag2z( uplo, m, n, tileA, tileB ); +} diff --git a/runtime/parsec/codelets/codelet_dlag2z.c b/runtime/parsec/codelets/codelet_dlag2z.c new file mode 100644 index 0000000000000000000000000000000000000000..60868494412940b0d2397daa7c544756c941e4bc --- /dev/null +++ b/runtime/parsec/codelets/codelet_dlag2z.c @@ -0,0 +1,62 @@ +/** + * + * @file parsec/codelet_dlag2z.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved.
+ * + *** + * + * @brief Chameleon dlag2z PaRSEC codelet + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2020-10-05 + * @precisions normal z -> c + * + */ +#include "chameleon_parsec.h" +#include "chameleon/tasks_z.h" +#include "coreblas/coreblas_z.h" + +static inline int +CORE_dlag2z_parsec( parsec_execution_stream_t *context, + parsec_task_t *this_task ) +{ + cham_uplo_t uplo; + int m; + int n; + double *A; + int lda; + CHAMELEON_Complex64_t *B; + int ldb; + + parsec_dtd_unpack_args( this_task, &uplo, &m, &n, &A, &lda, &B, &ldb ); /* same order as the arguments listed in INSERT_TASK_dlag2z */ + CORE_dlag2z( uplo, m, n, A, lda, B, ldb ); + + (void)context; /* unused */ + return PARSEC_HOOK_RETURN_DONE; +} + +void INSERT_TASK_dlag2z( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + + parsec_dtd_taskpool_insert_task( + PARSEC_dtd_taskpool, CORE_dlag2z_parsec, options->priority, "dlag2z", + sizeof(cham_uplo_t), &uplo, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + PASSED_BY_REF, RTBLKADDR( A, double, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, + sizeof(int), &(tileA->ld), VALUE, /* unpacked as lda */ + PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | OUTPUT, + sizeof(int), &(tileB->ld), VALUE, /* unpacked as ldb */ + PARSEC_DTD_ARG_END ); +} diff --git a/runtime/quark/codelets/codelet_dlag2z.c b/runtime/quark/codelets/codelet_dlag2z.c new file mode 100644 index 0000000000000000000000000000000000000000..86e214347b028782f7aee4cda9dc426a78fe1b3c --- /dev/null +++ b/runtime/quark/codelets/codelet_dlag2z.c @@ -0,0 +1,53 @@ +/** + * + * @file quark/codelet_dlag2z.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon dlag2z Quark codelet + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2020-10-05 + * @precisions normal z -> c + * + */ +#include "chameleon_quark.h" +#include "chameleon/tasks_z.h" +#include "coreblas/coreblas_ztile.h" + +static inline void +CORE_dlag2z_quark( Quark *quark ) +{ + cham_uplo_t uplo; + int m; + int n; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + + quark_unpack_args_5( quark, uplo, m, n, tileA, tileB ); /* five arguments, in the order INSERT_TASK_dlag2z lists them. NOTE(review): the data arguments are unpacked as CHAM_tile_t pointers, which assumes RTBLKADDR yields tile pointers in this runtime - confirm */ + TCORE_dlag2z( uplo, m, n, tileA, tileB ); +} + +void INSERT_TASK_dlag2z( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + quark_option_t *opt = (quark_option_t*)(options->schedopt); + + //DAG_CORE_DLAG2Z; + QUARK_Insert_Task( + opt->quark, CORE_dlag2z_quark, (Quark_Task_Flags*)opt, + sizeof(cham_uplo_t), &uplo, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + sizeof(void*), RTBLKADDR(A, double, Am, An), INPUT, + sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), OUTPUT, + 0); +} diff --git a/runtime/starpu/codelets/codelet_dlag2z.c b/runtime/starpu/codelets/codelet_dlag2z.c new file mode 100644 index 0000000000000000000000000000000000000000..b7953cd977431f2b0599d5470ce4fbd6d7834ccb --- /dev/null +++ b/runtime/starpu/codelets/codelet_dlag2z.c @@ -0,0 +1,79 @@ +/** + * + * @file starpu/codelet_dlag2z.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved.
+ * + *** + * + * @brief Chameleon dlag2z StarPU codelet + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2020-10-05 + * @precisions normal z -> c + * + */ +#include "chameleon_starpu.h" +#include "runtime_codelet_z.h" + +#if !defined(CHAMELEON_SIMULATION) +static void cl_dlag2z_cpu_func(void *descr[], void *cl_arg) +{ + cham_uplo_t uplo; + int m; + int n; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + + tileA = cti_interface_get(descr[0]); /* first data handle given to starpu_insert_task: A (STARPU_R) */ + tileB = cti_interface_get(descr[1]); /* second data handle given to starpu_insert_task: B (STARPU_W) */ + + starpu_codelet_unpack_args(cl_arg, &uplo, &m, &n); /* the three STARPU_VALUE arguments, in submission order */ + TCORE_dlag2z( uplo, m, n, tileA, tileB ); +} +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +CODELETS_CPU(dlag2z, cl_dlag2z_cpu_func) + +/** + * + * @ingroup INSERT_TASK_Complex64_t + * @brief Submits the dlag2z task that reads tile (Am, An) of A and writes tile (Bm, Bn) of B. + */ +void INSERT_TASK_dlag2z( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + struct starpu_codelet *codelet = &cl_dlag2z; + void (*callback)(void*) = options->profiling ? cl_dlag2z_callback : NULL; + starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); + int workerid = (schedopt == NULL) ?
-1 : schedopt->workerid; /* -1 when the request carries no scheduling option */ + + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_R(A, Am, An); + CHAMELEON_ACCESS_W(B, Bm, Bn); + CHAMELEON_END_ACCESS_DECLARATION; + + starpu_insert_task( + starpu_mpi_codelet(codelet), + STARPU_VALUE, &uplo, sizeof(uplo), + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_R, RTBLKADDR(A, double, Am, An), + STARPU_W, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "dlag2z", +#endif + 0); +} diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c index f810ab0f07d92c320459072022496d68906f1c30..ef24f40dc6e12a38badbfd26524a1adf9c3fda69 100644 --- a/runtime/starpu/codelets/codelet_zcallback.c +++ b/runtime/starpu/codelets/codelet_zcallback.c @@ -22,6 +22,9 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" +#if defined(PRECISION_z) || defined(PRECISION_c) +CHAMELEON_CL_CB(dlag2z, cti_handle_get_m(task->handles[1]), cti_handle_get_n(task->handles[1]), 0, M*N) +#endif CHAMELEON_CL_CB(dzasum, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) CHAMELEON_CL_CB(zaxpy, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[1]), 0, M) CHAMELEON_CL_CB(zgeadd, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h index 25a14085fb7579fd40f8bb1800b3029d98cf548d..95bc484a4f475027424c2429b10dc3c5eba2a1fd 100644 --- a/runtime/starpu/include/runtime_codelet_z.h +++ b/runtime/starpu/include/runtime_codelet_z.h @@ -106,6 +106,9 @@ CODELETS_HEADER(zlag2c); /* * DZ functions */ +#if defined(PRECISION_z) || defined(PRECISION_c) +CODELETS_HEADER(dlag2z); +#endif CODELETS_HEADER(dzasum); /*