From 9691baacb1972b7444aef4ff533ce2626ea6d41a Mon Sep 17 00:00:00 2001 From: PRUVOST Florent Date: Fri, 10 May 2019 18:07:29 +0200 Subject: [PATCH] update ge/he/syssq algos to make them more generic and get the possibility to accumulate squares over columns or rows --- cmake_modules/local_subs.py | 1 + compute/CMakeLists.txt | 5 + compute/pzgram.c | 209 ++++++++++++++++ compute/zgram.c | 236 ++++++++++++++++++ control/compute_z.h | 5 +- coreblas/compute/CMakeLists.txt | 1 + coreblas/compute/core_zgessq.c | 18 ++ coreblas/compute/core_zgram.c | 141 +++++++++++ coreblas/compute/core_zplssq.c | 30 ++- coreblas/compute/core_zsyssq.c | 14 ++ coreblas/include/coreblas/coreblas_z.h | 9 + coreblas/include/coreblas/sumsq_update.h | 6 +- include/chameleon/chameleon_z.h | 7 + include/chameleon/tasks_z.h | 11 + runtime/CMakeLists.txt | 4 + runtime/openmp/codelets/codelet_zgram.c | 43 ++++ runtime/openmp/codelets/codelet_zplssq.c | 2 +- runtime/parsec/codelets/codelet_zgessq.c | 2 +- runtime/parsec/codelets/codelet_zgram.c | 131 ++++++++++ runtime/parsec/codelets/codelet_zplssq.c | 6 +- runtime/parsec/codelets/codelet_zsyssq.c | 2 +- runtime/quark/codelets/codelet_zgessq.c | 15 +- runtime/quark/codelets/codelet_zgram.c | 68 +++++ runtime/quark/codelets/codelet_zplssq.c | 4 +- runtime/quark/codelets/codelet_zsyssq.c | 13 +- runtime/quark/codelets/codelet_ztrssq.c | 1 + runtime/quark/include/core_blas_dag.h | 8 + runtime/starpu/codelets/codelet_zcallback.c | 3 +- runtime/starpu/codelets/codelet_zgram.c | 92 +++++++ runtime/starpu/include/runtime_codelet_z.h | 1 + testing/CMakeLists.txt | 1 + testing/CTestLists.cmake | 8 + testing/testing_dgram.c | 261 ++++++++++++++++++++ testing/testing_zauxiliary.c | 12 +- testing/testing_zauxiliary.h | 6 +- 35 files changed, 1350 insertions(+), 26 deletions(-) create mode 100644 compute/pzgram.c create mode 100644 compute/zgram.c create mode 100644 coreblas/compute/core_zgram.c create mode 100644 runtime/openmp/codelets/codelet_zgram.c create mode 100644 runtime/parsec/codelets/codelet_zgram.c create mode 100644 runtime/quark/codelets/codelet_zgram.c create mode 100644 runtime/starpu/codelets/codelet_zgram.c create mode 100644 testing/testing_dgram.c diff --git a/cmake_modules/local_subs.py b/cmake_modules/local_subs.py index 4608e59b8..51fcb8ec2 100644 --- a/cmake_modules/local_subs.py +++ b/cmake_modules/local_subs.py @@ -14,6 +14,7 @@ _extra_blas = [ ('', 'she2ge', 'dhe2ge', 'che2ge', 'zhe2ge' ), ('', 'slatro', 'dlatro', 'clatro', 'zlatro' ), #=> Replace by getmo/gecmo as in essl ('', 'sbuild', 'dbuild', 'cbuild', 'zbuild' ), #=> Replace by map function + ('', 'sgram', 'dgram', 'cgram', 'zgram' ), ] _extra_BLAS = [ [ x.upper() for x in row ] for row in _extra_blas ] diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt index 96493134c..3ca2aaaa1 100644 --- a/compute/CMakeLists.txt +++ b/compute/CMakeLists.txt @@ -242,6 +242,11 @@ set(ZSRC ################## zbuild.c pzbuild.c + ################## + # Gram + ################## + zgram.c + pzgram.c ) precisions_rules_py(CHAMELEON_SRCS_GENERATED "${ZSRC}" diff --git a/compute/pzgram.c b/compute/pzgram.c new file mode 100644 index 000000000..c8fc0b34e --- /dev/null +++ b/compute/pzgram.c @@ -0,0 +1,209 @@ +/** + * + * @file pzgram.c + * + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgram parallel algorithm + * + * @version 0.9.2 + * @author Mathieu Faverge + * @author Florent Pruvost + * @date 2019-04-10 + * @precisions normal z -> s d c z + * + */ +#include "control/common.h" + +#define A( m, n ) A, (m), (n) +#define W( desc, m, n ) (desc), (m), (n) + +static inline void +chameleon_pzgram_internal( cham_uplo_t uplo, + CHAM_desc_t *A, + CHAM_desc_t *Wcol, + CHAM_desc_t *Welt, + RUNTIME_option_t *options ) +{ + int m, n; + int MT = A->mt; + int NT = A->nt; + int M = A->m; + int N = A->n; + int P = Welt->p; + int Q = Welt->q; + + /** + * 1) compute (scl,ssq) over columns in each tile + */ + for(n = 0; n < NT; n++) { + int mmin = ( uplo == ChamLower ) ? n : 0; + int mmax = ( uplo == ChamUpper ) ? chameleon_min(n+1, MT) : MT; + int tempnn = ( n == (NT-1) ) ? N - n * A->nb : A->nb; + + for(m = mmin; m < mmax; m++) { + int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb; + int ldam = BLKLDD( A, m ); + + if ( n == m ) { + INSERT_TASK_dsyssq( + options, ChamColumnwise, uplo, tempmm, + A(m, n), ldam, W( Wcol, m, n) ); + } + else { + INSERT_TASK_dgessq( + options, ChamColumnwise, tempmm, tempnn, + A(m, n), ldam, W( Wcol, m, n) ); + if ( uplo != ChamUpperLower ) { + INSERT_TASK_dgessq( + options, ChamRowwise, tempmm, tempnn, + A(m, n), ldam, W( Wcol, n, m) ); + } + } + } + } + + for(n = 0; n < NT; n++) { + int tempnn = ( n == (NT-1) ) ? N - n * A->nb : A->nb; + + /** + * 2) reduce columns (scl,ssq) tiles per processus (between lines) + */ + for(m = P; m < MT; m++) { + INSERT_TASK_dplssq( + options, ChamColumnwise, 1, tempnn, + W( Wcol, m, n ), + W( Wcol, m%P, n ) ); + } + + /** + * 3) reduce columns (scl,ssq) tiles on the first line of tiles + */ + for(m = 1; m < P; m++) { + INSERT_TASK_dplssq( + options, ChamColumnwise, 1, tempnn, + W( Wcol, m, n ), + W( Wcol, 0, n ) ); + } + + /* 4) reduce (scl,ssq) inside each tile of the first line of tiles for the global sum square */ + INSERT_TASK_dplssq( + options, ChamEltwise, 1, tempnn, + W( Wcol, 0, n ), + W( Welt, 0, n ) ); + + /* 5) deduce the sum square for each column from the pairs (scl,ssq) -> sqrt(sum) = scl*sqrt(ssq) */ + INSERT_TASK_dplssq2( options, tempnn, W( Wcol, 0, n ) ); + } + + /* 6) reduce global sum squares on each processus (between columns) */ + for(n = Q; n < NT; n++) { + INSERT_TASK_dplssq( options, ChamEltwise, 1, 1, W( Welt, 0, n), W( Welt, 0, n%Q) ); + } + + /* 7) reduce global sum squares on the first tile (index 0, 0) */ + for(n = 1; n < Q; n++) { + INSERT_TASK_dplssq( + options, ChamEltwise, 1, 1, W( Welt, 0, n), W( Welt, 0, 0) ); + } + + /* 8) deduce the global sum square from the pair (scl,ssq) -> sqrt(sum) = scl*sqrt(ssq) */ + INSERT_TASK_dplssq2( options, 1, W( Welt, 0, 0) ); + + /* Finally compute Gram matrix coefficients inplace */ + for(n = 0; n < NT; n++) { + int mmin = ( uplo == ChamLower ) ? n : 0; + int mmax = ( uplo == ChamUpper ) ? chameleon_min(n+1, MT) : MT; + int tempnn = ( n == (NT-1) ) ? N - n * A->nb : A->nb; + + for(m = mmin; m < mmax; m++) { + int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb; + int ldam = BLKLDD( A, m ); + + INSERT_TASK_zgram( + options, + ( m == n ) ? uplo : ChamUpperLower, + A->m, A->n, tempmm, tempnn, + W( Wcol, 0, m ), 2, + W( Wcol, 0, n ), 2, + W( Welt, 0, 0 ), + A( m, n ), ldam ); + } + } +} + +/** + * + */ +void chameleon_pzgram( cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + RUNTIME_option_t options; + CHAM_desc_t Wcol; + CHAM_desc_t Welt; + + int workmt, worknt; + int m, n, tempmm, tempnn, ldw; + + chamctxt = chameleon_context_self(); + if ( sequence->status != CHAMELEON_SUCCESS ) { + return; + } + RUNTIME_options_init(&options, chamctxt, sequence, request); + + workmt = chameleon_max( A->mt, A->p ); + worknt = chameleon_max( A->nt, A->q ); + + RUNTIME_options_ws_alloc( &options, 1, 0 ); + + chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, 2, A->nb, 2*A->nb, + 2*workmt, A->n, 0, 0, 2*workmt, A->n, A->p, A->q, + NULL, NULL, NULL ); + + chameleon_desc_init( &Welt, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, 2, 1, 2, + 2, worknt, 0, 0, 2, worknt, A->p, A->q, + NULL, NULL, NULL ); + + /* Initialize Wcol */ + for(m = 0; m < Wcol.mt; m++) { + tempmm = m == Wcol.mt-1 ? Wcol.m-m*Wcol.mb : Wcol.mb; + ldw = Wcol.get_blkldd(&Wcol, m); + for(n = 0; n < Wcol.nt; n++) { + tempnn = n == Wcol.nt-1 ? Wcol.n-n*Wcol.nb : Wcol.nb; + INSERT_TASK_dlaset( + &options, + ChamUpperLower, tempmm, tempnn, + -1., -1., + W( &Wcol, m, n ), ldw ); + } + } + /* Initialize Welt */ + for(m = 0; m < Welt.mt; m++) { + tempmm = m == Welt.mt-1 ? Welt.m-m*Welt.mb : Welt.mb; + ldw = Welt.get_blkldd(&Welt, m); + for(n = 0; n < Welt.nt; n++) { + tempnn = n == Welt.nt-1 ? Welt.n-n*Welt.nb : Welt.nb; + INSERT_TASK_dlaset( + &options, + ChamUpperLower, tempmm, tempnn, + -1., -1., + W( &Welt, m, n ), ldw ); + } + } + + chameleon_pzgram_internal( uplo, A, &Wcol, &Welt, &options ); + + CHAMELEON_Desc_Flush( &Wcol, sequence ); + CHAMELEON_Desc_Flush( &Welt, sequence ); + CHAMELEON_Desc_Flush( A, sequence ); + RUNTIME_sequence_wait( chamctxt, sequence ); + + chameleon_desc_destroy( &Wcol ); + chameleon_desc_destroy( &Welt ); + + RUNTIME_options_ws_free(&options); + RUNTIME_options_finalize(&options, chamctxt); +} diff --git a/compute/zgram.c b/compute/zgram.c new file mode 100644 index 000000000..602856a52 --- /dev/null +++ b/compute/zgram.c @@ -0,0 +1,236 @@ +/** + * + * @file zgram.c + * + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgram wrappers + * + * @version 0.9.2 + * @author Mathieu Faverge + * @author Florent Pruvost + * @date 2019-04-10 + * @precisions normal z -> s d c z + * + */ +#include "control/common.h" + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * CHAMELEON_zgram replace a general matrix by the Gram matrix inplace + * + * \f[ + * d_{i.}^2 = (1/n) \sum_j d_{ij}^2` and :math:`d_{..}^2 = (1/n^2) \sum_{i,j} d_{ij}^2 \\ + * A_{i,j} = -(1/2) (d_{ij}^2 - d_{i.}^2 - d_{.j}^2 + d_{..}^2) + * \f] + * + ******************************************************************************* + * + * @param[in] N + * The number of lines and columns of the matrix A. N >= 0. When N = 0, + * the returned value is set to zero. + * + * @param[in] A + * The N-by-N matrix A. + * + * @param[in] LDA + * The leading dimension of the array A. LDA >= max(1,N). + * + ******************************************************************************* + * +* @retval CHAMELEON_SUCCESS successful exit + * + ******************************************************************************* + * + * @sa CHAMELEON_zgram_Tile + * @sa CHAMELEON_zgram_Tile_Async + * @sa CHAMELEON_sgram + * + */ +int CHAMELEON_zgram( cham_uplo_t uplo, int N, CHAMELEON_Complex64_t *A, int LDA ) +{ + int NB; + int status; + CHAM_context_t *chamctxt; + RUNTIME_sequence_t *sequence = NULL; + RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + CHAM_desc_t descAl, descAt; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zgram", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + /* Check input arguments */ + if (N < 0) { + chameleon_error("CHAMELEON_zgram", "illegal value of N"); + return -1; + } + if (LDA < chameleon_max(1, N)) { + chameleon_error("CHAMELEON_zgram", "illegal value of LDA"); + return -3; + } + + /* Quick return */ + if (N == 0) + return CHAMELEON_SUCCESS; + + /* Tune NB depending on M, N & NRHS; Set NBNB */ + status = chameleon_tune(CHAMELEON_FUNC_DGEMM, N, N, 0); + if (status != CHAMELEON_SUCCESS) { + chameleon_error("CHAMELEON_zgram", "chameleon_tune() failed"); + return status; + } + + /* Set NT */ + NB = CHAMELEON_NB; + + chameleon_sequence_create( chamctxt, &sequence ); + + /* Submit the matrix conversion */ + chameleon_zlap2tile( chamctxt, &descAl, &descAt, ChamDescInout, uplo, + A, NB, NB, LDA, N, N, N, sequence, &request ); + + /* Call the tile interface */ + CHAMELEON_zgram_Tile_Async( uplo, &descAt, sequence, &request ); + + /* Submit the matrix conversion back */ + chameleon_ztile2lap( chamctxt, &descAl, &descAt, + ChamDescInout, uplo, sequence, &request ); + + chameleon_sequence_wait( chamctxt, sequence ); + + /* Cleanup the temporary data */ + chameleon_ztile2lap_cleanup( chamctxt, &descAl, &descAt ); + + status = sequence->status; + chameleon_sequence_destroy( chamctxt, sequence ); + return status; +} + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile + * + * CHAMELEON_zgram_Tile - Tile equivalent of CHAMELEON_zgram(). + * Operates on matrices stored by tiles. + * All matrices are passed through descriptors. + * All dimensions are taken from the descriptors. + * + ******************************************************************************* + * + * @param[in] A + * The N-by-N matrix A. + * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * + ******************************************************************************* + * + * @sa CHAMELEON_zgram + * @sa CHAMELEON_zgram_Tile_Async + * @sa CHAMELEON_sgram_Tile + * + */ +int CHAMELEON_zgram_Tile( cham_uplo_t uplo, CHAM_desc_t *A ) +{ + CHAM_context_t *chamctxt; + RUNTIME_sequence_t *sequence = NULL; + RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + int status; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zgram_Tile", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + chameleon_sequence_create( chamctxt, &sequence ); + + CHAMELEON_zgram_Tile_Async( uplo, A, sequence, &request ); + + CHAMELEON_Desc_Flush( A, sequence ); + + chameleon_sequence_wait( chamctxt, sequence ); + status = sequence->status; + chameleon_sequence_destroy( chamctxt, sequence ); + return status; +} + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile_Async + * + * CHAMELEON_zgram_Tile_Async - Non-blocking equivalent of CHAMELEON_zgram_Tile(). + * May return before the computation is finished. + * Allows for pipelining of operations at runtime. + * + ******************************************************************************* + * + * @param[in] sequence + * Identifies the sequence of function calls that this call belongs to + * (for completion checks and exception handling purposes). + * + * @param[out] request + * Identifies this function call (for exception handling purposes). + * + ******************************************************************************* + * + * @sa CHAMELEON_zgram + * @sa CHAMELEON_zgram_Tile + * @sa CHAMELEON_sgram_Tile_Async + * + */ +int CHAMELEON_zgram_Tile_Async( cham_uplo_t uplo, CHAM_desc_t *A, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zgram_Tile", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + if (sequence == NULL) { + chameleon_fatal_error("CHAMELEON_zgram_Tile", "NULL sequence"); + return CHAMELEON_ERR_UNALLOCATED; + } + if (request == NULL) { + chameleon_fatal_error("CHAMELEON_zgram_Tile", "NULL request"); + return CHAMELEON_ERR_UNALLOCATED; + } + /* Check sequence status */ + if (sequence->status == CHAMELEON_SUCCESS) { + request->status = CHAMELEON_SUCCESS; + } + else { + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_SEQUENCE_FLUSHED); + } + + /* Check descriptors for correctness */ + if (chameleon_desc_check(A) != CHAMELEON_SUCCESS) { + chameleon_error("CHAMELEON_zgram_Tile", "invalid descriptor"); + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); + } + /* Check input arguments */ + if (A->nb != A->mb) { + chameleon_error("CHAMELEON_zgram_Tile", "only square tiles supported"); + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); + } + /* Quick return */ + if (chameleon_min(A->m, A->n) == 0) { + return CHAMELEON_SUCCESS; + } + + chameleon_pzgram( uplo, A, sequence, request ); + + return CHAMELEON_SUCCESS; +} diff --git a/control/compute_z.h b/control/compute_z.h index e34ec4dfc..196fde204 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -115,7 +115,10 @@ void chameleon_pzungqr_param( int genD, int K, const libhqr_tree_t *qrtree, CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); - +/** + * Gram function prototypes + */ +void chameleon_pzgram( cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); /** * LAPACK/Tile Descriptor accesses diff --git a/coreblas/compute/CMakeLists.txt b/coreblas/compute/CMakeLists.txt index d44a83fd7..283659eed 100644 --- a/coreblas/compute/CMakeLists.txt +++ b/coreblas/compute/CMakeLists.txt @@ -44,6 +44,7 @@ set(ZSRC core_zgetrf.c core_zgetrf_incpiv.c core_zgetrf_nopiv.c + core_zgram.c core_zhe2ge.c core_zherfb.c core_zhemm.c diff --git a/coreblas/compute/core_zgessq.c b/coreblas/compute/core_zgessq.c index 71aa407bd..8b02bb10a 100644 --- a/coreblas/compute/core_zgessq.c +++ b/coreblas/compute/core_zgessq.c @@ -174,6 +174,24 @@ int CORE_zgessq( cham_store_t storev, int M, int N, const CHAMELEON_Complex64_t *A, int LDA, double *sclssq ) { + int i; + int K; + + /* initialize pairs scale, sumsquare if not already done */ + if ( storev == ChamColumnwise ) { + K = N; + } else if ( storev == ChamRowwise ) { + K = M; + } else { + K = 1; + } + for (i=0; i<2*K; i+=2) { + if ( ( sclssq[i] == -1. ) && ( sclssq[i+1] == -1. ) ) { + sclssq[i] = 1.; + sclssq[i+1] = 0.; + } + } + if (storev == ChamColumnwise) { CORE_zgessq_col( M, N, A, LDA, sclssq ); } diff --git a/coreblas/compute/core_zgram.c b/coreblas/compute/core_zgram.c new file mode 100644 index 000000000..56881ebde --- /dev/null +++ b/coreblas/compute/core_zgram.c @@ -0,0 +1,141 @@ +/** + * + * @file core_zgram.c + * + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon core_zgram CPU kernel + * + * @version 0.9.2 + * @author Mathieu Faverge + * @author Florent Pruvost + * @date 2019-04-10 + * @precisions normal z -> s d c z + * + */ +#include "coreblas.h" + +/** + ******************************************************************************* + * + * @ingroup CORE_CHAMELEON_Complex64_t + * + * CORE_zgram computes the Gram matrix factors of A inplace + * + * \f[ + * d_{i.}^2 = (1/n) \sum_j d_{ij}^2` and :math:`d_{..}^2 = (1/n^2) \sum_{i,j} d_{ij}^2 \\ + * A_{i,j} = -(1/2) (d_{ij}^2 - d_{i.}^2 - d_{.j}^2 + d_{..}^2) + * \f] + * + ******************************************************************************* + * + * @param[in] uplo + * Specifies the part of the matrix A to be computed. + * = ChamUpperLower: All the matrix A + * = ChamUpper: Upper triangular part + * = ChamLower: Lower triangular part + * + * @param[in] M + * The number of rows of the overall matrix. M >= 0. + * + * @param[in] N + * The number of columns of the overall matrix. N >= 0. + * + * @param[in] Mt + * The number of rows of the tile A. Mt >= 0. + * + * @param[in] Nt + * The number of columns of the tile A. Nt >= 0. + * + * @param[in] Di + * The 1-by-Mt tile containing the sum of squares by rows. + * + * @param[in] LDDI + * The leading dimension of the array Di. + * + * @param[in] Dj + * The 1-by-Nt tile containing the sum of squares by columns. + * + * @param[in] LDDJ + * The leading dimension of the array Dj. + * + * @param[in] D + * The sum of squares of all the matrix. + * + * @param[in,out] A + * On entry, the M-by-N tile A. + * On exit, updated by the application of L. + * + * @param[in] LDA + * The leading dimension of the array A. LDA >= max(1,M). + * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * @retval <0 if INFO = -k, the k-th argument had an illegal value + * + */ + +int CORE_zgram( cham_uplo_t uplo, + int M, int N, + int Mt, int Nt, + const double *Di, int LDDI, + const double *Dj, int LDDJ, + const double *D, + double *A, int LDA ) +{ + int i, j; + double coef = -0.5; + double di, dj, d; + + /* Check input arguments */ + if (uplo != ChamUpper && uplo != ChamLower && uplo != ChamUpperLower ) { + coreblas_error(1, "Illegal value of uplo"); + return -1; + } + if (M < 0) { + coreblas_error(2, "Illegal value of M"); + return -2; + } + if (N < 0) { + coreblas_error(3, "Illegal value of N"); + return -3; + } + if (Mt < 0) { + coreblas_error(4, "Illegal value of Mt"); + return -4; + } + if (Nt < 0) { + coreblas_error(5, "Illegal value of Nt"); + return -5; + } + if ((LDA < chameleon_max(1,Mt)) && (Mt > 0)) { + coreblas_error(12, "Illegal value of LDA"); + return -12; + } + + /* Quick return */ + if ((Mt == 0) || (Nt == 0)) + return CHAMELEON_SUCCESS; + + /* overall mean of squares */ + d = D[0]*D[0]/(M*N); + + for(j = 0; j < Nt; j++) { + /* mean of squares of the column */ + dj = Dj[j*LDDJ]*Dj[j*LDDJ]/M; + int mmin = ( uplo == ChamLower ) ? j : 0; + int mmax = ( uplo == ChamUpper) ? chameleon_min(j+1, Mt) : Mt; + for(i = mmin; i < mmax; i++) { + /* mean of squares of the row */ + di = Di[i*LDDI]*Di[i*LDDI]/N; + /* compute Gram factor */ + A[j*LDA+i] = coef*(A[j*LDA+i]*A[j*LDA+i] - di - dj + d); + } + } + + return CHAMELEON_SUCCESS; +} diff --git a/coreblas/compute/core_zplssq.c b/coreblas/compute/core_zplssq.c index 9549453dc..4e8d8750d 100644 --- a/coreblas/compute/core_zplssq.c +++ b/coreblas/compute/core_zplssq.c @@ -41,7 +41,7 @@ CORE_zplssq_col( int M, int N, for(j=0; j= 0.) { if ( (*scaleout) < (*scalein) ) { @@ -114,7 +114,7 @@ sumsq_update_2( int nb, const double *scalein, const double *sumsqin, double *sc } } #elif defined(PRECISION_s) || defined(PRECISION_c) -sumsq_update_2( int nb, const float *scalein, const float *sumsqin, float *scaleout, float *sumsqout ) +sumsq_update_2( const float *scalein, const float *sumsqin, float *scaleout, float *sumsqout ) { if (*scaleout >= 0.) { if ( (*scaleout) < (*scalein) ) { diff --git a/include/chameleon/chameleon_z.h b/include/chameleon/chameleon_z.h index ce3070d1c..60b04f38b 100644 --- a/include/chameleon/chameleon_z.h +++ b/include/chameleon/chameleon_z.h @@ -331,6 +331,13 @@ int CHAMELEON_zbuild(cham_uplo_t uplo, int M, int N, CHAMELEON_Complex64_t *A, i int CHAMELEON_zbuild_Tile(cham_uplo_t uplo, CHAM_desc_t *A, void *user_data, void* user_build_callback ); int CHAMELEON_zbuild_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, void *user_data, void* user_build_callback, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +/** + * Gram function prototypes + */ +int CHAMELEON_zgram( cham_uplo_t uplo, int N, CHAMELEON_Complex64_t *A, int LDA ); +int CHAMELEON_zgram_Tile( cham_uplo_t uplo, CHAM_desc_t *A ); +int CHAMELEON_zgram_Tile_Async( cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); + END_C_DECLS #endif /* _chameleon_z_h_ */ diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index 2d8ec9882..10b43a5aa 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -418,4 +418,15 @@ INSERT_TASK_zttmqr( const RUNTIME_option_t *options, A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 ); } +/** + * Gram prototype + */ +void INSERT_TASK_zgram( const RUNTIME_option_t *options, + cham_uplo_t uplo, + int m, int n, int mt, int nt, + const CHAM_desc_t *Di, int Dim, int Din, int lddi, + const CHAM_desc_t *Dj, int Djm, int Djn, int lddj, + const CHAM_desc_t *D, int Dm, int Dn, + CHAM_desc_t *A, int Am, int An, int lda); + #endif /* _chameleon_tasks_z_h_ */ diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index e184ca536..879d3cee2 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -94,6 +94,10 @@ set(CODELETS_ZSRC # BUILD ################## codelets/codelet_zbuild.c + ################## + # GRAM + ################## + codelets/codelet_zgram.c ) set(CODELETS_SRC diff --git a/runtime/openmp/codelets/codelet_zgram.c b/runtime/openmp/codelets/codelet_zgram.c new file mode 100644 index 000000000..125057333 --- /dev/null +++ b/runtime/openmp/codelets/codelet_zgram.c @@ -0,0 +1,43 @@ +/** + * + * @file openmp/codelet_zgram.c + * + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgram OpenMP codelet + * + * @version 0.9.2 + * @author Mathieu Faverge + * @author Florent Pruvost + * @date 2019-04-10 + * @precisions normal z -> s d c z + * + */ + +#include "chameleon_openmp.h" +#include "chameleon/tasks_z.h" + +void INSERT_TASK_zgram( const RUNTIME_option_t *options, + cham_uplo_t uplo, + int m, int n, int mt, int nt, + const CHAM_desc_t *Di, int Dim, int Din, int lddi, + const CHAM_desc_t *Dj, int Djm, int Djn, int lddj, + const CHAM_desc_t *D, int Dm, int Dn, + CHAM_desc_t *A, int Am, int An, int lda) +{ + double *ptrDi = RTBLKADDR(Di, double, Dim, Din); + double *ptrDj = RTBLKADDR(Dj, double, Djm, Djn); + double *ptrD = RTBLKADDR(D, double, Dm, Dn); + double *ptrA = RTBLKADDR(A, double, Am, An); + +#pragma omp task firstprivate(uplo, m, n, mt, nt, ptrDi, lddi, ptrDj, lddj, ptrD, ptrA, lda) depend(in:ptrDi[0], ptrDj[0], ptrD[0]) depend(inout:ptrA[0]) + CORE_zgram( uplo, + m, n, mt, nt, + ptrDi, lddi, + ptrDj, lddj, + ptrD, + ptrA, lda); +} diff --git a/runtime/openmp/codelets/codelet_zplssq.c b/runtime/openmp/codelets/codelet_zplssq.c index 61196102c..ad59a3eb6 100644 --- a/runtime/openmp/codelets/codelet_zplssq.c +++ b/runtime/openmp/codelets/codelet_zplssq.c @@ -39,7 +39,7 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N, const CHAM_desc_t *RESULT, int RESULTm, int RESULTn ) { - CHAMELEON_Complex64_t *res = RTBLKADDR(RESULT, double, RESULTm, RESULTn); + double *res = RTBLKADDR(RESULT, double, RESULTm, RESULTn); #pragma omp task firstprivate(N) depend(inout: res[0]) CORE_zplssq2(N, res); diff --git a/runtime/parsec/codelets/codelet_zgessq.c b/runtime/parsec/codelets/codelet_zgessq.c index 93b332cbf..4cd62d842 100644 --- a/runtime/parsec/codelets/codelet_zgessq.c +++ b/runtime/parsec/codelets/codelet_zgessq.c @@ -55,6 +55,6 @@ void INSERT_TASK_zgessq( const RUNTIME_option_t *options, sizeof(int), &n, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, sizeof(int), &lda, VALUE, - PASSED_BY_REF, RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), INOUT | AFFINITY, + PASSED_BY_REF, RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), chameleon_parsec_get_arena_index( SCALESUMSQ ) | INOUT | AFFINITY, PARSEC_DTD_ARG_END ); } diff --git a/runtime/parsec/codelets/codelet_zgram.c b/runtime/parsec/codelets/codelet_zgram.c new file mode 100644 index 000000000..8d0217cc2 --- /dev/null +++ b/runtime/parsec/codelets/codelet_zgram.c @@ -0,0 +1,131 @@ +/** + * + * @file parsec/codelet_zgram.c + * + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgram PaRSEC codelet + * + * @version 0.9.2 + * @author Mathieu Faverge + * @author Florent Pruvost + * @date 2019-04-11 + * @precisions normal z -> c d s + * + */ +#include "chameleon_parsec.h" +#include "chameleon/tasks_z.h" +#include "coreblas/coreblas_z.h" + +/** + * + * @ingroup INSERT_TASK_Complex64_t + * + */ +static inline int +CORE_zgegram_parsec( parsec_execution_stream_t *context, + parsec_task_t *this_task ) +{ + cham_uplo_t uplo; + int m, n, mt, nt; + double *Di; + int lddi; + double *Dj; + int lddj; + double *D; + double *A; + int lda; + + parsec_dtd_unpack_args( + this_task, &uplo, &m, &n, &mt, &nt, &Di, &lddi, &Dj, &lddj, &D, &A, &lda ); + + CORE_zgram( uplo, + m, n, mt, nt, + Di, lddi, + Dj, lddj, + D, + A, lda); + + (void)context; + return PARSEC_HOOK_RETURN_DONE; +} + +static inline int +CORE_zsygram_parsec( parsec_execution_stream_t *context, + parsec_task_t *this_task ) +{ + cham_uplo_t uplo; + int m, n, mt, nt; + double *Di; + int lddi; + double *D; + double *A; + int lda; + + parsec_dtd_unpack_args( + this_task, &uplo, &m, &n, &mt, &nt, &Di, &lddi, &D, &A, &lda ); + + CORE_zgram( uplo, + m, n, mt, nt, + Di, lddi, + Di, lddi, + D, + A, lda); + + (void)context; + return PARSEC_HOOK_RETURN_DONE; +} + +void INSERT_TASK_zgram( const RUNTIME_option_t *options, + cham_uplo_t uplo, + int m, int n, int mt, int nt, + const CHAM_desc_t *Di, int Dim, int Din, int lddi, + const CHAM_desc_t *Dj, int Djm, int Djn, int lddj, + const CHAM_desc_t *D, int Dm, int Dn, + CHAM_desc_t *A, int Am, int An, int lda) +{ + parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + double *ptrDi, *ptrDj; + + /* + * Test if Di is Dj, when we are on the diagonal. + * This to avoid having the same data twice in inputs (not handled in parsec). + */ + ptrDi = (double *)(RTBLKADDR( Di, double, Dim, Din )); + ptrDj = (double *)(RTBLKADDR( Dj, double, Djm, Djn )); + if (ptrDi == ptrDj) { + parsec_dtd_taskpool_insert_task( + PARSEC_dtd_taskpool, CORE_zsygram_parsec, options->priority, "sygram", + sizeof(int), &uplo, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + sizeof(int), &mt, VALUE, + sizeof(int), &nt, VALUE, + PASSED_BY_REF, RTBLKADDR( Di, double, Dim, Din ), chameleon_parsec_get_arena_index( Di ) | INPUT, + sizeof(int), &lddi, VALUE, + PASSED_BY_REF, RTBLKADDR( D, double, Dm, Dn ), chameleon_parsec_get_arena_index( D ) | INPUT, + PASSED_BY_REF, RTBLKADDR( A, double, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT | AFFINITY, + sizeof(int), &lda, VALUE, + PARSEC_DTD_ARG_END ); + } else { + parsec_dtd_taskpool_insert_task( + PARSEC_dtd_taskpool, CORE_zgegram_parsec, options->priority, "gegram", + sizeof(int), &uplo, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + sizeof(int), &mt, VALUE, + sizeof(int), &nt, VALUE, + PASSED_BY_REF, RTBLKADDR( Di, double, Dim, Din ), chameleon_parsec_get_arena_index( Di ) | INPUT, + sizeof(int), &lddi, VALUE, + PASSED_BY_REF, RTBLKADDR( Dj, double, Djm, Djn ), chameleon_parsec_get_arena_index( Dj ) | INPUT, + sizeof(int), &lddj, VALUE, + PASSED_BY_REF, RTBLKADDR( D, double, Dm, Dn ), chameleon_parsec_get_arena_index( D ) | INPUT, + PASSED_BY_REF, RTBLKADDR( A, double, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT | AFFINITY, + sizeof(int), &lda, VALUE, + PARSEC_DTD_ARG_END ); + } + +} diff --git a/runtime/parsec/codelets/codelet_zplssq.c b/runtime/parsec/codelets/codelet_zplssq.c index 16f19a21f..96ef96dd5 100644 --- a/runtime/parsec/codelets/codelet_zplssq.c +++ b/runtime/parsec/codelets/codelet_zplssq.c @@ -53,8 +53,8 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, sizeof(int), &storev, VALUE, sizeof(int), &M, VALUE, sizeof(int), &N, VALUE, - PASSED_BY_REF, RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), INPUT, - PASSED_BY_REF, RTBLKADDR( SCLSSQ, double, SCLSSQm, SCLSSQn ), INOUT | AFFINITY, + PASSED_BY_REF, RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), chameleon_parsec_get_arena_index( SCALESUMSQ) | INPUT, + PASSED_BY_REF, RTBLKADDR( SCLSSQ, double, SCLSSQm, SCLSSQn ), chameleon_parsec_get_arena_index( SCLSSQ) | INOUT | AFFINITY, PARSEC_DTD_ARG_END ); } @@ -82,6 +82,6 @@ void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N, parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zplssq2_parsec, options->priority, "plssq2", sizeof(int), &N, VALUE, - PASSED_BY_REF, RTBLKADDR( RESULT, double, RESULTm, RESULTn ), INOUT | AFFINITY, + PASSED_BY_REF, RTBLKADDR( RESULT, double, RESULTm, RESULTn ), chameleon_parsec_get_arena_index( RESULT) | INOUT | AFFINITY, PARSEC_DTD_ARG_END ); } diff --git a/runtime/parsec/codelets/codelet_zsyssq.c b/runtime/parsec/codelets/codelet_zsyssq.c index 8e86035cc..d3797ca26 100644 --- a/runtime/parsec/codelets/codelet_zsyssq.c +++ b/runtime/parsec/codelets/codelet_zsyssq.c @@ -55,6 +55,6 @@ void INSERT_TASK_zsyssq( const RUNTIME_option_t *options, sizeof(int), &n, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, sizeof(int), &lda, VALUE, - PASSED_BY_REF, RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), INOUT | AFFINITY, + PASSED_BY_REF, RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), chameleon_parsec_get_arena_index( SCALESUMSQ ) | INOUT | AFFINITY, PARSEC_DTD_ARG_END ); } diff --git a/runtime/quark/codelets/codelet_zgessq.c b/runtime/quark/codelets/codelet_zgessq.c index bf5396e42..a0f343091 100644 --- a/runtime/quark/codelets/codelet_zgessq.c +++ b/runtime/quark/codelets/codelet_zgessq.c @@ -41,13 +41,24 @@ void INSERT_TASK_zgessq( const RUNTIME_option_t *options, const CHAM_desc_t *A, int Am, int An, int lda, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { + int sizessq; + + if ( storev == ChamColumnwise ) { + sizessq = 2*n; + } else if ( storev == ChamRowwise ) { + sizessq = 2*m; + } else { + sizessq = 2; + } + quark_option_t *opt = (quark_option_t*)(options->schedopt); + DAG_CORE_GESSQ; QUARK_Insert_Task(opt->quark, CORE_zgessq_quark, (Quark_Task_Flags*)opt, sizeof(cham_store_t), &storev, VALUE, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, - sizeof(CHAMELEON_Complex64_t)*lda*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(CHAMELEON_Complex64_t)*m*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(int), &lda, VALUE, - sizeof(double)*2, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INOUT, + sizeof(double)*sizessq, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_zgram.c b/runtime/quark/codelets/codelet_zgram.c new file mode 100644 index 000000000..f2c4228ec --- /dev/null +++ b/runtime/quark/codelets/codelet_zgram.c @@ -0,0 +1,68 @@ +/** + * + * @file quark/codelet_zgram.c + * + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgram Quark codelet + * + * @version 0.9.2 + * @author Mathieu Faverge + * @author Florent Pruvost + * @date 2019-04-16 + * @precisions normal z -> c d s + * + */ +#include "chameleon_quark.h" +#include "chameleon/tasks_z.h" +#include "coreblas/coreblas_z.h" + +void CORE_zgram_quark(Quark *quark) +{ + cham_uplo_t uplo; + int m, n, mt, nt; + double *Di; + int lddi; + double *Dj; + int lddj; + double *D; + double *A; + int lda; + + quark_unpack_args_12(quark, uplo, m, n, mt, nt, Di, lddi, Dj, lddj, D, A, lda); + CORE_zgram( uplo, + m, n, mt, nt, + Di, lddi, + Dj, lddj, + D, + A, lda); +} + +void INSERT_TASK_zgram( const RUNTIME_option_t *options, + cham_uplo_t uplo, + int m, int n, int mt, int nt, + const CHAM_desc_t *Di, int Dim, int Din, int lddi, + const CHAM_desc_t *Dj, int Djm, int Djn, int lddj, + const CHAM_desc_t *D, int Dm, int Dn, + CHAM_desc_t *A, int Am, int An, int lda) +{ + quark_option_t *opt = (quark_option_t*)(options->schedopt); + DAG_CORE_GRAM; + QUARK_Insert_Task(opt->quark, CORE_zgram_quark, (Quark_Task_Flags*)opt, + sizeof(int), &uplo, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + sizeof(int), &mt, VALUE, + sizeof(int), &nt, VALUE, + sizeof(double)*lddi*mt, RTBLKADDR(Di, double, Dim, Din), INPUT, + sizeof(int), &lddi, VALUE, + sizeof(double)*lddj*nt, RTBLKADDR(Dj, double, Djm, Djn), INPUT, + sizeof(int), &lddj, VALUE, + sizeof(double)*2, RTBLKADDR(D, double, Dm, Dn), INPUT, + sizeof(double)*mt*nt, RTBLKADDR(A, double, Am, An), INOUT, + sizeof(int), &lda, VALUE, + 0); +} diff --git a/runtime/quark/codelets/codelet_zplssq.c b/runtime/quark/codelets/codelet_zplssq.c index 70810682c..b340873af 100644 --- a/runtime/quark/codelets/codelet_zplssq.c +++ b/runtime/quark/codelets/codelet_zplssq.c @@ -73,7 +73,7 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn, const CHAM_desc_t *SCLSSQ, int SCLSSQm, int SCLSSQn ) { - int sizein = M*N; + int sizein = 2*M*N; int sizeout; if ( storev == ChamColumnwise ) { @@ -85,6 +85,7 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, } quark_option_t *opt = (quark_option_t*)(options->schedopt); + DAG_CORE_PLSSQ; QUARK_Insert_Task(opt->quark, CORE_zplssq_quark, (Quark_Task_Flags*)opt, sizeof(int), &storev, VALUE, sizeof(int), &M, VALUE, @@ -109,6 +110,7 @@ void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N, const CHAM_desc_t *RESULT, int RESULTm, int RESULTn ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); + DAG_CORE_PLSSQ2; QUARK_Insert_Task(opt->quark, CORE_zplssq2_quark, (Quark_Task_Flags*)opt, sizeof(int), &N, VALUE, sizeof(double)*2*N, RTBLKADDR(RESULT, double, RESULTm, RESULTn), INOUT, diff --git a/runtime/quark/codelets/codelet_zsyssq.c b/runtime/quark/codelets/codelet_zsyssq.c index cf1f39442..913424820 100644 --- a/runtime/quark/codelets/codelet_zsyssq.c +++ b/runtime/quark/codelets/codelet_zsyssq.c @@ -41,13 +41,22 @@ void INSERT_TASK_zsyssq( const RUNTIME_option_t *options, const CHAM_desc_t *A, int Am, int An, int lda, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { + int sizessq; + + if ( storev == ChamEltwise ) { + sizessq = 2; + } else { + sizessq = 2*n; + } + quark_option_t *opt = (quark_option_t*)(options->schedopt); + DAG_CORE_SYSSQ; QUARK_Insert_Task(opt->quark, CORE_zsyssq_quark, (Quark_Task_Flags*)opt, sizeof(cham_store_t), &storev, VALUE, sizeof(int), &uplo, VALUE, sizeof(int), &n, VALUE, - sizeof(CHAMELEON_Complex64_t)*lda*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(CHAMELEON_Complex64_t)*n*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(int), &lda, VALUE, - sizeof(double)*2, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INOUT, + sizeof(double)*sizessq, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_ztrssq.c b/runtime/quark/codelets/codelet_ztrssq.c index 92cac805f..80bb9e78c 100644 --- a/runtime/quark/codelets/codelet_ztrssq.c +++ b/runtime/quark/codelets/codelet_ztrssq.c @@ -44,6 +44,7 @@ void INSERT_TASK_ztrssq( const RUNTIME_option_t *options, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); + DAG_CORE_TRSSQ; QUARK_Insert_Task(opt->quark, CORE_ztrssq_quark, (Quark_Task_Flags*)opt, sizeof(int), &uplo, VALUE, sizeof(int), &diag, VALUE, diff --git a/runtime/quark/include/core_blas_dag.h b/runtime/quark/include/core_blas_dag.h index 83bdbd531..acc7ca8a1 100644 --- a/runtime/quark/include/core_blas_dag.h +++ b/runtime/quark/include/core_blas_dag.h @@ -90,4 +90,12 @@ #define DAG_CORE_TTMQR DAG_CORE_TPMQRT #define DAG_CORE_TTQRT DAG_CORE_TPQRT +#define DAG_CORE_GESSQ DAG_SET_PROPERTIES( "GESSQ" , "white" ) +#define DAG_CORE_HESSQ DAG_SET_PROPERTIES( "HESSQ" , "white" ) +#define DAG_CORE_SYSSQ DAG_SET_PROPERTIES( "SYSSQ" , "white" ) +#define DAG_CORE_TRSSQ DAG_SET_PROPERTIES( "TRSSQ" , "white" ) +#define DAG_CORE_PLSSQ DAG_SET_PROPERTIES( "PLSSQ" , "white" ) +#define DAG_CORE_PLSSQ2 DAG_SET_PROPERTIES( "PLSSQ2" , "white" ) +#define DAG_CORE_GRAM DAG_SET_PROPERTIES( "GRAM" , "orange" ) + #endif /* _core_blas_dag_h_ */ diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c index dddcb6994..44d431b7d 100644 --- a/runtime/starpu/codelets/codelet_zcallback.c +++ b/runtime/starpu/codelets/codelet_zcallback.c @@ -22,7 +22,7 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -CHAMELEON_CL_CB(dzasum, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) +CHAMELEON_CL_CB(dzasum, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) CHAMELEON_CL_CB(zaxpy, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[1]), 0, M) CHAMELEON_CL_CB(zgeadd, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) CHAMELEON_CL_CB(zlascal, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) @@ -34,6 +34,7 @@ CHAMELEON_CL_CB(zgessq, starpu_matrix_get_nx(task->handles[0]), starpu_ma CHAMELEON_CL_CB(zgetrf, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (2./3.)*M*N*K) CHAMELEON_CL_CB(zgetrf_incpiv, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (2./3.)*M*N*K) CHAMELEON_CL_CB(zgetrf_nopiv, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (2./3.)*M*N*K) +CHAMELEON_CL_CB(zgram, starpu_matrix_get_nx(task->handles[3]), starpu_matrix_get_ny(task->handles[3]), 0, M*N) CHAMELEON_CL_CB(zhe2ge, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, (1./2.0)*M*N) CHAMELEON_CL_CB(zherfb, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, 2. *M* M*M) #if defined(PRECISION_z) || defined(PRECISION_c) diff --git a/runtime/starpu/codelets/codelet_zgram.c b/runtime/starpu/codelets/codelet_zgram.c new file mode 100644 index 000000000..56af2b407 --- /dev/null +++ b/runtime/starpu/codelets/codelet_zgram.c @@ -0,0 +1,92 @@ +/** + * + * @file starpu/codelet_zgram.c + * + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgram StarPU codelet + * + * @version 0.9.2 + * @author Mathieu Faverge + * @author Florent Pruvost + * @date 2019-04-16 + * @precisions normal z -> c d s + * + */ +#include "chameleon_starpu.h" +#include "runtime_codelet_z.h" + +#if !defined(CHAMELEON_SIMULATION) +static void cl_zgram_cpu_func(void *descr[], void *cl_arg) +{ + cham_uplo_t uplo; + int m, n, mt, nt; + double *Di; + int lddi; + double *Dj; + int lddj; + double *D; + double *A; + int lda; + + Di = (double *)STARPU_MATRIX_GET_PTR(descr[0]); + Dj = (double *)STARPU_MATRIX_GET_PTR(descr[1]); + D = (double *)STARPU_MATRIX_GET_PTR(descr[2]); + A = (double *)STARPU_MATRIX_GET_PTR(descr[3]); + starpu_codelet_unpack_args(cl_arg, &uplo, &m, &n, &mt, &nt, &lddi, &lddj, &lda); + CORE_zgram( uplo, + m, n, mt, nt, + Di, lddi, + Dj, lddj, + D, + A, lda); +} +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +CODELETS_CPU(zgram, 4, cl_zgram_cpu_func) + +void INSERT_TASK_zgram( const RUNTIME_option_t *options, + cham_uplo_t uplo, + int m, int n, int mt, int nt, + const CHAM_desc_t *Di, int Dim, int Din, int lddi, + const CHAM_desc_t *Dj, int Djm, int Djn, int lddj, + const CHAM_desc_t *D, int Dm, int Dn, + CHAM_desc_t *A, int Am, int An, int lda) +{ + struct starpu_codelet *codelet = &cl_zgram; + void (*callback)(void*) = options->profiling ? cl_zgram_callback : NULL; + + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_R(Di, Dim, Din); + CHAMELEON_ACCESS_R(Dj, Djm, Djn); + CHAMELEON_ACCESS_R(D, Dm, Dn); + CHAMELEON_ACCESS_RW(A, Am, An); + CHAMELEON_END_ACCESS_DECLARATION; + + starpu_insert_task( + starpu_mpi_codelet(codelet), + STARPU_VALUE, &uplo, sizeof(int), + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_VALUE, &mt, sizeof(int), + STARPU_VALUE, &nt, sizeof(int), + STARPU_R, RTBLKADDR(Di, double, Dim, Din), + STARPU_VALUE, &lddi, sizeof(int), + STARPU_R, RTBLKADDR(Dj, double, Djm, Djn), + STARPU_VALUE, &lddj, sizeof(int), + STARPU_R, RTBLKADDR(D, double, Dm, Dn), + STARPU_RW, RTBLKADDR(A, double, Am, An), + STARPU_VALUE, &lda, sizeof(int), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "zgram", +#endif + 0); +} diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h index 4936d329c..c92f3a658 100644 --- a/runtime/starpu/include/runtime_codelet_z.h +++ b/runtime/starpu/include/runtime_codelet_z.h @@ -112,6 +112,7 @@ CODELETS_HEADER(dzasum) */ CODELETS_HEADER(zplrnt) CODELETS_HEADER(zbuild) +CODELETS_HEADER(zgram) #if defined(PRECISION_z) || defined(PRECISION_c) CODELETS_HEADER(zhessq) diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt index 40b853364..4ffbae9ea 100644 --- a/testing/CMakeLists.txt +++ b/testing/CMakeLists.txt @@ -94,6 +94,7 @@ set(ZSRC #testing_zhegv.c #testing_zhegvd.c testing_zgeqrf_qdwh.c + testing_dgram.c ) # Add include and link directories diff --git a/testing/CTestLists.cmake b/testing/CTestLists.cmake index 4fc8f3cf2..a5cbc1710 100644 --- a/testing/CTestLists.cmake +++ b/testing/CTestLists.cmake @@ -66,3 +66,11 @@ endforeach() # add_test(test_mpi_${prec}lange mpirun -np 4 ./${prec}testing 1 0 LANGE 600 500 600 --p=2) # endforeach() #endif() + +# Specific algorithms +# Gram +foreach(cat ${TEST_CATEGORIES}) + foreach(prec s;d) + add_test(test_${cat}_${prec}gram ./${prec}${TEST_CMD_${cat}} GRAM 117 213 ) + endforeach() +endforeach() \ No newline at end of file diff --git a/testing/testing_dgram.c b/testing/testing_dgram.c new file mode 100644 index 000000000..c9ca38a3f --- /dev/null +++ b/testing/testing_dgram.c @@ -0,0 +1,261 @@ +/** + * + * @file testing_dgram.c + * + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon dgram testing + * + * @version 0.9.2 + * @author Mathieu Faverge + * @author Florent Pruvost + * @date 2019-04-12 + * @precisions normal d -> d s + * + */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include "testing_zauxiliary.h" +#if defined(CHAMELEON_USE_MPI) +#include +#endif + +static int check_solution(cham_uplo_t uplo, + int N, + double *Aref, + double *Acham, int LDA); +static int compute_gram_sequential(cham_uplo_t uplo, + int N, + double *A, + int LDA); + +int testing_dgram(int argc, char **argv) +{ + int hres = 0; + /* Check for number of arguments */ + if ( argc < 2) { + USAGE("GRAM", "N LDA", + " - N : number of rows of matrix A\n" + " - LDA : leading dimension of matrix A\n"); + return -1; + } + int N = atoi(argv[0]); + int LDA = atoi(argv[1]); + + double eps; + int info_solution; + int i, j, ua; + int LDAxN = LDA*N; + + double *A = (double *)malloc(LDAxN*sizeof(double)); + double *Aref = (double *)malloc(LDAxN*sizeof(double)); + double *Acham = (double *)malloc(LDAxN*sizeof(double)); + + /* Check if unable to allocate memory */ + if ( (!A) || (!Aref) || (!Acham) ) + { + free(A); free(Aref); free(Acham); + printf("Out of Memory \n "); + return -2; + } + + eps = LAPACKE_dlamch_work('e'); + + if (CHAMELEON_My_Mpi_Rank() == 0){ + printf("\n"); + printf("------ TESTS FOR CHAMELEON GRAM ROUTINE ------- \n"); + printf(" Size of the Matrix %d by %d\n", N, N); + printf("\n"); + printf(" The matrix A is randomly generated for each test.\n"); + printf("============\n"); + printf(" The relative machine precision (eps) is to be %e \n",eps); + printf(" Computational tests pass if scaled residuals are less than 10.\n"); + } + + /*---------------------------------------------------------- + * TESTING GRAM + */ + + /* Initialize A such that it is symmetric */ + CHAMELEON_dplgsy( (double)N, ChamUpperLower, N, A, LDA, 51 ); + /* Gram is meant to be used with A full of positive values only */ +#if defined(PRECISION_d) || defined(PRECISION_s) + for (i=0; i 10.0) ) { + if (CHAMELEON_My_Mpi_Rank() == 0) + printf("-- The solution is suspicious ! \n"); + info_solution = 1; + } + else { + if (CHAMELEON_My_Mpi_Rank() == 0) + printf("-- The solution is CORRECT ! \n"); + info_solution= 0 ; + } + + free(work); + + return info_solution; +} + +/*-------------------------------------------------------------- + * Compute the Gram matrix sequentially + * We consider the matrix on entry as symmetric + */ +static int compute_gram_sequential(cham_uplo_t uplo, + int N, + double *A, + int LDA) +{ + int m, n; + double eps; + double squareij, mean_dij, mhalf; + + double *work = (double *)malloc(N * sizeof(double)); + + mhalf = -0.5; + + /* initialize work */ + memset(work, 0., N*sizeof(double)); + + /* first: compute the means of squares */ + for (n=0; n