diff --git a/cmake_modules/local_subs.py b/cmake_modules/local_subs.py index 4608e59b8a50b4ed692ab9e62e7d17ac47b4a9ed..51fcb8ec288d1acc4162d3ca5a8d77ae25f6bb56 100644 --- a/cmake_modules/local_subs.py +++ b/cmake_modules/local_subs.py @@ -14,6 +14,7 @@ _extra_blas = [ ('', 'she2ge', 'dhe2ge', 'che2ge', 'zhe2ge' ), ('', 'slatro', 'dlatro', 'clatro', 'zlatro' ), #=> Replace by getmo/gecmo as in essl ('', 'sbuild', 'dbuild', 'cbuild', 'zbuild' ), #=> Replace by map function + ('', 'sgram', 'dgram', 'cgram', 'zgram' ), ] _extra_BLAS = [ [ x.upper() for x in row ] for row in _extra_blas ] diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt index 96493134cf99cb919e819f4c7543e4dff94b2370..3ca2aaaa1adc53980cc7d8496719df735decbc52 100644 --- a/compute/CMakeLists.txt +++ b/compute/CMakeLists.txt @@ -242,6 +242,11 @@ set(ZSRC ################## zbuild.c pzbuild.c + ################## + # Gram + ################## + zgram.c + pzgram.c ) precisions_rules_py(CHAMELEON_SRCS_GENERATED "${ZSRC}" diff --git a/compute/pzgram.c b/compute/pzgram.c new file mode 100644 index 0000000000000000000000000000000000000000..c8fc0b34e4e38043bfc8468c747fa0f2b1fd5a0f --- /dev/null +++ b/compute/pzgram.c @@ -0,0 +1,209 @@ +/** + * + * @file pzgram.c + * + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgram parallel algorithm + * + * @version 0.9.2 + * @author Mathieu Faverge + * @author Florent Pruvost + * @date 2019-04-10 + * @precisions normal z -> s d c z + * + */ +#include "control/common.h" + +#define A( m, n ) A, (m), (n) +#define W( desc, m, n ) (desc), (m), (n) + +static inline void +chameleon_pzgram_internal( cham_uplo_t uplo, + CHAM_desc_t *A, + CHAM_desc_t *Wcol, + CHAM_desc_t *Welt, + RUNTIME_option_t *options ) +{ + int m, n; + int MT = A->mt; + int NT = A->nt; + int M = A->m; + int N = A->n; + int P = Welt->p; + int Q = Welt->q; + + /** + * 1) compute (scl,ssq) over columns in each tile + */ + for(n = 0; n < NT; n++) { + int mmin = ( uplo == ChamLower ) ? n : 0; + int mmax = ( uplo == ChamUpper ) ? chameleon_min(n+1, MT) : MT; + int tempnn = ( n == (NT-1) ) ? N - n * A->nb : A->nb; + + for(m = mmin; m < mmax; m++) { + int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb; + int ldam = BLKLDD( A, m ); + + if ( n == m ) { + INSERT_TASK_dsyssq( + options, ChamColumnwise, uplo, tempmm, + A(m, n), ldam, W( Wcol, m, n) ); + } + else { + INSERT_TASK_dgessq( + options, ChamColumnwise, tempmm, tempnn, + A(m, n), ldam, W( Wcol, m, n) ); + if ( uplo != ChamUpperLower ) { + INSERT_TASK_dgessq( + options, ChamRowwise, tempmm, tempnn, + A(m, n), ldam, W( Wcol, n, m) ); + } + } + } + } + + for(n = 0; n < NT; n++) { + int tempnn = ( n == (NT-1) ) ? N - n * A->nb : A->nb; + + /** + * 2) reduce columns (scl,ssq) tiles per processus (between lines) + */ + for(m = P; m < MT; m++) { + INSERT_TASK_dplssq( + options, ChamColumnwise, 1, tempnn, + W( Wcol, m, n ), + W( Wcol, m%P, n ) ); + } + + /** + * 3) reduce columns (scl,ssq) tiles on the first line of tiles + */ + for(m = 1; m < P; m++) { + INSERT_TASK_dplssq( + options, ChamColumnwise, 1, tempnn, + W( Wcol, m, n ), + W( Wcol, 0, n ) ); + } + + /* 4) reduce (scl,ssq) inside each tile of the first line of tiles for the global sum square */ + INSERT_TASK_dplssq( + options, ChamEltwise, 1, tempnn, + W( Wcol, 0, n ), + W( Welt, 0, n ) ); + + /* 5) deduce the sum square for each column from the pairs (scl,ssq) -> sqrt(sum) = scl*sqrt(ssq) */ + INSERT_TASK_dplssq2( options, tempnn, W( Wcol, 0, n ) ); + } + + /* 6) reduce global sum squares on each processus (between columns) */ + for(n = Q; n < NT; n++) { + INSERT_TASK_dplssq( options, ChamEltwise, 1, 1, W( Welt, 0, n), W( Welt, 0, n%Q) ); + } + + /* 7) reduce global sum squares on the first tile (index 0, 0) */ + for(n = 1; n < Q; n++) { + INSERT_TASK_dplssq( + options, ChamEltwise, 1, 1, W( Welt, 0, n), W( Welt, 0, 0) ); + } + + /* 8) deduce the global sum square from the pair (scl,ssq) -> sqrt(sum) = scl*sqrt(ssq) */ + INSERT_TASK_dplssq2( options, 1, W( Welt, 0, 0) ); + + /* Finally compute Gram matrix coefficients inplace */ + for(n = 0; n < NT; n++) { + int mmin = ( uplo == ChamLower ) ? n : 0; + int mmax = ( uplo == ChamUpper ) ? chameleon_min(n+1, MT) : MT; + int tempnn = ( n == (NT-1) ) ? N - n * A->nb : A->nb; + + for(m = mmin; m < mmax; m++) { + int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb; + int ldam = BLKLDD( A, m ); + + INSERT_TASK_zgram( + options, + ( m == n ) ? uplo : ChamUpperLower, + A->m, A->n, tempmm, tempnn, + W( Wcol, 0, m ), 2, + W( Wcol, 0, n ), 2, + W( Welt, 0, 0 ), + A( m, n ), ldam ); + } + } +} + +/** + * + */ +void chameleon_pzgram( cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + RUNTIME_option_t options; + CHAM_desc_t Wcol; + CHAM_desc_t Welt; + + int workmt, worknt; + int m, n, tempmm, tempnn, ldw; + + chamctxt = chameleon_context_self(); + if ( sequence->status != CHAMELEON_SUCCESS ) { + return; + } + RUNTIME_options_init(&options, chamctxt, sequence, request); + + workmt = chameleon_max( A->mt, A->p ); + worknt = chameleon_max( A->nt, A->q ); + + RUNTIME_options_ws_alloc( &options, 1, 0 ); + + chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, 2, A->nb, 2*A->nb, + 2*workmt, A->n, 0, 0, 2*workmt, A->n, A->p, A->q, + NULL, NULL, NULL ); + + chameleon_desc_init( &Welt, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, 2, 1, 2, + 2, worknt, 0, 0, 2, worknt, A->p, A->q, + NULL, NULL, NULL ); + + /* Initialize Wcol */ + for(m = 0; m < Wcol.mt; m++) { + tempmm = m == Wcol.mt-1 ? Wcol.m-m*Wcol.mb : Wcol.mb; + ldw = Wcol.get_blkldd(&Wcol, m); + for(n = 0; n < Wcol.nt; n++) { + tempnn = n == Wcol.nt-1 ? Wcol.n-n*Wcol.nb : Wcol.nb; + INSERT_TASK_dlaset( + &options, + ChamUpperLower, tempmm, tempnn, + -1., -1., + W( &Wcol, m, n ), ldw ); + } + } + /* Initialize Welt */ + for(m = 0; m < Welt.mt; m++) { + tempmm = m == Welt.mt-1 ? Welt.m-m*Welt.mb : Welt.mb; + ldw = Welt.get_blkldd(&Welt, m); + for(n = 0; n < Welt.nt; n++) { + tempnn = n == Welt.nt-1 ? Welt.n-n*Welt.nb : Welt.nb; + INSERT_TASK_dlaset( + &options, + ChamUpperLower, tempmm, tempnn, + -1., -1., + W( &Welt, m, n ), ldw ); + } + } + + chameleon_pzgram_internal( uplo, A, &Wcol, &Welt, &options ); + + CHAMELEON_Desc_Flush( &Wcol, sequence ); + CHAMELEON_Desc_Flush( &Welt, sequence ); + CHAMELEON_Desc_Flush( A, sequence ); + RUNTIME_sequence_wait( chamctxt, sequence ); + + chameleon_desc_destroy( &Wcol ); + chameleon_desc_destroy( &Welt ); + + RUNTIME_options_ws_free(&options); + RUNTIME_options_finalize(&options, chamctxt); +} diff --git a/compute/zgram.c b/compute/zgram.c new file mode 100644 index 0000000000000000000000000000000000000000..602856a52cc43e427a377cef676135db1e9959b1 --- /dev/null +++ b/compute/zgram.c @@ -0,0 +1,236 @@ +/** + * + * @file zgram.c + * + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgram wrappers + * + * @version 0.9.2 + * @author Mathieu Faverge + * @author Florent Pruvost + * @date 2019-04-10 + * @precisions normal z -> s d c z + * + */ +#include "control/common.h" + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * CHAMELEON_zgram replace a general matrix by the Gram matrix inplace + * + * \f[ + * d_{i.}^2 = (1/n) \sum_j d_{ij}^2` and :math:`d_{..}^2 = (1/n^2) \sum_{i,j} d_{ij}^2 \\ + * A_{i,j} = -(1/2) (d_{ij}^2 - d_{i.}^2 - d_{.j}^2 + d_{..}^2) + * \f] + * + ******************************************************************************* + * + * @param[in] N + * The number of lines and columns of the matrix A. N >= 0. When N = 0, + * the returned value is set to zero. + * + * @param[in] A + * The N-by-N matrix A. + * + * @param[in] LDA + * The leading dimension of the array A. LDA >= max(1,N). + * + ******************************************************************************* + * +* @retval CHAMELEON_SUCCESS successful exit + * + ******************************************************************************* + * + * @sa CHAMELEON_zgram_Tile + * @sa CHAMELEON_zgram_Tile_Async + * @sa CHAMELEON_sgram + * + */ +int CHAMELEON_zgram( cham_uplo_t uplo, int N, CHAMELEON_Complex64_t *A, int LDA ) +{ + int NB; + int status; + CHAM_context_t *chamctxt; + RUNTIME_sequence_t *sequence = NULL; + RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + CHAM_desc_t descAl, descAt; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zgram", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + /* Check input arguments */ + if (N < 0) { + chameleon_error("CHAMELEON_zgram", "illegal value of N"); + return -1; + } + if (LDA < chameleon_max(1, N)) { + chameleon_error("CHAMELEON_zgram", "illegal value of LDA"); + return -3; + } + + /* Quick return */ + if (N == 0) + return CHAMELEON_SUCCESS; + + /* Tune NB depending on M, N & NRHS; Set NBNB */ + status = chameleon_tune(CHAMELEON_FUNC_DGEMM, N, N, 0); + if (status != CHAMELEON_SUCCESS) { + chameleon_error("CHAMELEON_zgram", "chameleon_tune() failed"); + return status; + } + + /* Set NT */ + NB = CHAMELEON_NB; + + chameleon_sequence_create( chamctxt, &sequence ); + + /* Submit the matrix conversion */ + chameleon_zlap2tile( chamctxt, &descAl, &descAt, ChamDescInout, uplo, + A, NB, NB, LDA, N, N, N, sequence, &request ); + + /* Call the tile interface */ + CHAMELEON_zgram_Tile_Async( uplo, &descAt, sequence, &request ); + + /* Submit the matrix conversion back */ + chameleon_ztile2lap( chamctxt, &descAl, &descAt, + ChamDescInout, uplo, sequence, &request ); + + chameleon_sequence_wait( chamctxt, sequence ); + + /* Cleanup the temporary data */ + chameleon_ztile2lap_cleanup( chamctxt, &descAl, &descAt ); + + status = sequence->status; + chameleon_sequence_destroy( chamctxt, sequence ); + return status; +} + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile + * + * CHAMELEON_zgram_Tile - Tile equivalent of CHAMELEON_zgram(). + * Operates on matrices stored by tiles. + * All matrices are passed through descriptors. + * All dimensions are taken from the descriptors. + * + ******************************************************************************* + * + * @param[in] A + * The N-by-N matrix A. + * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * + ******************************************************************************* + * + * @sa CHAMELEON_zgram + * @sa CHAMELEON_zgram_Tile_Async + * @sa CHAMELEON_sgram_Tile + * + */ +int CHAMELEON_zgram_Tile( cham_uplo_t uplo, CHAM_desc_t *A ) +{ + CHAM_context_t *chamctxt; + RUNTIME_sequence_t *sequence = NULL; + RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + int status; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zgram_Tile", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + chameleon_sequence_create( chamctxt, &sequence ); + + CHAMELEON_zgram_Tile_Async( uplo, A, sequence, &request ); + + CHAMELEON_Desc_Flush( A, sequence ); + + chameleon_sequence_wait( chamctxt, sequence ); + status = sequence->status; + chameleon_sequence_destroy( chamctxt, sequence ); + return status; +} + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile_Async + * + * CHAMELEON_zgram_Tile_Async - Non-blocking equivalent of CHAMELEON_zgram_Tile(). + * May return before the computation is finished. + * Allows for pipelining of operations at runtime. + * + ******************************************************************************* + * + * @param[in] sequence + * Identifies the sequence of function calls that this call belongs to + * (for completion checks and exception handling purposes). + * + * @param[out] request + * Identifies this function call (for exception handling purposes). + * + ******************************************************************************* + * + * @sa CHAMELEON_zgram + * @sa CHAMELEON_zgram_Tile + * @sa CHAMELEON_sgram_Tile_Async + * + */ +int CHAMELEON_zgram_Tile_Async( cham_uplo_t uplo, CHAM_desc_t *A, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zgram_Tile", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + if (sequence == NULL) { + chameleon_fatal_error("CHAMELEON_zgram_Tile", "NULL sequence"); + return CHAMELEON_ERR_UNALLOCATED; + } + if (request == NULL) { + chameleon_fatal_error("CHAMELEON_zgram_Tile", "NULL request"); + return CHAMELEON_ERR_UNALLOCATED; + } + /* Check sequence status */ + if (sequence->status == CHAMELEON_SUCCESS) { + request->status = CHAMELEON_SUCCESS; + } + else { + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_SEQUENCE_FLUSHED); + } + + /* Check descriptors for correctness */ + if (chameleon_desc_check(A) != CHAMELEON_SUCCESS) { + chameleon_error("CHAMELEON_zgram_Tile", "invalid descriptor"); + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); + } + /* Check input arguments */ + if (A->nb != A->mb) { + chameleon_error("CHAMELEON_zgram_Tile", "only square tiles supported"); + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); + } + /* Quick return */ + if (chameleon_min(A->m, A->n) == 0) { + return CHAMELEON_SUCCESS; + } + + chameleon_pzgram( uplo, A, sequence, request ); + + return CHAMELEON_SUCCESS; +} diff --git a/control/compute_z.h b/control/compute_z.h index e34ec4dfc7c4b37c72a6ad267c0ee09fec9da19c..196fde2047bf7ac40676bd383f63c8f9010405e3 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -115,7 +115,10 @@ void chameleon_pzungqr_param( int genD, int K, const libhqr_tree_t *qrtree, CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); - +/** + * Gram function prototypes + */ +void chameleon_pzgram( cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); /** * LAPACK/Tile Descriptor accesses diff --git a/coreblas/compute/CMakeLists.txt b/coreblas/compute/CMakeLists.txt index d44a83fd7a2480d0c5197f4b4d4477aebfa4be16..283659eed45ee323aed7c10ff5479c257c442dc9 100644 --- a/coreblas/compute/CMakeLists.txt +++ b/coreblas/compute/CMakeLists.txt @@ -44,6 +44,7 @@ set(ZSRC core_zgetrf.c core_zgetrf_incpiv.c core_zgetrf_nopiv.c + core_zgram.c core_zhe2ge.c core_zherfb.c core_zhemm.c diff --git a/coreblas/compute/core_zgessq.c b/coreblas/compute/core_zgessq.c index 71aa407bdebde121a438604a3d08a9145b802e2f..8b02bb10ae9a69b8e5742f176a26dd2ea9142d1f 100644 --- a/coreblas/compute/core_zgessq.c +++ b/coreblas/compute/core_zgessq.c @@ -174,6 +174,24 @@ int CORE_zgessq( cham_store_t storev, int M, int N, const CHAMELEON_Complex64_t *A, int LDA, double *sclssq ) { + int i; + int K; + + /* initialize pairs scale, sumsquare if not already done */ + if ( storev == ChamColumnwise ) { + K = N; + } else if ( storev == ChamRowwise ) { + K = M; + } else { + K = 1; + } + for (i=0; i<2*K; i+=2) { + if ( ( sclssq[i] == -1. ) && ( sclssq[i+1] == -1. ) ) { + sclssq[i] = 1.; + sclssq[i+1] = 0.; + } + } + if (storev == ChamColumnwise) { CORE_zgessq_col( M, N, A, LDA, sclssq ); } diff --git a/coreblas/compute/core_zgram.c b/coreblas/compute/core_zgram.c new file mode 100644 index 0000000000000000000000000000000000000000..56881ebde58a0514503caacaf74394eb0080c1e0 --- /dev/null +++ b/coreblas/compute/core_zgram.c @@ -0,0 +1,141 @@ +/** + * + * @file core_zgram.c + * + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon core_zgram CPU kernel + * + * @version 0.9.2 + * @author Mathieu Faverge + * @author Florent Pruvost + * @date 2019-04-10 + * @precisions normal z -> s d c z + * + */ +#include "coreblas.h" + +/** + ******************************************************************************* + * + * @ingroup CORE_CHAMELEON_Complex64_t + * + * CORE_zgram computes the Gram matrix factors of A inplace + * + * \f[ + * d_{i.}^2 = (1/n) \sum_j d_{ij}^2` and :math:`d_{..}^2 = (1/n^2) \sum_{i,j} d_{ij}^2 \\ + * A_{i,j} = -(1/2) (d_{ij}^2 - d_{i.}^2 - d_{.j}^2 + d_{..}^2) + * \f] + * + ******************************************************************************* + * + * @param[in] uplo + * Specifies the part of the matrix A to be computed. + * = ChamUpperLower: All the matrix A + * = ChamUpper: Upper triangular part + * = ChamLower: Lower triangular part + * + * @param[in] M + * The number of rows of the overall matrix. M >= 0. + * + * @param[in] N + * The number of columns of the overall matrix. N >= 0. + * + * @param[in] Mt + * The number of rows of the tile A. Mt >= 0. + * + * @param[in] Nt + * The number of columns of the tile A. Nt >= 0. + * + * @param[in] Di + * The 1-by-Mt tile containing the sum of squares by rows. + * + * @param[in] LDDI + * The leading dimension of the array Di. + * + * @param[in] Dj + * The 1-by-Nt tile containing the sum of squares by columns. + * + * @param[in] LDDJ + * The leading dimension of the array Dj. + * + * @param[in] D + * The sum of squares of all the matrix. + * + * @param[in,out] A + * On entry, the M-by-N tile A. + * On exit, updated by the application of L. + * + * @param[in] LDA + * The leading dimension of the array A. LDA >= max(1,M). + * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * @retval <0 if INFO = -k, the k-th argument had an illegal value + * + */ + +int CORE_zgram( cham_uplo_t uplo, + int M, int N, + int Mt, int Nt, + const double *Di, int LDDI, + const double *Dj, int LDDJ, + const double *D, + double *A, int LDA ) +{ + int i, j; + double coef = -0.5; + double di, dj, d; + + /* Check input arguments */ + if (uplo != ChamUpper && uplo != ChamLower && uplo != ChamUpperLower ) { + coreblas_error(1, "Illegal value of uplo"); + return -1; + } + if (M < 0) { + coreblas_error(2, "Illegal value of M"); + return -2; + } + if (N < 0) { + coreblas_error(3, "Illegal value of N"); + return -3; + } + if (Mt < 0) { + coreblas_error(4, "Illegal value of Mt"); + return -4; + } + if (Nt < 0) { + coreblas_error(5, "Illegal value of Nt"); + return -5; + } + if ((LDA < chameleon_max(1,Mt)) && (Mt > 0)) { + coreblas_error(12, "Illegal value of LDA"); + return -12; + } + + /* Quick return */ + if ((Mt == 0) || (Nt == 0)) + return CHAMELEON_SUCCESS; + + /* overall mean of squares */ + d = D[0]*D[0]/(M*N); + + for(j = 0; j < Nt; j++) { + /* mean of squares of the column */ + dj = Dj[j*LDDJ]*Dj[j*LDDJ]/M; + int mmin = ( uplo == ChamLower ) ? j : 0; + int mmax = ( uplo == ChamUpper) ? chameleon_min(j+1, Mt) : Mt; + for(i = mmin; i < mmax; i++) { + /* mean of squares of the row */ + di = Di[i*LDDI]*Di[i*LDDI]/N; + /* compute Gram factor */ + A[j*LDA+i] = coef*(A[j*LDA+i]*A[j*LDA+i] - di - dj + d); + } + } + + return CHAMELEON_SUCCESS; +} diff --git a/coreblas/compute/core_zplssq.c b/coreblas/compute/core_zplssq.c index 9549453dcd50df72fc2780eadc80d81ceeac9a83..4e8d8750d4dd58457cf72004e82860b3821d82a1 100644 --- a/coreblas/compute/core_zplssq.c +++ b/coreblas/compute/core_zplssq.c @@ -41,7 +41,7 @@ CORE_zplssq_col( int M, int N, for(j=0; j<N; j++) { for(i=0; i<M; i++) { - sumsq_update_2( 1., tmpScaleIn, tmpSumsqIn, tmpScaleOut, tmpSumsqOut ); + sumsq_update_2( tmpScaleIn, tmpSumsqIn, tmpScaleOut, tmpSumsqOut ); tmpScaleIn+=2; tmpSumsqIn+=2; } @@ -68,7 +68,7 @@ CORE_zplssq_row( int M, int N, tmpScaleOut = sclssqout; tmpSumsqOut = sclssqout+1; for(i=0; i<M; i++) { - sumsq_update_2( 1., tmpScaleIn, tmpSumsqIn, tmpScaleOut, tmpSumsqOut ); + sumsq_update_2( tmpScaleIn, tmpSumsqIn, tmpScaleOut, tmpSumsqOut ); tmpScaleIn+=2; tmpSumsqIn+=2; tmpScaleOut+=2; @@ -95,7 +95,7 @@ CORE_zplssq_elt( int M, int N, for(j=0; j<N; j++) { for(i=0; i<M; i++) { - sumsq_update_2( 1., tmpScaleIn, tmpSumsqIn, tmpScaleOut, tmpSumsqOut ); + sumsq_update_2( tmpScaleIn, tmpSumsqIn, tmpScaleOut, tmpSumsqOut ); tmpScaleIn+=2; tmpSumsqIn+=2; } @@ -147,6 +147,24 @@ int CORE_zplssq( cham_store_t storev, int M, int N, double *sclssqin, double *sclssqout ) { + int i; + int K; + + /* initialize pairs scale, sumsquare if not already done */ + if ( storev == ChamColumnwise ) { + K = N; + } else if ( storev == ChamRowwise ) { + K = M; + } else { + K = 1; + } + for (i=0; i<2*K; i+=2) { + if ( ( sclssqout[i] == -1. ) && ( sclssqout[i+1] == -1. ) ) { + sclssqout[i] = 1.; + sclssqout[i+1] = 0.; + } + } + if (storev == ChamColumnwise) { CORE_zplssq_col( M, N, sclssqin, sclssqout ); } @@ -173,7 +191,7 @@ int CORE_zplssq( cham_store_t storev, int M, int N, * @param[in,out] sclssq * The 2*N matrix on which to compute scl*sqrt(ssq) * On entry contains all couple (scl,ssq) in (sclssq[i],sclssq[i+1]) - * On exist returns sclssq[i]=sclssq[i]*sqrt(sclssq[i+1]) + * On exist returns scl*sqrt(ssq) stored in sclssq[2*i], i = 0, ..., N-1 * so that the result is stored in the first line. * ******************************************************************************* @@ -186,8 +204,8 @@ int CORE_zplssq2( int N, double *sclssq ) { int i; - for (i=0; i<N; i+=2) { - sclssq[i] *= sqrt ( sclssq[i+1] ); + for (i=0; i<2*N; i+=2) { + sclssq[i] = sclssq[i]*sqrt(sclssq[i+1]); } return CHAMELEON_SUCCESS; } diff --git a/coreblas/compute/core_zsyssq.c b/coreblas/compute/core_zsyssq.c index dfad10e3e6d90b61e737c6a283452c791c37bea6..0f194cfeb380390ee2aba5d19e8bbd77405aa560 100644 --- a/coreblas/compute/core_zsyssq.c +++ b/coreblas/compute/core_zsyssq.c @@ -244,6 +244,20 @@ int CORE_zsyssq( cham_store_t storev, cham_uplo_t uplo, int N, const CHAMELEON_Complex64_t *A, int LDA, double *sclssq ) { + int i; + int K = N; + + /* Initialize pairs scale, sumsquare if not already done */ + if ( storev == ChamEltwise ) { + K = 1; + } + for (i=0; i<2*K; i+=2) { + if ( ( sclssq[i] == -1. ) && ( sclssq[i+1] == -1. ) ) { + sclssq[i] = 1.; + sclssq[i+1] = 0.; + } + } + if ( uplo == ChamUpper ) { if ( storev == ChamEltwise ) { CORE_zsyssq_up_elt( N, A, LDA, sclssq ); diff --git a/coreblas/include/coreblas/coreblas_z.h b/coreblas/include/coreblas/coreblas_z.h index 2bd8c01067594c0c85ee2d43aeb6432d5b9ba031..ee9e3e0d88269259812b5b97f2c5f8a4c9d40972 100644 --- a/coreblas/include/coreblas/coreblas_z.h +++ b/coreblas/include/coreblas/coreblas_z.h @@ -360,4 +360,13 @@ int CORE_zunmqr(cham_side_t side, cham_trans_t trans, CHAMELEON_Complex64_t *C, int LDC, CHAMELEON_Complex64_t *WORK, int LDWORK); +/** + * Gram prototypes + */ +int CORE_zgram( cham_uplo_t uplo, int M, int N, int Mt, int Nt, + const double *Di, int LDDI, + const double *Dj, int LDDJ, + const double *D, + double *A, int LDA ); + #endif /* _coreblas_z_h_ */ diff --git a/coreblas/include/coreblas/sumsq_update.h b/coreblas/include/coreblas/sumsq_update.h index 1c94e85ba13d540c5b72ad3277937829b9375c5b..c5312e9e3a084d94ed589f37b5e8842cf208038b 100644 --- a/coreblas/include/coreblas/sumsq_update.h +++ b/coreblas/include/coreblas/sumsq_update.h @@ -79,7 +79,7 @@ sumsq_update( int nb, float *scale, float *sumsq, const float *value ) ******************************************************************************* * * @brief Update the couple (scale, sumsq) by adding another couple when - * computing the Froebnius norm. + * computing the Froebenius norm. * * The frobenius norm is equal to scale * sqrt( sumsq ), this method allows to * avoid overflow in the sum square computation. @@ -100,7 +100,7 @@ sumsq_update( int nb, float *scale, float *sumsq, const float *value ) *******************************************************************************/ static inline void #if defined(PRECISION_d) || defined(PRECISION_z) -sumsq_update_2( int nb, const double *scalein, const double *sumsqin, double *scaleout, double *sumsqout ) +sumsq_update_2( const double *scalein, const double *sumsqin, double *scaleout, double *sumsqout ) { if (*scaleout >= 0.) { if ( (*scaleout) < (*scalein) ) { @@ -114,7 +114,7 @@ sumsq_update_2( int nb, const double *scalein, const double *sumsqin, double *sc } } #elif defined(PRECISION_s) || defined(PRECISION_c) -sumsq_update_2( int nb, const float *scalein, const float *sumsqin, float *scaleout, float *sumsqout ) +sumsq_update_2( const float *scalein, const float *sumsqin, float *scaleout, float *sumsqout ) { if (*scaleout >= 0.) { if ( (*scaleout) < (*scalein) ) { diff --git a/include/chameleon/chameleon_z.h b/include/chameleon/chameleon_z.h index ce3070d1c1c989f7dea1a7446e49198b484a3a71..60b04f38b92e36752588a2d30f2d6eeeb91408ca 100644 --- a/include/chameleon/chameleon_z.h +++ b/include/chameleon/chameleon_z.h @@ -331,6 +331,13 @@ int CHAMELEON_zbuild(cham_uplo_t uplo, int M, int N, CHAMELEON_Complex64_t *A, i int CHAMELEON_zbuild_Tile(cham_uplo_t uplo, CHAM_desc_t *A, void *user_data, void* user_build_callback ); int CHAMELEON_zbuild_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, void *user_data, void* user_build_callback, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +/** + * Gram function prototypes + */ +int CHAMELEON_zgram( cham_uplo_t uplo, int N, CHAMELEON_Complex64_t *A, int LDA ); +int CHAMELEON_zgram_Tile( cham_uplo_t uplo, CHAM_desc_t *A ); +int CHAMELEON_zgram_Tile_Async( cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); + END_C_DECLS #endif /* _chameleon_z_h_ */ diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index 2d8ec988208595f60f34b9355fa6f192ba89db6b..10b43a5aa05c1eada9bd384d57882c701fdd153e 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -418,4 +418,15 @@ INSERT_TASK_zttmqr( const RUNTIME_option_t *options, A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 ); } +/** + * Gram prototype + */ +void INSERT_TASK_zgram( const RUNTIME_option_t *options, + cham_uplo_t uplo, + int m, int n, int mt, int nt, + const CHAM_desc_t *Di, int Dim, int Din, int lddi, + const CHAM_desc_t *Dj, int Djm, int Djn, int lddj, + const CHAM_desc_t *D, int Dm, int Dn, + CHAM_desc_t *A, int Am, int An, int lda); + #endif /* _chameleon_tasks_z_h_ */ diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index e184ca5362b0f2b96519b20d6211fbb0b17f6d19..879d3cee22e8efa6d7cb3d17b4f8a31a3e85adad 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -94,6 +94,10 @@ set(CODELETS_ZSRC # BUILD ################## codelets/codelet_zbuild.c + ################## + # GRAM + ################## + codelets/codelet_zgram.c ) set(CODELETS_SRC diff --git a/runtime/openmp/codelets/codelet_zgram.c b/runtime/openmp/codelets/codelet_zgram.c new file mode 100644 index 0000000000000000000000000000000000000000..1250573332538d53bf201c4b82fb7e57ec2aa1c3 --- /dev/null +++ b/runtime/openmp/codelets/codelet_zgram.c @@ -0,0 +1,43 @@ +/** + * + * @file openmp/codelet_zgram.c + * + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgram OpenMP codelet + * + * @version 0.9.2 + * @author Mathieu Faverge + * @author Florent Pruvost + * @date 2019-04-10 + * @precisions normal z -> s d c z + * + */ + +#include "chameleon_openmp.h" +#include "chameleon/tasks_z.h" + +void INSERT_TASK_zgram( const RUNTIME_option_t *options, + cham_uplo_t uplo, + int m, int n, int mt, int nt, + const CHAM_desc_t *Di, int Dim, int Din, int lddi, + const CHAM_desc_t *Dj, int Djm, int Djn, int lddj, + const CHAM_desc_t *D, int Dm, int Dn, + CHAM_desc_t *A, int Am, int An, int lda) +{ + double *ptrDi = RTBLKADDR(Di, double, Dim, Din); + double *ptrDj = RTBLKADDR(Dj, double, Djm, Djn); + double *ptrD = RTBLKADDR(D, double, Dm, Dn); + double *ptrA = RTBLKADDR(A, double, Am, An); + +#pragma omp task firstprivate(uplo, m, n, mt, nt, ptrDi, lddi, ptrDj, lddj, ptrD, ptrA, lda) depend(in:ptrDi[0], ptrDj[0], ptrD[0]) depend(inout:ptrA[0]) + CORE_zgram( uplo, + m, n, mt, nt, + ptrDi, lddi, + ptrDj, lddj, + ptrD, + ptrA, lda); +} diff --git a/runtime/openmp/codelets/codelet_zplssq.c b/runtime/openmp/codelets/codelet_zplssq.c index 61196102c2d1ba7646925bb4ff27c2e858e08dd2..ad59a3eb6955f386297753e19bc79a9a12f32bc0 100644 --- a/runtime/openmp/codelets/codelet_zplssq.c +++ b/runtime/openmp/codelets/codelet_zplssq.c @@ -39,7 +39,7 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N, const CHAM_desc_t *RESULT, int RESULTm, int RESULTn ) { - CHAMELEON_Complex64_t *res = RTBLKADDR(RESULT, double, RESULTm, RESULTn); + double *res = RTBLKADDR(RESULT, double, RESULTm, RESULTn); #pragma omp task firstprivate(N) depend(inout: res[0]) CORE_zplssq2(N, res); diff --git a/runtime/parsec/codelets/codelet_zgessq.c b/runtime/parsec/codelets/codelet_zgessq.c index 93b332cbf9117375286077075141c3f4a7f14560..4cd62d842671401caf7a0422df7b1ac1d3801947 100644 --- a/runtime/parsec/codelets/codelet_zgessq.c +++ b/runtime/parsec/codelets/codelet_zgessq.c @@ -55,6 +55,6 @@ void INSERT_TASK_zgessq( const RUNTIME_option_t *options, sizeof(int), &n, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, sizeof(int), &lda, VALUE, - PASSED_BY_REF, RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), INOUT | AFFINITY, + PASSED_BY_REF, RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), chameleon_parsec_get_arena_index( SCALESUMSQ ) | INOUT | AFFINITY, PARSEC_DTD_ARG_END ); } diff --git a/runtime/parsec/codelets/codelet_zgram.c b/runtime/parsec/codelets/codelet_zgram.c new file mode 100644 index 0000000000000000000000000000000000000000..8d0217cc2552e329b51be45577c71d9399170098 --- /dev/null +++ b/runtime/parsec/codelets/codelet_zgram.c @@ -0,0 +1,131 @@ +/** + * + * @file parsec/codelet_zgram.c + * + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgram PaRSEC codelet + * + * @version 0.9.2 + * @author Mathieu Faverge + * @author Florent Pruvost + * @date 2019-04-11 + * @precisions normal z -> c d s + * + */ +#include "chameleon_parsec.h" +#include "chameleon/tasks_z.h" +#include "coreblas/coreblas_z.h" + +/** + * + * @ingroup INSERT_TASK_Complex64_t + * + */ +static inline int +CORE_zgegram_parsec( parsec_execution_stream_t *context, + parsec_task_t *this_task ) +{ + cham_uplo_t uplo; + int m, n, mt, nt; + double *Di; + int lddi; + double *Dj; + int lddj; + double *D; + double *A; + int lda; + + parsec_dtd_unpack_args( + this_task, &uplo, &m, &n, &mt, &nt, &Di, &lddi, &Dj, &lddj, &D, &A, &lda ); + + CORE_zgram( uplo, + m, n, mt, nt, + Di, lddi, + Dj, lddj, + D, + A, lda); + + (void)context; + return PARSEC_HOOK_RETURN_DONE; +} + +static inline int +CORE_zsygram_parsec( parsec_execution_stream_t *context, + parsec_task_t *this_task ) +{ + cham_uplo_t uplo; + int m, n, mt, nt; + double *Di; + int lddi; + double *D; + double *A; + int lda; + + parsec_dtd_unpack_args( + this_task, &uplo, &m, &n, &mt, &nt, &Di, &lddi, &D, &A, &lda ); + + CORE_zgram( uplo, + m, n, mt, nt, + Di, lddi, + Di, lddi, + D, + A, lda); + + (void)context; + return PARSEC_HOOK_RETURN_DONE; +} + +void INSERT_TASK_zgram( const RUNTIME_option_t *options, + cham_uplo_t uplo, + int m, int n, int mt, int nt, + const CHAM_desc_t *Di, int Dim, int Din, int lddi, + const CHAM_desc_t *Dj, int Djm, int Djn, int lddj, + const CHAM_desc_t *D, int Dm, int Dn, + CHAM_desc_t *A, int Am, int An, int lda) +{ + parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + double *ptrDi, *ptrDj; + + /* + * Test if Di is Dj, when we are on the diagonal. + * This to avoid having the same data twice in inputs (not handled in parsec). + */ + ptrDi = (double *)(RTBLKADDR( Di, double, Dim, Din )); + ptrDj = (double *)(RTBLKADDR( Dj, double, Djm, Djn )); + if (ptrDi == ptrDj) { + parsec_dtd_taskpool_insert_task( + PARSEC_dtd_taskpool, CORE_zsygram_parsec, options->priority, "sygram", + sizeof(int), &uplo, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + sizeof(int), &mt, VALUE, + sizeof(int), &nt, VALUE, + PASSED_BY_REF, RTBLKADDR( Di, double, Dim, Din ), chameleon_parsec_get_arena_index( Di ) | INPUT, + sizeof(int), &lddi, VALUE, + PASSED_BY_REF, RTBLKADDR( D, double, Dm, Dn ), chameleon_parsec_get_arena_index( D ) | INPUT, + PASSED_BY_REF, RTBLKADDR( A, double, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT | AFFINITY, + sizeof(int), &lda, VALUE, + PARSEC_DTD_ARG_END ); + } else { + parsec_dtd_taskpool_insert_task( + PARSEC_dtd_taskpool, CORE_zgegram_parsec, options->priority, "gegram", + sizeof(int), &uplo, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + sizeof(int), &mt, VALUE, + sizeof(int), &nt, VALUE, + PASSED_BY_REF, RTBLKADDR( Di, double, Dim, Din ), chameleon_parsec_get_arena_index( Di ) | INPUT, + sizeof(int), &lddi, VALUE, + PASSED_BY_REF, RTBLKADDR( Dj, double, Djm, Djn ), chameleon_parsec_get_arena_index( Dj ) | INPUT, + sizeof(int), &lddj, VALUE, + PASSED_BY_REF, RTBLKADDR( D, double, Dm, Dn ), chameleon_parsec_get_arena_index( D ) | INPUT, + PASSED_BY_REF, RTBLKADDR( A, double, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT | AFFINITY, + sizeof(int), &lda, VALUE, + PARSEC_DTD_ARG_END ); + } + +} diff --git a/runtime/parsec/codelets/codelet_zplssq.c b/runtime/parsec/codelets/codelet_zplssq.c index 16f19a21fad3581ec4a170d7fae3a90b657d5350..96ef96dd5fbf51898396e9d79f7d64f32226b8bb 100644 --- a/runtime/parsec/codelets/codelet_zplssq.c +++ b/runtime/parsec/codelets/codelet_zplssq.c @@ -53,8 +53,8 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, sizeof(int), &storev, VALUE, sizeof(int), &M, VALUE, sizeof(int), &N, VALUE, - PASSED_BY_REF, RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), INPUT, - PASSED_BY_REF, RTBLKADDR( SCLSSQ, double, SCLSSQm, SCLSSQn ), INOUT | AFFINITY, + PASSED_BY_REF, RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), chameleon_parsec_get_arena_index( SCALESUMSQ) | INPUT, + PASSED_BY_REF, RTBLKADDR( SCLSSQ, double, SCLSSQm, SCLSSQn ), chameleon_parsec_get_arena_index( SCLSSQ) | INOUT | AFFINITY, PARSEC_DTD_ARG_END ); } @@ -82,6 +82,6 @@ void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N, parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zplssq2_parsec, options->priority, "plssq2", sizeof(int), &N, VALUE, - PASSED_BY_REF, RTBLKADDR( RESULT, double, RESULTm, RESULTn ), INOUT | AFFINITY, + PASSED_BY_REF, RTBLKADDR( RESULT, double, RESULTm, RESULTn ), chameleon_parsec_get_arena_index( RESULT) | INOUT | AFFINITY, PARSEC_DTD_ARG_END ); } diff --git a/runtime/parsec/codelets/codelet_zsyssq.c b/runtime/parsec/codelets/codelet_zsyssq.c index 8e86035ccbaafd400985c1ffea92b9e27643e588..d3797ca26c9966bbe8319e1cfd173a430688eaeb 100644 --- a/runtime/parsec/codelets/codelet_zsyssq.c +++ b/runtime/parsec/codelets/codelet_zsyssq.c @@ -55,6 +55,6 @@ void INSERT_TASK_zsyssq( const RUNTIME_option_t *options, sizeof(int), &n, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, sizeof(int), &lda, VALUE, - PASSED_BY_REF, RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), INOUT | AFFINITY, + PASSED_BY_REF, RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), chameleon_parsec_get_arena_index( SCALESUMSQ ) | INOUT | AFFINITY, PARSEC_DTD_ARG_END ); } diff --git a/runtime/quark/codelets/codelet_zgessq.c b/runtime/quark/codelets/codelet_zgessq.c index bf5396e4206dbd5f9389377ac11a16c45b4e24d7..a0f343091a76782f5853b87b73a84da415db1b46 100644 --- a/runtime/quark/codelets/codelet_zgessq.c +++ b/runtime/quark/codelets/codelet_zgessq.c @@ -41,13 +41,24 @@ void INSERT_TASK_zgessq( const RUNTIME_option_t *options, const CHAM_desc_t *A, int Am, int An, int lda, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { + int sizessq; + + if ( storev == ChamColumnwise ) { + sizessq = 2*n; + } else if ( storev == ChamRowwise ) { + sizessq = 2*m; + } else { + sizessq = 2; + } + quark_option_t *opt = (quark_option_t*)(options->schedopt); + DAG_CORE_GESSQ; QUARK_Insert_Task(opt->quark, CORE_zgessq_quark, (Quark_Task_Flags*)opt, sizeof(cham_store_t), &storev, VALUE, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, - sizeof(CHAMELEON_Complex64_t)*lda*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(CHAMELEON_Complex64_t)*m*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(int), &lda, VALUE, - sizeof(double)*2, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INOUT, + sizeof(double)*sizessq, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_zgram.c b/runtime/quark/codelets/codelet_zgram.c new file mode 100644 index 0000000000000000000000000000000000000000..f2c4228ec1fd7dfbe0535d9fac109e903c12ce0f --- /dev/null +++ b/runtime/quark/codelets/codelet_zgram.c @@ -0,0 +1,68 @@ +/** + * + * @file quark/codelet_zgram.c + * + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgram Quark codelet + * + * @version 0.9.2 + * @author Mathieu Faverge + * @author Florent Pruvost + * @date 2019-04-16 + * @precisions normal z -> c d s + * + */ +#include "chameleon_quark.h" +#include "chameleon/tasks_z.h" +#include "coreblas/coreblas_z.h" + +void CORE_zgram_quark(Quark *quark) +{ + cham_uplo_t uplo; + int m, n, mt, nt; + double *Di; + int lddi; + double *Dj; + int lddj; + double *D; + double *A; + int lda; + + quark_unpack_args_12(quark, uplo, m, n, mt, nt, Di, lddi, Dj, lddj, D, A, lda); + CORE_zgram( uplo, + m, n, mt, nt, + Di, lddi, + Dj, lddj, + D, + A, lda); +} + +void INSERT_TASK_zgram( const RUNTIME_option_t *options, + cham_uplo_t uplo, + int m, int n, int mt, int nt, + const CHAM_desc_t *Di, int Dim, int Din, int lddi, + const CHAM_desc_t *Dj, int Djm, int Djn, int lddj, + const CHAM_desc_t *D, int Dm, int Dn, + CHAM_desc_t *A, int Am, int An, int lda) +{ + quark_option_t *opt = (quark_option_t*)(options->schedopt); + DAG_CORE_GRAM; + QUARK_Insert_Task(opt->quark, CORE_zgram_quark, (Quark_Task_Flags*)opt, + sizeof(int), &uplo, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + sizeof(int), &mt, VALUE, + sizeof(int), &nt, VALUE, + sizeof(double)*lddi*mt, RTBLKADDR(Di, double, Dim, Din), INPUT, + sizeof(int), &lddi, VALUE, + sizeof(double)*lddj*nt, RTBLKADDR(Dj, double, Djm, Djn), INPUT, + sizeof(int), &lddj, VALUE, + sizeof(double)*2, RTBLKADDR(D, double, Dm, Dn), INPUT, + sizeof(double)*mt*nt, RTBLKADDR(A, double, Am, An), INOUT, + sizeof(int), &lda, VALUE, + 0); +} diff --git a/runtime/quark/codelets/codelet_zplssq.c b/runtime/quark/codelets/codelet_zplssq.c index 70810682c9a89cfba122782dbdff65149a17a09e..b340873af51c0e8fae9659def50170eecdb16f83 100644 --- a/runtime/quark/codelets/codelet_zplssq.c +++ b/runtime/quark/codelets/codelet_zplssq.c @@ -73,7 +73,7 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn, const CHAM_desc_t *SCLSSQ, int SCLSSQm, int SCLSSQn ) { - int sizein = M*N; + int sizein = 2*M*N; int sizeout; if ( storev == ChamColumnwise ) { @@ -85,6 +85,7 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, } quark_option_t *opt = (quark_option_t*)(options->schedopt); + DAG_CORE_PLSSQ; QUARK_Insert_Task(opt->quark, CORE_zplssq_quark, (Quark_Task_Flags*)opt, sizeof(int), &storev, VALUE, sizeof(int), &M, VALUE, @@ -109,6 +110,7 @@ void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N, const CHAM_desc_t *RESULT, int RESULTm, int RESULTn ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); + DAG_CORE_PLSSQ2; QUARK_Insert_Task(opt->quark, CORE_zplssq2_quark, (Quark_Task_Flags*)opt, sizeof(int), &N, VALUE, sizeof(double)*2*N, RTBLKADDR(RESULT, double, RESULTm, RESULTn), INOUT, diff --git a/runtime/quark/codelets/codelet_zsyssq.c b/runtime/quark/codelets/codelet_zsyssq.c index cf1f39442c015139d869ae1fc08a5e4a79bd83a9..913424820fd8e01995ef2d889ebd97ac791758cc 100644 --- a/runtime/quark/codelets/codelet_zsyssq.c +++ b/runtime/quark/codelets/codelet_zsyssq.c @@ -41,13 +41,22 @@ void INSERT_TASK_zsyssq( const RUNTIME_option_t *options, const CHAM_desc_t *A, int Am, int An, int lda, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { + int sizessq; + + if ( storev == ChamEltwise ) { + sizessq = 2; + } else { + sizessq = 2*n; + } + quark_option_t *opt = (quark_option_t*)(options->schedopt); + DAG_CORE_SYSSQ; QUARK_Insert_Task(opt->quark, CORE_zsyssq_quark, (Quark_Task_Flags*)opt, sizeof(cham_store_t), &storev, VALUE, sizeof(int), &uplo, VALUE, sizeof(int), &n, VALUE, - sizeof(CHAMELEON_Complex64_t)*lda*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(CHAMELEON_Complex64_t)*n*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(int), &lda, VALUE, - sizeof(double)*2, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INOUT, + sizeof(double)*sizessq, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_ztrssq.c b/runtime/quark/codelets/codelet_ztrssq.c index 92cac805f8dd2c791ccafaa1b0c88a6abb7a36fd..80bb9e78c3e41fff60f96fea3d9e9b768a25d65d 100644 --- a/runtime/quark/codelets/codelet_ztrssq.c +++ b/runtime/quark/codelets/codelet_ztrssq.c @@ -44,6 +44,7 @@ void INSERT_TASK_ztrssq( const RUNTIME_option_t *options, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); + DAG_CORE_TRSSQ; QUARK_Insert_Task(opt->quark, CORE_ztrssq_quark, (Quark_Task_Flags*)opt, sizeof(int), &uplo, VALUE, sizeof(int), &diag, VALUE, diff --git a/runtime/quark/include/core_blas_dag.h b/runtime/quark/include/core_blas_dag.h index 83bdbd531b4161656c30388adc08f4f1ba883e9c..acc7ca8a120651d38deece54c9e3ebae9b9a855f 100644 --- a/runtime/quark/include/core_blas_dag.h +++ b/runtime/quark/include/core_blas_dag.h @@ -90,4 +90,12 @@ #define DAG_CORE_TTMQR DAG_CORE_TPMQRT #define DAG_CORE_TTQRT DAG_CORE_TPQRT +#define DAG_CORE_GESSQ DAG_SET_PROPERTIES( "GESSQ" , "white" ) +#define DAG_CORE_HESSQ DAG_SET_PROPERTIES( "HESSQ" , "white" ) +#define DAG_CORE_SYSSQ DAG_SET_PROPERTIES( "SYSSQ" , "white" ) +#define DAG_CORE_TRSSQ DAG_SET_PROPERTIES( "TRSSQ" , "white" ) +#define DAG_CORE_PLSSQ DAG_SET_PROPERTIES( "PLSSQ" , "white" ) +#define DAG_CORE_PLSSQ2 DAG_SET_PROPERTIES( "PLSSQ2" , "white" ) +#define DAG_CORE_GRAM DAG_SET_PROPERTIES( "GRAM" , "orange" ) + #endif /* _core_blas_dag_h_ */ diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c index dddcb6994da86b889f7d484828d86a9d71cc9814..44d431b7dafc684188f869aa560e0dabdb1f311a 100644 --- a/runtime/starpu/codelets/codelet_zcallback.c +++ b/runtime/starpu/codelets/codelet_zcallback.c @@ -22,7 +22,7 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -CHAMELEON_CL_CB(dzasum, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) +CHAMELEON_CL_CB(dzasum, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) CHAMELEON_CL_CB(zaxpy, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[1]), 0, M) CHAMELEON_CL_CB(zgeadd, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) CHAMELEON_CL_CB(zlascal, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) @@ -34,6 +34,7 @@ CHAMELEON_CL_CB(zgessq, starpu_matrix_get_nx(task->handles[0]), starpu_ma CHAMELEON_CL_CB(zgetrf, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (2./3.)*M*N*K) CHAMELEON_CL_CB(zgetrf_incpiv, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (2./3.)*M*N*K) CHAMELEON_CL_CB(zgetrf_nopiv, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (2./3.)*M*N*K) +CHAMELEON_CL_CB(zgram, starpu_matrix_get_nx(task->handles[3]), starpu_matrix_get_ny(task->handles[3]), 0, M*N) CHAMELEON_CL_CB(zhe2ge, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, (1./2.0)*M*N) CHAMELEON_CL_CB(zherfb, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, 2. *M* M*M) #if defined(PRECISION_z) || defined(PRECISION_c) diff --git a/runtime/starpu/codelets/codelet_zgram.c b/runtime/starpu/codelets/codelet_zgram.c new file mode 100644 index 0000000000000000000000000000000000000000..56af2b407d6bef0b9d2834ff6e8632c497d991c4 --- /dev/null +++ b/runtime/starpu/codelets/codelet_zgram.c @@ -0,0 +1,92 @@ +/** + * + * @file starpu/codelet_zgram.c + * + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgram StarPU codelet + * + * @version 0.9.2 + * @author Mathieu Faverge + * @author Florent Pruvost + * @date 2019-04-16 + * @precisions normal z -> c d s + * + */ +#include "chameleon_starpu.h" +#include "runtime_codelet_z.h" + +#if !defined(CHAMELEON_SIMULATION) +static void cl_zgram_cpu_func(void *descr[], void *cl_arg) +{ + cham_uplo_t uplo; + int m, n, mt, nt; + double *Di; + int lddi; + double *Dj; + int lddj; + double *D; + double *A; + int lda; + + Di = (double *)STARPU_MATRIX_GET_PTR(descr[0]); + Dj = (double *)STARPU_MATRIX_GET_PTR(descr[1]); + D = (double *)STARPU_MATRIX_GET_PTR(descr[2]); + A = (double *)STARPU_MATRIX_GET_PTR(descr[3]); + starpu_codelet_unpack_args(cl_arg, &uplo, &m, &n, &mt, &nt, &lddi, &lddj, &lda); + CORE_zgram( uplo, + m, n, mt, nt, + Di, lddi, + Dj, lddj, + D, + A, lda); +} +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +CODELETS_CPU(zgram, 4, cl_zgram_cpu_func) + +void INSERT_TASK_zgram( const RUNTIME_option_t *options, + cham_uplo_t uplo, + int m, int n, int mt, int nt, + const CHAM_desc_t *Di, int Dim, int Din, int lddi, + const CHAM_desc_t *Dj, int Djm, int Djn, int lddj, + const CHAM_desc_t *D, int Dm, int Dn, + CHAM_desc_t *A, int Am, int An, int lda) +{ + struct starpu_codelet *codelet = &cl_zgram; + void (*callback)(void*) = options->profiling ? cl_zgram_callback : NULL; + + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_R(Di, Dim, Din); + CHAMELEON_ACCESS_R(Dj, Djm, Djn); + CHAMELEON_ACCESS_R(D, Dm, Dn); + CHAMELEON_ACCESS_RW(A, Am, An); + CHAMELEON_END_ACCESS_DECLARATION; + + starpu_insert_task( + starpu_mpi_codelet(codelet), + STARPU_VALUE, &uplo, sizeof(int), + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_VALUE, &mt, sizeof(int), + STARPU_VALUE, &nt, sizeof(int), + STARPU_R, RTBLKADDR(Di, double, Dim, Din), + STARPU_VALUE, &lddi, sizeof(int), + STARPU_R, RTBLKADDR(Dj, double, Djm, Djn), + STARPU_VALUE, &lddj, sizeof(int), + STARPU_R, RTBLKADDR(D, double, Dm, Dn), + STARPU_RW, RTBLKADDR(A, double, Am, An), + STARPU_VALUE, &lda, sizeof(int), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "zgram", +#endif + 0); +} diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h index 4936d329c834cccb13f33c65df9e367695040728..c92f3a6583c94379f330ea9d6bad365395df2390 100644 --- a/runtime/starpu/include/runtime_codelet_z.h +++ b/runtime/starpu/include/runtime_codelet_z.h @@ -112,6 +112,7 @@ CODELETS_HEADER(dzasum) */ CODELETS_HEADER(zplrnt) CODELETS_HEADER(zbuild) +CODELETS_HEADER(zgram) #if defined(PRECISION_z) || defined(PRECISION_c) CODELETS_HEADER(zhessq) diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt index 40b853364f44b0f3b1e8ba56c4024bfffe291714..4ffbae9eabf6633234d2535c5a6f8e01d717dbc6 100644 --- a/testing/CMakeLists.txt +++ b/testing/CMakeLists.txt @@ -94,6 +94,7 @@ set(ZSRC #testing_zhegv.c #testing_zhegvd.c testing_zgeqrf_qdwh.c + testing_dgram.c ) # Add include and link directories diff --git a/testing/CTestLists.cmake b/testing/CTestLists.cmake index 4fc8f3cf22991b7d6f6a705a4f350d2a6a855f51..a5cbc1710b27393d7ff3d7b8e691b7c8a8a48429 100644 --- a/testing/CTestLists.cmake +++ b/testing/CTestLists.cmake @@ -66,3 +66,11 @@ endforeach() # add_test(test_mpi_${prec}lange mpirun -np 4 ./${prec}testing 1 0 LANGE 600 500 600 --p=2) # endforeach() #endif() + +# Specific algorithms +# Gram +foreach(cat ${TEST_CATEGORIES}) + foreach(prec s;d) + add_test(test_${cat}_${prec}gram ./${prec}${TEST_CMD_${cat}} GRAM 117 213 ) + endforeach() +endforeach() \ No newline at end of file diff --git a/testing/testing_dgram.c b/testing/testing_dgram.c new file mode 100644 index 0000000000000000000000000000000000000000..c9ca38a3f7a3475e257f41d2a0c8343083efabcb --- /dev/null +++ b/testing/testing_dgram.c @@ -0,0 +1,261 @@ +/** + * + * @file testing_dgram.c + * + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon dgram testing + * + * @version 0.9.2 + * @author Mathieu Faverge + * @author Florent Pruvost + * @date 2019-04-12 + * @precisions normal d -> d s + * + */ +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <math.h> + +#include <chameleon.h> +#include <coreblas/cblas.h> +#include <coreblas/lapacke.h> +#include <coreblas.h> +#include "testing_zauxiliary.h" +#if defined(CHAMELEON_USE_MPI) +#include <mpi.h> +#endif + +static int check_solution(cham_uplo_t uplo, + int N, + double *Aref, + double *Acham, int LDA); +static int compute_gram_sequential(cham_uplo_t uplo, + int N, + double *A, + int LDA); + +int testing_dgram(int argc, char **argv) +{ + int hres = 0; + /* Check for number of arguments */ + if ( argc < 2) { + USAGE("GRAM", "N LDA", + " - N : number of rows of matrix A\n" + " - LDA : leading dimension of matrix A\n"); + return -1; + } + int N = atoi(argv[0]); + int LDA = atoi(argv[1]); + + double eps; + int info_solution; + int i, j, ua; + int LDAxN = LDA*N; + + double *A = (double *)malloc(LDAxN*sizeof(double)); + double *Aref = (double *)malloc(LDAxN*sizeof(double)); + double *Acham = (double *)malloc(LDAxN*sizeof(double)); + + /* Check if unable to allocate memory */ + if ( (!A) || (!Aref) || (!Acham) ) + { + free(A); free(Aref); free(Acham); + printf("Out of Memory \n "); + return -2; + } + + eps = LAPACKE_dlamch_work('e'); + + if (CHAMELEON_My_Mpi_Rank() == 0){ + printf("\n"); + printf("------ TESTS FOR CHAMELEON GRAM ROUTINE ------- \n"); + printf(" Size of the Matrix %d by %d\n", N, N); + printf("\n"); + printf(" The matrix A is randomly generated for each test.\n"); + printf("============\n"); + printf(" The relative machine precision (eps) is to be %e \n",eps); + printf(" Computational tests pass if scaled residuals are less than 10.\n"); + } + + /*---------------------------------------------------------- + * TESTING GRAM + */ + + /* Initialize A such that it is symmetric */ + CHAMELEON_dplgsy( (double)N, ChamUpperLower, N, A, LDA, 51 ); + /* Gram is meant to be used with A full of positive values only */ +#if defined(PRECISION_d) || defined(PRECISION_s) + for (i=0; i<N; i++) { + for (j=0; j<N; j++) { + if ( A[i+j*LDA] < 0. ) { + A[i+j*LDA] = -A[i+j*LDA]; + } + } + } +#endif + + for (ua=0; ua<3; ua++) { + CHAMELEON_dlacpy( ChamUpperLower, N, N, A, LDA, Aref, LDA ); + CHAMELEON_dlacpy( ChamUpperLower, N, N, A, LDA, Acham, LDA ); + + /* CHAMELEON GRAM */ + CHAMELEON_dgram(uplo[ua], N, Acham, LDA); + + /* Check the solution */ + info_solution = check_solution(uplo[ua], N, Aref, Acham, LDA); + + if (CHAMELEON_My_Mpi_Rank() == 0){ + if (info_solution == 0) { + printf("***************************************************\n"); + printf(" ---- TESTING GRAM (%s) ............... PASSED !\n", uplostr[ua]); + printf("***************************************************\n"); + } + else { + printf("************************************************\n"); + printf(" - TESTING GRAM (%s) ... FAILED !\n", uplostr[ua]); hres++; + printf("************************************************\n"); + } + } + } + free(A); free(Aref); free(Acham); + + return hres; +} + +/*-------------------------------------------------------------- + * Check the solution + */ +static int check_solution(cham_uplo_t uplo, + int N, + double *Aref, + double *Acham, int LDA) +{ + int info_solution; + double Arefnorm, Rnorm, result; + double eps; + double mdone; + + double *work = (double *)malloc(N * sizeof(double)); + + mdone = -1.0; + + /* + * Compute the Gram matrix sequentially + * we consider the matrix on entry as symmetric + */ + compute_gram_sequential(uplo, N, Aref, LDA); + + /* Compute norm of Aref to scale the result norm */ + if (uplo == ChamUpperLower) { + Arefnorm = LAPACKE_dlange_work(LAPACK_COL_MAJOR, 'I', + N, N, Aref, LDA, work); + } else { + Arefnorm = LAPACKE_dlantr_work(LAPACK_COL_MAJOR, 'I', + chameleon_lapack_const(uplo), chameleon_lapack_const(ChamNonUnit), + N, N, Aref, LDA, work); + } + /* compute the difference Aref = Aref - Acham */ + cblas_daxpy(LDA*N, mdone, Acham, 1, Aref, 1); + + /* compute the norm of the difference */ + if (uplo == ChamUpperLower) { + Rnorm = LAPACKE_dlange_work(LAPACK_COL_MAJOR, 'I', + N, N, Aref, LDA, work); + } else { + Rnorm = LAPACKE_dlantr_work(LAPACK_COL_MAJOR, 'I', + chameleon_lapack_const(uplo), chameleon_lapack_const(ChamNonUnit), + N, N, Aref, LDA, work); + } + + eps = LAPACKE_dlamch_work('e'); + if (CHAMELEON_My_Mpi_Rank() == 0) + printf("Rnorm %e, Anorm %e\n", Rnorm, Arefnorm); + + /* scale the norm in respect to Aref */ + result = Rnorm / (Arefnorm * N * eps); + + if (CHAMELEON_My_Mpi_Rank() == 0){ + printf("============\n"); + printf("Checking the norm of the difference against reference GRAM \n"); + printf("-- ||Acham - Aref||_oo/((||Aref||_oo.N.eps) = %e \n", + result); + } + + if ( isnan(Rnorm) || isinf(Rnorm) || isnan(result) || isinf(result) || (result > 10.0) ) { + if (CHAMELEON_My_Mpi_Rank() == 0) + printf("-- The solution is suspicious ! \n"); + info_solution = 1; + } + else { + if (CHAMELEON_My_Mpi_Rank() == 0) + printf("-- The solution is CORRECT ! \n"); + info_solution= 0 ; + } + + free(work); + + return info_solution; +} + +/*-------------------------------------------------------------- + * Compute the Gram matrix sequentially + * We consider the matrix on entry as symmetric + */ +static int compute_gram_sequential(cham_uplo_t uplo, + int N, + double *A, + int LDA) +{ + int m, n; + double eps; + double squareij, mean_dij, mhalf; + + double *work = (double *)malloc(N * sizeof(double)); + + mhalf = -0.5; + + /* initialize work */ + memset(work, 0., N*sizeof(double)); + + /* first: compute the means of squares */ + for (n=0; n<N; n++) { + int mmin = ( uplo == ChamLower ) ? n : 0; + int mmax = ( uplo == ChamUpper ) ? chameleon_min(n+1, N) : N; + for (m = mmin; m < mmax; m++) { + squareij = A[m+n*LDA]*A[m+n*LDA]; + /* accumulate squares on columns */ + work[n] += squareij; + if ( m != n && uplo != ChamUpperLower ) { + /* accumulate squares on the symmetric part */ + work[m] += squareij; + } + } + } + mean_dij = 0.; + for (n=0; n<N; n++) { + /* accumulate the squares over the entire matrix */ + mean_dij += work[n]; + /* compute the mean on each column */ + work[n] /= N; + } + /* compute the global mean */ + mean_dij /= N*N; + /* second: compute the Gram matrix factors */ + for (n=0; n<N; n++) { + int mmin = ( uplo == ChamLower ) ? n : 0; + int mmax = ( uplo == ChamUpper ) ? chameleon_min(n+1, N) : N; + for (m = mmin; m < mmax; m++) { + squareij = A[m+n*LDA]*A[m+n*LDA]; + A[m+n*LDA] = mhalf*( squareij - work[m] - work[n] + mean_dij ); + } + } + + free(work); + + return 0; +} \ No newline at end of file diff --git a/testing/testing_zauxiliary.c b/testing/testing_zauxiliary.c index a880d6e7152042fd1d07be5f361befe7b99d4f71..525edc553c5e9247f8a899f5567540df52b0d07d 100644 --- a/testing/testing_zauxiliary.c +++ b/testing/testing_zauxiliary.c @@ -35,7 +35,7 @@ int ISEED[4] = {0,0,0,1}; /* initial seed for zlarnv() */ cham_storage_t format[6] = { ChamCM, ChamRM, ChamCCRB, ChamCRRB, ChamRCRB, ChamRRRB }; cham_side_t side[2] = { ChamLeft, ChamRight }; -cham_uplo_t uplo[2] = { ChamUpper, ChamLower }; +cham_uplo_t uplo[3] = { ChamUpper, ChamLower, ChamUpperLower }; cham_diag_t diag[2] = { ChamNonUnit, ChamUnit }; cham_trans_t trans[3] = { ChamNoTrans, ChamTrans, ChamConjTrans }; int itype[3] = { 1, 2, 3 }; @@ -44,7 +44,7 @@ cham_normtype_t norm[4] = { ChamMaxNorm, ChamOneNorm, ChamInfNorm, ChamFrobeni char *formatstr[6]= { "CM", "RM", "CCRB", "CRRB", "RCRB", "RRRB"}; char *sidestr[2] = { "Left ", "Right" }; -char *uplostr[2] = { "Upper", "Lower" }; +char *uplostr[3] = { "Upper", "Lower", "UpperLower" }; char *diagstr[2] = { "NonUnit", "Unit " }; char *transstr[3] = { "N", "T", "H" }; char *itypestr[3] = { "inv(U')xAxinv(U) or inv(L)xAxinv(L')", "UxAxU' or L'xAxL", "UxAxU' or L'xAxL" }; @@ -312,6 +312,14 @@ int main (int argc, char **argv) else if ( strcmp(func, "GEQRF_QDWH") == 0 ) { info += testing_zgeqrf_qdwh( argc, argv ); } + /* + * Gram Matrix + */ +#if defined(PRECISION_d) || defined(PRECISION_s) + else if ( strcmp(func, "GRAM") == 0 ) { + info += testing_zgram( argc, argv ); + } +#endif else { fprintf(stderr, "Function unknown\n"); } diff --git a/testing/testing_zauxiliary.h b/testing/testing_zauxiliary.h index 30b4bc004f989dfb5441be8483bc18e88d64264a..efc0fa2cc49ff89e47d234b37cd1bddf188e6537 100644 --- a/testing/testing_zauxiliary.h +++ b/testing/testing_zauxiliary.h @@ -49,7 +49,7 @@ extern int ISEED[4]; extern cham_storage_t format[6]; extern cham_trans_t trans[3]; -extern cham_uplo_t uplo[2]; +extern cham_uplo_t uplo[3]; extern cham_side_t side[2]; extern cham_diag_t diag[2]; extern int itype[3]; @@ -58,7 +58,7 @@ extern cham_normtype_t norm[4]; extern char *formatstr[6]; extern char *transstr[3]; -extern char *uplostr[2]; +extern char *uplostr[3]; extern char *sidestr[2]; extern char *diagstr[2]; extern char *itypestr[3]; @@ -115,4 +115,6 @@ int testing_zcgesv(int argc, char **argv); int testing_zcungesv(int argc, char **argv); #endif +int testing_zgram(int argc, char **argv); + #endif /* _testing_zauxiliary_h_ */