diff --git a/cmake_modules/local_subs.py b/cmake_modules/local_subs.py index 10a56bd0e1e3b0531944fbd926a71231cd887539..eb8cb418f2152666a28fcd62cfcff71cc76ac1c0 100644 --- a/cmake_modules/local_subs.py +++ b/cmake_modules/local_subs.py @@ -23,6 +23,8 @@ _extra_blas = [ ('', 'sgenm2', 'dgenm2', 'cgenm2', 'zgenm2' ), ('', 'slag2c_fake', 'dlag2z_fake', 'slag2c', 'dlag2z' ), ('', 'sgepdf', 'dgepdf', 'cgepdf', 'zgepdf' ), + ('', 'scesca', 'dcesca', 'ccesca', 'zcesca' ), + ('', 'sgesum', 'dgesum', 'cgesum', 'zgesum' ), ] _extra_BLAS = [ [ x.upper() for x in row ] for row in _extra_blas ] diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt index d4b97797a312ddfb6ca28c31ebde693afaba14cd..f7f3c27b8149bc441b861d94dba3dd1b6d6452b5 100644 --- a/compute/CMakeLists.txt +++ b/compute/CMakeLists.txt @@ -235,6 +235,11 @@ set(ZSRC zbuild.c pzbuild.c ################## + # Center-Scaled + ################## + zcesca.c + pzcesca.c + ################## # Gram ################## zgram.c diff --git a/compute/pzcesca.c b/compute/pzcesca.c new file mode 100644 index 0000000000000000000000000000000000000000..4b6776b7c0d1e69efa75a1037f28495be919ee4e --- /dev/null +++ b/compute/pzcesca.c @@ -0,0 +1,304 @@ +/** + * + * @file pzcesca.c + * + * @copyright 2012-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zcesca parallel algorithm + * + * @version 1.1.0 + * @author Florent Pruvost + * @date 2021-05-07 + * @precisions normal z -> s d c z + * + */ +#include "control/common.h" + +#define A( m, n ) A, (m), (n) +#define W( desc, m, n ) (desc), (m), (n) + +static inline void +chameleon_pzcesca_internal( int center, + int scale, + cham_store_t axis, + CHAM_desc_t *A, + CHAM_desc_t *Wgcol, + CHAM_desc_t *Wgrow, + CHAM_desc_t *Wgelt, + CHAM_desc_t *Wdcol, + CHAM_desc_t *Wdrow, + RUNTIME_option_t *options ) +{ + int m, n; + int MT = A->mt; + int NT = A->nt; + int M = A->m; + int N = A->n; + int P = A->p; + int Q = A->q; + + /** + * 1) compute sums and sum-square (scl,ssq) in each tile + */ + for(n = 0; n < NT; n++) { + int tempnn = ( n == (NT-1) ) ? N - n * A->nb : A->nb; + for(m = 0; m < MT; m++) { + int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb; + if ( (center == 1) && ( (axis == ChamColumnwise) || (axis == ChamEltwise) ) ) { + INSERT_TASK_zgesum( + options, ChamColumnwise, tempmm, tempnn, + A(m, n), W( Wgcol, m, n) ); + } + if ( (center == 1) && ( (axis == ChamRowwise) || (axis == ChamEltwise) ) ) { + INSERT_TASK_zgesum( + options, ChamRowwise, tempmm, tempnn, + A(m, n), W( Wgrow, m, n) ); + } + if ( (scale == 1) && (axis == ChamColumnwise) ) { + INSERT_TASK_dgessq( + options, ChamColumnwise, tempmm, tempnn, + A(m, n), W( Wdcol, m, n) ); + } + if ( (scale == 1) && (axis == ChamRowwise) ) { + INSERT_TASK_dgessq( + options, ChamRowwise, tempmm, tempnn, + A(m, n), W( Wdrow, m, n) ); + } + } + } + + for(n = 0; n < NT; n++) { + int tempnn = ( n == (NT-1) ) ? N - n * A->nb : A->nb; + + if ( (center == 1) && ( (axis == ChamColumnwise) || (axis == ChamEltwise) ) ) { + /** + * 2) reduce sums of columns between tiles per processus (between lines) + */ + for(m = P; m < MT; m++) { + INSERT_TASK_zgeadd( + options, ChamNoTrans, 1, tempnn, Wgcol->nb, + 1., W( Wgcol, m, n ), + 1., W( Wgcol, m%P, n ) ); + } + /** + * 3) reduce sums of columns between tiles of different processus on the first line of tiles + */ + for(m = 1; m < P; m++) { + INSERT_TASK_zgeadd( + options, ChamNoTrans, 1, tempnn, Wgcol->nb, + 1., W( Wgcol, m, n ), + 1., W( Wgcol, 0, n ) ); + } + if ( axis == ChamEltwise ) { + /* 4) reduce sums inside each tile of the first line of tiles for the global sum */ + INSERT_TASK_zgesum( + options, ChamEltwise, 1, tempnn, + W( Wgcol, 0, n ), + W( Wgelt, 0, n ) ); + } + } + + if ( (scale == 1) && (axis == ChamColumnwise) ) { + /** + * 2) reduce columns (scl,ssq) tiles per processus (between lines) + */ + for(m = P; m < MT; m++) { + INSERT_TASK_dplssq( + options, ChamColumnwise, 1, tempnn, + W( Wdcol, m, n ), + W( Wdcol, m%P, n ) ); + } + /** + * 3) reduce columns (scl,ssq) tiles on the first line of tiles + */ + for(m = 1; m < P; m++) { + INSERT_TASK_dplssq( + options, ChamColumnwise, 1, tempnn, + W( Wdcol, m, n ), + W( Wdcol, 0, n ) ); + } + /* 5) deduce the sum square for each column from the pairs (scl,ssq) -> sqrt(sum) = scl*sqrt(ssq) */ + INSERT_TASK_dplssq2( options, tempnn, W( Wdcol, 0, n ) ); + } + } + + for(m = 0; m < MT; m++) { + int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb; + + if ( (center == 1) && ( (axis == ChamRowwise) || (axis == ChamEltwise) ) ) { + /** + * 2) reduce sums of rows between tiles per processus (between columns) + */ + for(n = Q; n < NT; n++) { + INSERT_TASK_zgeadd( + options, ChamNoTrans, tempmm, 1, Wgrow->mb, + 1., W( Wgrow, m, n ), + 1., W( Wgrow, m, n%Q ) ); + } + /** + * 3) reduce sums of rows between tiles of different processus on the first columns of tiles + */ + for(n = 1; n < Q; n++) { + INSERT_TASK_zgeadd( + options, ChamNoTrans, tempmm, 1, Wgcol->nb, + 1., W( Wgrow, m, n ), + 1., W( Wgrow, m, 0 ) ); + } + } + + if ( (scale == 1) && (axis == ChamRowwise) ) { + /** + * 2) reduce rows (scl,ssq) tiles per processus (between lines) + */ + for(n = Q; n < NT; n++) { + INSERT_TASK_dplssq( + options, ChamRowwise, tempmm, 1, + W( Wdrow, m, n ), + W( Wdrow, m, n%Q ) ); + } + /** + * 3) reduce rows (scl,ssq) tiles on the first column of tiles + */ + for(n = 1; n < Q; n++) { + INSERT_TASK_dplssq( + options, ChamRowwise, tempmm, 1, + W( Wdrow, m, n ), + W( Wdrow, m, 0 ) ); + } + /* 5) deduce the sum square for each column from the pairs (scl,ssq) -> sqrt(sum) = scl*sqrt(ssq) */ + INSERT_TASK_dplssq2( options, tempmm, W( Wdrow, m, 0 ) ); + } + } + + if ( (center == 1) && (axis == ChamEltwise) ) { + /* 6) reduce global sum on each processus (between columns) */ + for(n = Q; n < NT; n++) { + INSERT_TASK_zgeadd( + options, ChamNoTrans, 1, 1, 1, + 1., W( Wgelt, 0, n), + 1., W( Wgelt, 0, n%Q) ); + } + + /* 7) reduce global sum on the first tile (index 0, 0) */ + for(n = 1; n < Q; n++) { + INSERT_TASK_zgeadd( + options, ChamNoTrans, 1, 1, 1, + 1., W( Wgelt, 0, n), + 1., W( Wgelt, 0, 0) ); + } + } + + /* Finally compute Centered-Scaled matrix coefficients inplace */ + for(n = 0; n < NT; n++) { + int tempnn = ( n == (NT-1) ) ? N - n * A->nb : A->nb; + + for(m = 0; m < MT; m++) { + int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb; + + INSERT_TASK_zcesca( + options, + center, scale, axis, + A->m, A->n, tempmm, tempnn, + W( Wgrow, m, 0 ), + W( Wgcol, 0, n ), + W( Wgelt, 0, 0 ), + W( Wdrow, m, 0 ), + W( Wdcol, 0, n ), + A( m, n ) ); + } + } +} + +/** + * + */ +void chameleon_pzcesca( struct chameleon_pzcesca_s *ws, int center, int scale, cham_store_t axis, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + RUNTIME_option_t options; + CHAM_desc_t *Wgcol = &(ws->Wgcol); + CHAM_desc_t *Wgrow = &(ws->Wgrow); + CHAM_desc_t *Wgelt = &(ws->Wgelt); + CHAM_desc_t *Wdcol = &(ws->Wdcol); + CHAM_desc_t *Wdrow = &(ws->Wdrow); + int m, n, tempmm, tempnn; + + chamctxt = chameleon_context_self(); + if ( sequence->status != CHAMELEON_SUCCESS ) { + return; + } + RUNTIME_options_init(&options, chamctxt, sequence, request); + + /* Initialize Wgcol */ + for(m = 0; m < Wgcol->mt; m++) { + tempmm = m == Wgcol->mt-1 ? Wgcol->m-m*Wgcol->mb : Wgcol->mb; + for(n = 0; n < Wgcol->nt; n++) { + tempnn = n == Wgcol->nt-1 ? Wgcol->n-n*Wgcol->nb : Wgcol->nb; + INSERT_TASK_dlaset( + &options, + ChamUpperLower, tempmm, tempnn, + 0., 0., + W( Wgcol, m, n ) ); + } + } + /* Initialize Wgrow */ + for(m = 0; m < Wgrow->mt; m++) { + tempmm = m == Wgrow->mt-1 ? Wgrow->m-m*Wgrow->mb : Wgrow->mb; + for(n = 0; n < Wgrow->nt; n++) { + tempnn = n == Wgrow->nt-1 ? Wgrow->n-n*Wgrow->nb : Wgrow->nb; + INSERT_TASK_dlaset( + &options, + ChamUpperLower, tempmm, tempnn, + 0., 0., + W( Wgrow, m, n ) ); + } + } + /* Initialize Wgelt */ + for(m = 0; m < Wgelt->mt; m++) { + tempmm = m == Wgelt->mt-1 ? Wgelt->m-m*Wgelt->mb : Wgelt->mb; + for(n = 0; n < Wgelt->nt; n++) { + tempnn = n == Wgelt->nt-1 ? Wgelt->n-n*Wgelt->nb : Wgelt->nb; + INSERT_TASK_dlaset( + &options, + ChamUpperLower, tempmm, tempnn, + 0., 0., + W( Wgelt, m, n ) ); + } + } + /* Initialize Wdcol */ + for(m = 0; m < Wdcol->mt; m++) { + tempmm = m == Wdcol->mt-1 ? Wdcol->m-m*Wdcol->mb : Wdcol->mb; + for(n = 0; n < Wdcol->nt; n++) { + tempnn = n == Wdcol->nt-1 ? Wdcol->n-n*Wdcol->nb : Wdcol->nb; + INSERT_TASK_dlaset( + &options, + ChamUpperLower, tempmm, tempnn, + -1., -1., + W( Wdcol, m, n ) ); + } + } + /* Initialize Wdrow */ + for(m = 0; m < Wdrow->mt; m++) { + tempmm = m == Wdrow->mt-1 ? Wdrow->m-m*Wdrow->mb : Wdrow->mb; + for(n = 0; n < Wdrow->nt; n++) { + tempnn = n == Wdrow->nt-1 ? Wdrow->n-n*Wdrow->nb : Wdrow->nb; + INSERT_TASK_dlaset( + &options, + ChamUpperLower, tempmm, tempnn, + -1., -1., + W( Wdrow, m, n ) ); + } + } + + chameleon_pzcesca_internal( center, scale, axis, A, Wgcol, Wgrow, Wgelt, Wdcol, Wdrow, &options ); + + CHAMELEON_Desc_Flush( Wgcol, sequence ); + CHAMELEON_Desc_Flush( Wgrow, sequence ); + CHAMELEON_Desc_Flush( Wgelt, sequence ); + CHAMELEON_Desc_Flush( Wdcol, sequence ); + CHAMELEON_Desc_Flush( Wdrow, sequence ); + RUNTIME_options_finalize(&options, chamctxt); +} diff --git a/compute/zcesca.c b/compute/zcesca.c new file mode 100644 index 0000000000000000000000000000000000000000..446951edcaaaac0931e1196e9c2467c2a0f05c27 --- /dev/null +++ b/compute/zcesca.c @@ -0,0 +1,457 @@ +/** + * + * @file zcesca.c + * + * @copyright 2012-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zcesca wrappers + * + * @version 1.1.0 + * @author Florent Pruvost + * @date 2021-05-07 + * @precisions normal z -> s d c z + * + */ +#include "control/common.h" +#include <stdlib.h> + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * CHAMELEON_zcesca_WS_Alloc - Allocate the required workspaces for asynchronous center + * + ******************************************************************************* + * + * @param[in] A + * The descriptor of the matrix A. + * + ******************************************************************************* + * + * @retval An allocated opaque pointer to use in CHAMELEON_zcesca_Tile_Async() + * and to free with CHAMELEON_zcesca_WS_Free(). + * + ******************************************************************************* + * + * @sa CHAMELEON_zcesca_Tile_Async + * @sa CHAMELEON_zcesca_WS_Free + * + */ +void *CHAMELEON_zcesca_WS_Alloc( const CHAM_desc_t *A ) +{ + CHAM_context_t *chamctxt; + struct chameleon_pzcesca_s *options; + int workmt, worknt; + + chamctxt = chameleon_context_self(); + if ( chamctxt == NULL ) { + return NULL; + } + + options = calloc( 1, sizeof(struct chameleon_pzcesca_s) ); + + workmt = chameleon_max( A->mt, A->p ); + worknt = chameleon_max( A->nt, A->q ); + + chameleon_desc_init( &(options->Wgcol), CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, 1, A->nb, A->nb, + workmt, A->n, 0, 0, + workmt, A->n, A->p, A->q, + NULL, NULL, NULL ); + + chameleon_desc_init( &(options->Wgrow), CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, A->mb, 1, A->mb, + A->m, worknt, 0, 0, + A->m, worknt, A->p, A->q, + NULL, NULL, NULL ); + + chameleon_desc_init( &(options->Wgelt), CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, 1, 1, 1, + 1, worknt, 0, 0, + 1, worknt, A->p, A->q, + NULL, NULL, NULL ); + + chameleon_desc_init( &(options->Wdcol), CHAMELEON_MAT_ALLOC_TILE, + ChamRealDouble, 2, A->nb, 2*A->nb, + 2*workmt, A->n, 0, 0, + 2*workmt, A->n, A->p, A->q, + NULL, NULL, NULL ); + + chameleon_desc_init( &(options->Wdrow), CHAMELEON_MAT_ALLOC_TILE, + ChamRealDouble, A->mb, 2, 2*A->mb, + A->m, 2*worknt, 0, 0, + A->m, 2*worknt, A->p, A->q, + NULL, NULL, NULL ); + + return (void*)options; +} + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * @brief Free the allocated workspaces for asynchronous center + * + ******************************************************************************* + * + * @param[in,out] user_ws + * On entry, the opaque pointer allocated by CHAMELEON_zcesca_WS_Alloc() + * On exit, all data are freed. + * + ******************************************************************************* + * + * @sa CHAMELEON_zcesca_Tile_Async + * @sa CHAMELEON_zcesca_WS_Alloc + * + */ +void CHAMELEON_zcesca_WS_Free( void *user_ws ) +{ + struct chameleon_pzcesca_s *ws = (struct chameleon_pzcesca_s*)user_ws; + + chameleon_desc_destroy( &(ws->Wgcol) ); + chameleon_desc_destroy( &(ws->Wgrow) ); + chameleon_desc_destroy( &(ws->Wgelt) ); + chameleon_desc_destroy( &(ws->Wdcol) ); + chameleon_desc_destroy( &(ws->Wdrow) ); + free( ws ); +} + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * CHAMELEON_zcesca replace a general matrix by the Centered-Scaled matrix inplace + * + * Considering a matrix A of size m x n, \f[A = (a_{i,j})_{1 \leq i \leq m, 1 \leq j \leq n}\f] + * Lets + * \f[g_i = \frac{1}{n} \sum_j a_{ij} \\ + * g_j = \frac{1}{m} \sum_i a_{ij} \\ + * g = \frac{1}{mn} \sum_{i,j} a_{ij}\f] + * A centered rowwise gives \f[\bar{A} = (\bar{a}_{i,j})_{1 \leq i \leq m, 1 \leq j \leq n}\f] such that + * \f[ \bar{a}_{i,j} = a_{i,j} - g_i \f] + * A centered columnwise gives \f[\bar{A} = (\bar{a}_{i,j})_{1 \leq i \leq m, 1 \leq j \leq n}\f] such that + * \f[ \bar{a}_{i,j} = a_{i,j} - g_j \f] + * A bicentered gives \f[\bar{A} = (\bar{a}_{i,j})_{1 \leq i \leq m, 1 \leq j \leq n}\f] such that + * \f[ \bar{a}_{i,j} = a_{i,j} - g_i - g_j + g \f] + * Lets + * \f[d_i = || a_{i*} || = \sqrt{ \sum_j a_{ij}²} \\ + * d_j = || a_{*j} || = \sqrt{ \sum_i a_{ij}²} \f] + * A scaled rowwise gives \f[A' = (a_{i,j}')_{1 \leq i \leq m, 1 \leq j \leq n}\f] such that + * \f[ a_{i*}' = \frac{a_{i*}}{d_i} \f] + * A scaled columnwise gives \f[A' = (a_{i,j}')_{1 \leq i \leq m, 1 \leq j \leq n}\f] such that + * \f[ a_{*j}' = \frac{a_{*j}}{d_j} \f] + * + ******************************************************************************* + * + * @param[in] center + * 1 if A must be centered, else 0. + * + * @param[in] scale + * 1 if A must be scaled, else 0. + * + * @param[in] axis + * Specifies the axis over which to center and or scale. + * = ChamColumnwise: centered column-wise + * = ChamRowwise: centered row-wise + * = ChamEltwise: bi-centered (only compatible if center=1 and scale=0) + * + * @param[in] M + * The number of rows of the overall matrix. M >= 0. + * + * @param[in] N + * The number of columns of the overall matrix. N >= 0. + * + * @param[in] A + * The M-by-N matrix A. + * + * @param[in] LDA + * The leading dimension of the array A. LDA >= max(1,M). + * + ******************************************************************************* + * +* @retval CHAMELEON_SUCCESS successful exit + * + ******************************************************************************* + * + * @sa CHAMELEON_zcesca_Tile + * @sa CHAMELEON_zcesca_Tile_Async + * @sa CHAMELEON_scesca + * + */ +int CHAMELEON_zcesca(int center, int scale, cham_store_t axis, int M, int N, CHAMELEON_Complex64_t *A, int LDA ) +{ + int NB; + int status; + CHAM_context_t *chamctxt; + RUNTIME_sequence_t *sequence = NULL; + RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + CHAM_desc_t descAl, descAt; + void *ws; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zcesca", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + /* Check input arguments */ + if ( (center != 0) && (center != 1) ) { + chameleon_error("CHAMELEON_zcesca", "Illegal value of center"); + return -1; + } + if ( (scale != 0) && (scale != 1) ) { + chameleon_error("CHAMELEON_zcesca", "Illegal value of scale"); + return -2; + } + if ( (axis != ChamColumnwise) && (axis != ChamRowwise) && (axis != ChamEltwise) ) { + chameleon_error("CHAMELEON_zcesca", "Illegal value of axis"); + return -3; + } + if ( (axis == ChamEltwise) && (center == 1) && (scale == 1) ) { + chameleon_error("CHAMELEON_zcesca", "Illegal value of axis and/or scale, center=1 and axis=ChamEltwise (i.e. bi-centered) must not be used with scale=1"); + return -3; + } + if (M < 0) { + chameleon_error("CHAMELEON_zcesca", "Illegal value of M"); + return -4; + } + if (N < 0) { + chameleon_error("CHAMELEON_zcesca", "Illegal value of N"); + return -5; + } + if (LDA < chameleon_max(1, M)) { + chameleon_error("CHAMELEON_zcesca", "illegal value of LDA"); + return -7; + } + + /* Quick return */ + if (N == 0) + return CHAMELEON_SUCCESS; + + /* Tune NB depending on M, N & NRHS; Set NBNB */ + status = chameleon_tune(CHAMELEON_FUNC_DGEMM, M, N, 0); + if (status != CHAMELEON_SUCCESS) { + chameleon_error("CHAMELEON_zcesca", "chameleon_tune() failed"); + return status; + } + + /* Set NT */ + NB = CHAMELEON_NB; + + chameleon_sequence_create( chamctxt, &sequence ); + + /* Submit the matrix conversion */ + chameleon_zlap2tile( chamctxt, &descAl, &descAt, ChamDescInout, ChamUpperLower, + A, NB, NB, LDA, N, N, N, sequence, &request ); + + /* Call the tile interface */ + ws = CHAMELEON_zcesca_WS_Alloc( &descAt ); + CHAMELEON_zcesca_Tile_Async( center, scale, axis, &descAt, ws, sequence, &request ); + + /* Submit the matrix conversion back */ + chameleon_ztile2lap( chamctxt, &descAl, &descAt, + ChamDescInout, ChamUpperLower, sequence, &request ); + + chameleon_sequence_wait( chamctxt, sequence ); + + /* Cleanup the temporary data */ + CHAMELEON_zcesca_WS_Free( ws ); + chameleon_ztile2lap_cleanup( chamctxt, &descAl, &descAt ); + + status = sequence->status; + chameleon_sequence_destroy( chamctxt, sequence ); + return status; +} + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile + * + * CHAMELEON_zcesca_Tile - Tile equivalent of CHAMELEON_zcesca(). + * Operates on matrices stored by tiles. + * All matrices are passed through descriptors. + * All dimensions are taken from the descriptors. + * + ******************************************************************************* + * + * @param[in] center + * 1 if A must be centered, else 0. + * + * @param[in] scale + * 1 if A must be scaled, else 0. + * + * @param[in] axis + * Specifies the axis over which to center and or scale. + * = ChamColumnwise: centered column-wise + * = ChamRowwise: centered row-wise + * = ChamEltwise: bi-centered (only compatible if center=1 and scale=0) + * + * @param[in] A + * The M-by-N matrix A. + * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * + ******************************************************************************* + * + * @sa CHAMELEON_zcesca + * @sa CHAMELEON_zcesca_Tile_Async + * @sa CHAMELEON_scesca_Tile + * + */ +int CHAMELEON_zcesca_Tile( int center, int scale, cham_store_t axis, CHAM_desc_t *A ) +{ + CHAM_context_t *chamctxt; + RUNTIME_sequence_t *sequence = NULL; + RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + int status; + void *ws; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zcesca_Tile", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + + /* Check input arguments */ + if ( (center != 0) && (center != 1) ) { + chameleon_error("CHAMELEON_zcesca_Tile", "Illegal value of center"); + return -1; + } + if ( (scale != 0) && (scale != 1) ) { + chameleon_error("CHAMELEON_zcesca_Tile", "Illegal value of scale"); + return -2; + } + if ( (axis != ChamColumnwise) && (axis != ChamRowwise) && (axis != ChamEltwise) ) { + chameleon_error("CHAMELEON_zcesca_Tile", "Illegal value of axis"); + return -3; + } + if ( (axis == ChamEltwise) && (center == 1) && (scale == 1) ) { + chameleon_error("CHAMELEON_zcesca_Tile", "Illegal value of axis and/or scale, center=1 and axis=ChamEltwise (i.e. bi-centered) must not be used with scale=1"); + return -3; + } + + chameleon_sequence_create( chamctxt, &sequence ); + + ws = CHAMELEON_zcesca_WS_Alloc( A ); + CHAMELEON_zcesca_Tile_Async( center, scale, axis, A, ws, sequence, &request ); + + CHAMELEON_Desc_Flush( A, sequence ); + + chameleon_sequence_wait( chamctxt, sequence ); + + CHAMELEON_zcesca_WS_Free( ws ); + + status = sequence->status; + chameleon_sequence_destroy( chamctxt, sequence ); + return status; +} + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile_Async + * + * CHAMELEON_zcesca_Tile_Async - Non-blocking equivalent of CHAMELEON_zcesca_Tile(). + * May return before the computation is finished. + * Allows for pipelining of operations at runtime. + * + ******************************************************************************* + * + * @param[in] sequence + * Identifies the sequence of function calls that this call belongs to + * (for completion checks and exception handling purposes). + * + * @param[out] request + * Identifies this function call (for exception handling purposes). + * + ******************************************************************************* + * + * @sa CHAMELEON_zcesca + * @sa CHAMELEON_zcesca_Tile + * @sa CHAMELEON_scesca_Tile_Async + * + */ +int CHAMELEON_zcesca_Tile_Async( int center, int scale, cham_store_t axis, CHAM_desc_t *A, void *user_ws, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + struct chameleon_pzcesca_s *ws; + + /* Check input arguments */ + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zcesca_Tile_Async", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + if ( (center != 0) && (center != 1) ) { + chameleon_error("CHAMELEON_zcesca_Tile_Async", "Illegal value of center"); + return -1; + } + if ( (scale != 0) && (scale != 1) ) { + chameleon_error("CHAMELEON_zcesca_Tile_Async", "Illegal value of scale"); + return -2; + } + if ( (axis != ChamColumnwise) && (axis != ChamRowwise) && (axis != ChamEltwise) ) { + chameleon_error("CHAMELEON_zcesca_Tile_Async", "Illegal value of axis"); + return -3; + } + if ( (axis == ChamEltwise) && (center == 1) && (scale == 1) ) { + chameleon_error("CHAMELEON_zcesca_Tile_Async", "Illegal value of axis and/or scale, center=1 and axis=ChamEltwise (i.e. bi-centered) must not be used with scale=1"); + return -3; + } + if (sequence == NULL) { + chameleon_fatal_error("CHAMELEON_zcesca_Tile_Async", "NULL sequence"); + return CHAMELEON_ERR_UNALLOCATED; + } + if (request == NULL) { + chameleon_fatal_error("CHAMELEON_zcesca_Tile_Async", "NULL request"); + return CHAMELEON_ERR_UNALLOCATED; + } + /* Check sequence status */ + if (sequence->status == CHAMELEON_SUCCESS) { + request->status = CHAMELEON_SUCCESS; + } + else { + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_SEQUENCE_FLUSHED); + } + + /* Check descriptors for correctness */ + if (chameleon_desc_check(A) != CHAMELEON_SUCCESS) { + chameleon_error("CHAMELEON_zcesca_Tile_Async", "invalid descriptor"); + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); + } + /* Check input arguments */ + if (A->nb != A->mb) { + chameleon_error("CHAMELEON_zcesca_Tile_Async", "only square tiles supported"); + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); + } + /* Quick return */ + if (chameleon_min(A->m, A->n) == 0) { + return CHAMELEON_SUCCESS; + } + + if ( user_ws == NULL ) { + ws = CHAMELEON_zcesca_WS_Alloc( A ); + } + else { + ws = user_ws; + } + + chameleon_pzcesca( ws, center, scale, axis, A, sequence, request ); + + if ( user_ws == NULL ) { + CHAMELEON_Desc_Flush( A, sequence ); + chameleon_sequence_wait( chamctxt, sequence ); + CHAMELEON_zcesca_WS_Free( ws ); + } + + return CHAMELEON_SUCCESS; +} diff --git a/compute/zgram.c b/compute/zgram.c index 73427cb6fded3bb2fb56f84d10c30e1d9988085c..1b9d02122f01a23c15ba80cd6d7351eba4884b1e 100644 --- a/compute/zgram.c +++ b/compute/zgram.c @@ -80,7 +80,7 @@ void *CHAMELEON_zgram_WS_Alloc( const CHAM_desc_t *A ) * * @ingroup CHAMELEON_Complex64_t * - * @brief Free the allocated workspaces for asynchronous gemm + * @brief Free the allocated workspaces for asynchronous gram * ******************************************************************************* * @@ -154,13 +154,20 @@ int CHAMELEON_zgram( cham_uplo_t uplo, int N, CHAMELEON_Complex64_t *A, int LDA return CHAMELEON_ERR_NOT_INITIALIZED; } /* Check input arguments */ + if ((uplo != ChamUpper) && + (uplo != ChamLower) && + (uplo != ChamUpperLower)) + { + chameleon_error("CHAMELEON_zgram", "illegal value of uplo"); + return -1; + } if (N < 0) { chameleon_error("CHAMELEON_zgram", "illegal value of N"); - return -1; + return -2; } if (LDA < chameleon_max(1, N)) { chameleon_error("CHAMELEON_zgram", "illegal value of LDA"); - return -3; + return -4; } /* Quick return */ @@ -241,11 +248,21 @@ int CHAMELEON_zgram_Tile( cham_uplo_t uplo, CHAM_desc_t *A ) chameleon_fatal_error("CHAMELEON_zgram_Tile", "CHAMELEON not initialized"); return CHAMELEON_ERR_NOT_INITIALIZED; } + /* Check input arguments */ + if ((uplo != ChamUpper) && + (uplo != ChamLower) && + (uplo != ChamUpperLower)) + { + chameleon_error("CHAMELEON_zgram_Tile", "illegal value of uplo"); + return -1; + } chameleon_sequence_create( chamctxt, &sequence ); ws = CHAMELEON_zgram_WS_Alloc( A ); CHAMELEON_zgram_Tile_Async( uplo, A, ws, sequence, &request ); + CHAMELEON_Desc_Flush( A, sequence ); + chameleon_sequence_wait( chamctxt, sequence ); CHAMELEON_zgram_WS_Free( ws ); @@ -288,15 +305,23 @@ int CHAMELEON_zgram_Tile_Async( cham_uplo_t uplo, CHAM_desc_t *A, void *user_ws, chamctxt = chameleon_context_self(); if (chamctxt == NULL) { - chameleon_fatal_error("CHAMELEON_zgram_Tile", "CHAMELEON not initialized"); + chameleon_fatal_error("CHAMELEON_zgram_Tile_Async", "CHAMELEON not initialized"); return CHAMELEON_ERR_NOT_INITIALIZED; } + /* Check input arguments */ + if ((uplo != ChamUpper) && + (uplo != ChamLower) && + (uplo != ChamUpperLower)) + { + chameleon_error("CHAMELEON_zgram_Tile_Async", "illegal value of uplo"); + return -1; + } if (sequence == NULL) { - chameleon_fatal_error("CHAMELEON_zgram_Tile", "NULL sequence"); + chameleon_fatal_error("CHAMELEON_zgram_Tile_Async", "NULL sequence"); return CHAMELEON_ERR_UNALLOCATED; } if (request == NULL) { - chameleon_fatal_error("CHAMELEON_zgram_Tile", "NULL request"); + chameleon_fatal_error("CHAMELEON_zgram_Tile_Async", "NULL request"); return CHAMELEON_ERR_UNALLOCATED; } /* Check sequence status */ diff --git a/control/compute_z.h b/control/compute_z.h index 53e39a1c10775a454b5c2983c006eeefd207b826..acc3c54458b4d0462be8adbe6f20404daa5cd643 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -34,6 +34,18 @@ struct chameleon_pzgemm_s { CHAM_desc_t WA; CHAM_desc_t WB; }; + +/** + * @brief Data structure to handle the Centering-Scaled workspaces + */ +struct chameleon_pzcesca_s { + CHAM_desc_t Wgcol; + CHAM_desc_t Wgrow; + CHAM_desc_t Wgelt; + CHAM_desc_t Wdcol; + CHAM_desc_t Wdrow; +}; + /** * @brief Data structure to handle the GRAM workspaces */ @@ -155,6 +167,10 @@ void chameleon_pztpqrt_param( int genD, cham_uplo_t uplo, int K, CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); +/** + * Centered-Scaled function prototypes + */ +void chameleon_pzcesca( struct chameleon_pzcesca_s *ws, int center, int scale, cham_store_t axis, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); /** * Gram function prototypes */ diff --git a/coreblas/compute/CMakeLists.txt b/coreblas/compute/CMakeLists.txt index 081a231b89a1cb26b45edcc7fb1eb36ebdb70458..15df8b4f91c6148997903d7f3297d484cbf6f12d 100644 --- a/coreblas/compute/CMakeLists.txt +++ b/coreblas/compute/CMakeLists.txt @@ -35,6 +35,7 @@ set(ZSRC core_dlag2z.c core_dzasum.c core_zaxpy.c + core_zcesca.c core_zgeadd.c core_zlascal.c core_zlatm1.c @@ -43,6 +44,7 @@ set(ZSRC core_zgemm.c core_zgeqrt.c core_zgesplit.c + core_zgesum.c core_zgessm.c core_zgessq.c core_zgetf2_nopiv.c diff --git a/coreblas/compute/core_zcesca.c b/coreblas/compute/core_zcesca.c new file mode 100644 index 0000000000000000000000000000000000000000..5d95d2eefb24b2004a2f61fabf6b6235a7973bb0 --- /dev/null +++ b/coreblas/compute/core_zcesca.c @@ -0,0 +1,233 @@ +/** + * + * @file core_zcesca.c + * + * @copyright 2012-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon core_zcesca CPU kernel + * + * @version 1.1.0 + * @author Florent Pruvost + * @date 2021-05-07 + * @precisions normal z -> c d s + * + */ +#include "coreblas.h" + +/** + ******************************************************************************* + * + * @ingroup CORE_CHAMELEON_Complex64_t + * + * CORE_zcesca computes the Centered-Scaled matrix factors of A inplace + * + * Considering a matrix A of size m x n, \f[A = (a_{i,j})_{1 \leq i \leq m, 1 \leq j \leq n}\f] + * Lets + * \f[g_i = \frac{1}{n} \sum_j a_{ij} \\ + * g_j = \frac{1}{m} \sum_i a_{ij} \\ + * g = \frac{1}{mn} \sum_{i,j} a_{ij}\f] + * A centered rowwise gives \f[\bar{A} = (\bar{a}_{i,j})_{1 \leq i \leq m, 1 \leq j \leq n}\f] such that + * \f[ \bar{a}_{i,j} = a_{i,j} - g_i \f] + * A centered columnwise gives \f[\bar{A} = (\bar{a}_{i,j})_{1 \leq i \leq m, 1 \leq j \leq n}\f] such that + * \f[ \bar{a}_{i,j} = a_{i,j} - g_j \f] + * A bicentered gives \f[\bar{A} = (\bar{a}_{i,j})_{1 \leq i \leq m, 1 \leq j \leq n}\f] such that + * \f[ \bar{a}_{i,j} = a_{i,j} - g_i - g_j + g \f] + * Lets + * \f[d_i = || a_{i*} || = \sqrt{ \sum_j a_{ij}²} \\ + * d_j = || a_{*j} || = \sqrt{ \sum_i a_{ij}²} \f] + * A scaled rowwise gives \f[A' = (a_{i,j}')_{1 \leq i \leq m, 1 \leq j \leq n}\f] such that + * \f[ a_{i*}' = \frac{a_{i*}}{d_i} \f] + * A scaled columnwise gives \f[A' = (a_{i,j}')_{1 \leq i \leq m, 1 \leq j \leq n}\f] such that + * \f[ a_{*j}' = \frac{a_{*j}}{d_j} \f] + * + ******************************************************************************* + * + * @param[in] center + * 1 if A must be centered, else 0. + * + * @param[in] scale + * 1 if A must be scaled, else 0. + * + * @param[in] axis + * Specifies the axis over which to center and or scale. + * = ChamColumnwise: centered column-wise + * = ChamRowwise: centered row-wise + * = ChamEltwise: bi-centered (only compatible if center=1 and scale=0) + * + * @param[in] M + * The number of rows of the overall matrix. M >= 0. + * + * @param[in] N + * The number of columns of the overall matrix. N >= 0. + * + * @param[in] Mt + * The number of rows of the tile A. Mt >= 0. + * + * @param[in] Nt + * The number of columns of the tile A. Nt >= 0. + * + * @param[in] Gi + * The 1-by-Mt tile containing the sum of values by rows. + * + * @param[in] LDGI + * The leading dimension of the array Gi. + * + * @param[in] Gj + * The 1-by-Nt tile containing the sum of values by columns. + * + * @param[in] LDGJ + * The leading dimension of the array Gj. + * + * @param[in] G + * The sum of values of all the matrix. + * + * @param[in] Di + * The 1-by-Mt tile containing the sum of squares by rows. + * + * @param[in] LDDI + * The leading dimension of the array Di. + * + * @param[in] Dj + * The 1-by-Nt tile containing the sum of squares by columns. + * + * @param[in] LDDJ + * The leading dimension of the array Dj. + * + * @param[in,out] A + * On entry, the M-by-N tile A. + * On exit, updated by the centered-scaled matrix coefficients. + * + * @param[in] LDA + * The leading dimension of the array A. LDA >= max(1,M). + * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * @retval <0 if INFO = -k, the k-th argument had an illegal value + * + */ + +int CORE_zcesca( int center, int scale, + cham_store_t axis, + int M, int N, + int Mt, int Nt, + const CHAMELEON_Complex64_t *Gi, int LDGI, + const CHAMELEON_Complex64_t *Gj, int LDGJ, + const CHAMELEON_Complex64_t *G, + const double *Di, int LDDI, + const double *Dj, int LDDJ, + CHAMELEON_Complex64_t *A, int LDA ) +{ + int i, j; + CHAMELEON_Complex64_t gi, gj, g; + double di, dj; + + /* Check input arguments */ + if ( (center != 0) && (center != 1) ) { + coreblas_error(1, "Illegal value of center"); + return -1; + } + if ( (scale != 0) && (scale != 1) ) { + coreblas_error(2, "Illegal value of scale"); + return -2; + } + if ( (axis != ChamColumnwise) && (axis != ChamRowwise) && (axis != ChamEltwise) ) { + coreblas_error(3, "Illegal value of axis"); + return -3; + } + if ( (axis == ChamEltwise) && (center == 1) && (scale == 1) ) { + coreblas_error(3, "Illegal value of axis and/or scale, center=1 and axis=ChamEltwise (i.e. bi-centered) must not be used with scale=1"); + return -3; + } + if (M < 0) { + coreblas_error(4, "Illegal value of M"); + return -4; + } + if (N < 0) { + coreblas_error(5, "Illegal value of N"); + return -5; + } + if (Mt < 0) { + coreblas_error(6, "Illegal value of Mt"); + return -6; + } + if (Nt < 0) { + coreblas_error(7, "Illegal value of Nt"); + return -7; + } + if (LDGI < 0) { + coreblas_error(9, "Illegal value of LDGI"); + return -9; + } + if (LDGJ < 0) { + coreblas_error(11, "Illegal value of LDGJ"); + return -11; + } + if (LDDI < 0) { + coreblas_error(14, "Illegal value of LDDI"); + return -14; + } + if (LDDJ < 0) { + coreblas_error(16, "Illegal value of LDDJ"); + return -16; + } + if ((LDA < chameleon_max(1,Mt)) && (Mt > 0)) { + coreblas_error(18, "Illegal value of LDA"); + return -18; + } + + /* Quick return */ + if ((center == 0) && (scale == 0)) { + return CHAMELEON_SUCCESS; + } + if ((Mt == 0) || (Nt == 0)) { + return CHAMELEON_SUCCESS; + } + + if ( (center == 1) && (axis == ChamEltwise) ) { + /* overall mean of values */ + g = G[0] / ( (double)M * (double)N ); + } + + for(j = 0; j < Nt; j++) { + if ( (center == 1) && ( (axis == ChamColumnwise) || (axis == ChamEltwise) ) ) { + /* mean of values of the column */ + gj = Gj[j*LDGJ] / ((double)M); + } + if ( (scale == 1) && (axis == ChamColumnwise) ) { + /* norm 2 of the column */ + dj = Dj[j*LDDJ]; + } + for(i = 0; i < Mt; i++) { + if ( (center == 1) && ( (axis == ChamRowwise) || (axis == ChamEltwise) ) ) { + /* mean of values of the row */ + gi = Gi[i] / ((double)N); + /* compute centered matrix factor */ + A[j*LDA+i] -= gi; + } + if ( (center == 1) && ( (axis == ChamColumnwise) || (axis == ChamEltwise) ) ) { + /* compute centered matrix factor */ + A[j*LDA+i] -= gj; + } + if ( (center == 1) && (axis == ChamEltwise) ) { + /* compute centered matrix factor */ + A[j*LDA+i] += g; + } + if ( (scale == 1) && (axis == ChamColumnwise) ) { + /* compute scaled matrix factor */ + A[j*LDA+i] /= dj; + } + if ( (scale == 1) && (axis == ChamRowwise) ) { + /* norm 2 of the row */ + di = Di[i]; + /* compute scaled matrix factor */ + A[j*LDA+i] /= di; + } + } + } + + return CHAMELEON_SUCCESS; +} diff --git a/coreblas/compute/core_zgesum.c b/coreblas/compute/core_zgesum.c new file mode 100644 index 0000000000000000000000000000000000000000..be8d2cf7eb92b4af064cc3d02a379e8c873734f4 --- /dev/null +++ b/coreblas/compute/core_zgesum.c @@ -0,0 +1,148 @@ +/** + * + * @file core_zgesum.c + * + * @copyright 2012-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon core_zgesum CPU kernel + * + * @version 1.1.0 + * @author Florent Pruvost + * @date 2021-05-07 + * @precisions normal z -> c d s + * + */ +#include <math.h> +#include "coreblas/lapacke.h" +#include "coreblas.h" + +/** + * @brief Subcase storev == ChamColumnwise of CORE_zgesum() + */ +static inline int +CORE_zgesum_col( int M, int N, + const CHAMELEON_Complex64_t *A, int LDA, + CHAMELEON_Complex64_t *sum ) +{ + int i, j; + CHAMELEON_Complex64_t *ptr = A; + for(j=0; j<N; j++) { + for(i=0; i<M; i++) { + sum[j] += *ptr; + ptr++; + } + ptr += LDA - M; + } + return CHAMELEON_SUCCESS; +} +/** + * @brief Subcase storev == ChamRowwise of CORE_zgesum() + */ +static inline int +CORE_zgesum_row( int M, int N, + const CHAMELEON_Complex64_t *A, int LDA, + CHAMELEON_Complex64_t *sum ) +{ + int i, j; + CHAMELEON_Complex64_t *ptr = A; + for(j=0; j<N; j++) { + for(i=0; i<M; i++) { + sum[i] += *ptr; + ptr++; + } + ptr += LDA - M; + } + return CHAMELEON_SUCCESS; +} +/** + * @brief Subcase storev == ChamEltwise of CORE_zgesum() + */ +static inline int +CORE_zgesum_elt( int M, int N, + const CHAMELEON_Complex64_t *A, int LDA, + CHAMELEON_Complex64_t *sum ) +{ + int i, j; + CHAMELEON_Complex64_t *ptr = A; + for(j=0; j<N; j++) { + for(i=0; i<M; i++) { + sum[0] += *ptr; + ptr++; + } + ptr += LDA - M; + } + return CHAMELEON_SUCCESS; +} + +/** + * + * @ingroup CORE_CHAMELEON_Complex64_t + * + * CORE_zgesum returns the sum of a matrix values over a chosen axis: + * column-wise, row-wise, element-wise. + * + ******************************************************************************* + * + * @param[in] storev + * Specifies whether the sums are made per column or row. + * = ChamColumnwise: Computes the sum of values on each column + * = ChamRowwise: Computes the sum of values on each row + * = ChamEltwise: Computes the sum of values on all the matrix + * + * @param[in] M + * The number of rows in the tile A. + * + * @param[in] N + * The number of columns in the tile A. + * + * @param[in] A + * The M-by-N matrix on which to compute the norm. + * + * @param[in] LDA + * The leading dimension of the tile A. LDA >= max(1,M). + * + * @param[out] sum resulting sums + * Dimension: (1,K) + * if storev == ChamColumnwise, K = N + * if storev == ChamRowwise, K = M + * if storev == ChamEltwise, K = 1 + * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * @retval -k, the k-th argument had an illegal value + * + */ +int CORE_zgesum( cham_store_t storev, int M, int N, + const CHAMELEON_Complex64_t *A, int LDA, + CHAMELEON_Complex64_t *sum ) +{ + int i; + int K; + + /* initialize pairs scale, sumsquare if not already done */ + if ( storev == ChamColumnwise ) { + K = N; + } else if ( storev == ChamRowwise ) { + K = M; + } else { + K = 1; + } + for (i=0; i<K; i+=1) { + sum[i] = 0.; + } + + if (storev == ChamColumnwise) { + CORE_zgesum_col( M, N, A, LDA, sum ); + } + else if (storev == ChamRowwise) { + CORE_zgesum_row( M, N, A, LDA, sum ); + } + else { + CORE_zgesum_elt( M, N, A, LDA, sum ); + } + return CHAMELEON_SUCCESS; +} diff --git a/coreblas/compute/core_zgram.c b/coreblas/compute/core_zgram.c index d40ba792b2ceb0979bb8a9a6868f5d3a4f31ac46..4d89735b33676b83f9346c0d0b5e0a39bbf1f333 100644 --- a/coreblas/compute/core_zgram.c +++ b/coreblas/compute/core_zgram.c @@ -67,7 +67,7 @@ * * @param[in,out] A * On entry, the M-by-N tile A. - * On exit, updated by the application of L. + * On exit, updated by the Gram matrix coefficients. * * @param[in] LDA * The leading dimension of the array A. LDA >= max(1,M). diff --git a/coreblas/compute/core_ztile.c b/coreblas/compute/core_ztile.c index 6c28b72d322c84a18b14cfbd387328d9c48652c7..31b1ee49599e53f619e206c164677c066499a18b 100644 --- a/coreblas/compute/core_ztile.c +++ b/coreblas/compute/core_ztile.c @@ -1007,6 +1007,39 @@ TCORE_zunmqr( cham_side_t side, side, trans, M, N, K, IB, CHAM_tile_get_ptr( V ), V->ld, CHAM_tile_get_ptr( T ), T->ld, CHAM_tile_get_ptr( C ), C->ld, WORK, LDWORK ); } +int +TCORE_zgesum( cham_store_t storev, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *sum ) +{ + assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + assert( sum->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return CORE_zgesum( storev, M, N, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( sum ) ); +} + +int +TCORE_zcesca( int center, + int scale, + cham_store_t axis, + int M, + int N, + int Mt, + int Nt, + const CHAM_tile_t *Gi, + const CHAM_tile_t *Gj, + const CHAM_tile_t *G, + const CHAM_tile_t *Di, + const CHAM_tile_t *Dj, + CHAM_tile_t * A ) +{ + assert( Gi->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + assert( Gj->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + assert( G->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + assert( Di->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + assert( Dj->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + return CORE_zcesca( + center, scale, axis, M, N, Mt, Nt, CHAM_tile_get_ptr( Gi ), Gi->ld, CHAM_tile_get_ptr( Gj ), Gj->ld, CHAM_tile_get_ptr( G ), CHAM_tile_get_ptr( Di ), Di->ld, CHAM_tile_get_ptr( Dj ), Dj->ld, CHAM_tile_get_ptr( A ), A->ld ); +} + int TCORE_zgram( cham_uplo_t uplo, int M, diff --git a/coreblas/include/coreblas/coreblas_z.h b/coreblas/include/coreblas/coreblas_z.h index 858cfe090a7548b85aeb0e3b3f8d35761f12080c..9637ba67b5d2bc56bd650346389bae1bb486d360 100644 --- a/coreblas/include/coreblas/coreblas_z.h +++ b/coreblas/include/coreblas/coreblas_z.h @@ -372,8 +372,21 @@ int CORE_zunmqr(cham_side_t side, cham_trans_t trans, CHAMELEON_Complex64_t *WORK, int LDWORK); /** - * Gram prototypes + * Centered-Scale and Gram prototypes */ +int CORE_zgesum( cham_store_t storev, int M, int N, + const CHAMELEON_Complex64_t *A, int LDA, + CHAMELEON_Complex64_t *sum ); +int CORE_zcesca( int center, int scale, + cham_store_t axis, + int M, int N, + int Mt, int Nt, + const CHAMELEON_Complex64_t *Gi, int LDGI, + const CHAMELEON_Complex64_t *Gj, int LDGJ, + const CHAMELEON_Complex64_t *G, + const double *Di, int LDDI, + const double *Dj, int LDDJ, + CHAMELEON_Complex64_t *A, int LDA ); int CORE_zgram( cham_uplo_t uplo, int M, int N, int Mt, int Nt, const double *Di, int LDDI, const double *Dj, int LDDJ, diff --git a/coreblas/include/coreblas/coreblas_ztile.h b/coreblas/include/coreblas/coreblas_ztile.h index cec8b2c4c84440490efcdfe3ab8d85c8e510e2a1..10d420339b4e39bbc34fae19d5cc349ed08b0f71 100644 --- a/coreblas/include/coreblas/coreblas_ztile.h +++ b/coreblas/include/coreblas/coreblas_ztile.h @@ -80,6 +80,8 @@ int TCORE_ztsmqr_hetra1( cham_side_t side, cham_trans_t trans, int m1, int n1, int TCORE_ztstrf( int M, int N, int IB, int NB, CHAM_tile_t *U, CHAM_tile_t *A, CHAM_tile_t *L, int *IPIV, CHAMELEON_Complex64_t *WORK, int LDWORK, int *INFO ); int TCORE_zunmlq( cham_side_t side, cham_trans_t trans, int M, int N, int IB, int K, const CHAM_tile_t *V, const CHAM_tile_t *T, CHAM_tile_t *C, CHAMELEON_Complex64_t *WORK, int LDWORK ); int TCORE_zunmqr( cham_side_t side, cham_trans_t trans, int M, int N, int K, int IB, const CHAM_tile_t *V, const CHAM_tile_t *T, CHAM_tile_t *C, CHAMELEON_Complex64_t *WORK, int LDWORK ); -int TCORE_zgram( cham_uplo_t uplo, int M, int N, int Mt, int Nt, const CHAM_tile_t *Di, const CHAM_tile_t *Dj, const CHAM_tile_t *D, CHAM_tile_t *A ); +int TCORE_zgesum( cham_store_t storev, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *sum ); +int TCORE_zcesca( int center, int scale, cham_store_t axis, int M, int N, int Mt, int Nt, const CHAM_tile_t *Gi, const CHAM_tile_t *Gj, const CHAM_tile_t *G, const CHAM_tile_t *Di, const CHAM_tile_t *Dj, CHAM_tile_t *A ); +int TCORE_zgram( cham_uplo_t uplo, int M, int N, int Mt, int Nt, const CHAM_tile_t *Di, const CHAM_tile_t *Dj, const CHAM_tile_t *D, CHAM_tile_t *A ); #endif /* _coreblas_ztile_h_ */ diff --git a/doc/orgmode/chapters/using.org b/doc/orgmode/chapters/using.org index f0cb97f5dc6265761c961cbbbefc44fa93b43ee0..74be7d6be8537a9d3f0cafb28f42c7f32c73d6e1 100644 --- a/doc/orgmode/chapters/using.org +++ b/doc/orgmode/chapters/using.org @@ -768,6 +768,11 @@ * *EVD/SVD* * gesvd: singular value decomposition * heevd: eigenvalues/eigenvectors computation with A Hermitian + * *Specific Matrix transformation for Data Analysis* + * cesca: centered-scaled matrix transformation, pretreatment algorithm + for Principal Component Analysis + * gram: Gram matrix transformation, pretreatment algorithm for + Multidimensional Scaling * *Extra routines* * *Norms* * lange: compute norm of a matrix (Max, One, Inf, Frobenius) diff --git a/include/chameleon/chameleon_z.h b/include/chameleon/chameleon_z.h index 63562bbab69fdf21eb765c19b3c7ccf48dda8c8b..119bb25bf7056030d850535f2a7069b134752d09 100644 --- a/include/chameleon/chameleon_z.h +++ b/include/chameleon/chameleon_z.h @@ -306,6 +306,8 @@ int CHAMELEON_zunmqr_param_Tile_Async(const libhqr_tree_t *qrtree, cham_side_t s */ void *CHAMELEON_zgemm_WS_Alloc( cham_trans_t transA, cham_trans_t transB, const CHAM_desc_t *A, const CHAM_desc_t *B, const CHAM_desc_t *C ); void CHAMELEON_zgemm_WS_Free( void *ws ); +void *CHAMELEON_zcesca_WS_Alloc( const CHAM_desc_t *A ); +void CHAMELEON_zcesca_WS_Free( void *ws ); void *CHAMELEON_zgram_WS_Alloc( const CHAM_desc_t *A ); void CHAMELEON_zgram_WS_Free( void *ws ); @@ -352,6 +354,12 @@ int CHAMELEON_zbuild(cham_uplo_t uplo, int M, int N, CHAMELEON_Complex64_t *A, i int CHAMELEON_zbuild_Tile(cham_uplo_t uplo, CHAM_desc_t *A, void *user_data, void* user_build_callback ); int CHAMELEON_zbuild_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, void *user_data, void* user_build_callback, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +/** + * Centered-Scaled function prototypes + */ +int CHAMELEON_zcesca(int center, int scale, cham_store_t axis, int M, int N, CHAMELEON_Complex64_t *A, int LDA ); +int CHAMELEON_zcesca_Tile( int center, int scale, cham_store_t axis, CHAM_desc_t *A ); +int CHAMELEON_zcesca_Tile_Async( int center, int scale, cham_store_t axis, CHAM_desc_t *A, void *user_ws, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); /** * Gram function prototypes */ diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index bc13dbd6471dc4c322c198aaf50d4306e58fdd4d..d89c00df4d5a0c1405584fe630a6ee846bb5939a 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -429,8 +429,21 @@ INSERT_TASK_zttmqr( const RUNTIME_option_t *options, } /** - * Gram prototype + * Centered-Scaled and Gram prototype */ +void INSERT_TASK_zgesum( const RUNTIME_option_t *options, + cham_store_t storev, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *SUMS, int SUMSm, int SUMSn ); +void INSERT_TASK_zcesca( const RUNTIME_option_t *options, + int center, int scale, cham_store_t axis, + int m, int n, int mt, int nt, + const CHAM_desc_t *Gi, int Gim, int Gin, + const CHAM_desc_t *Gj, int Gjm, int Gjn, + const CHAM_desc_t *G, int Gm, int Gn, + const CHAM_desc_t *Di, int Dim, int Din, + const CHAM_desc_t *Dj, int Djm, int Djn, + CHAM_desc_t *A, int Am, int An ); void INSERT_TASK_zgram( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int mt, int nt, diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index b3101b81657d82280fd10c234604dc1756aafff8..9ca5edd5c927544480fc82f3a75c78ddbdca4447 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -102,8 +102,10 @@ set(CODELETS_ZSRC ################## codelets/codelet_zbuild.c ################## - # GRAM + # CENTERED-SCALED and GRAM ################## + codelets/codelet_zgesum.c + codelets/codelet_zcesca.c codelets/codelet_zgram.c ) diff --git a/runtime/openmp/codelets/codelet_zcesca.c b/runtime/openmp/codelets/codelet_zcesca.c new file mode 100644 index 0000000000000000000000000000000000000000..e7724e52bc33f09aa956f1e9dd42fda4a55d2d38 --- /dev/null +++ b/runtime/openmp/codelets/codelet_zcesca.c @@ -0,0 +1,42 @@ +/** + * + * @file openmp/codelet_zcesca.c + * + * @copyright 2012-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zcesca OpenMP codelet + * + * @version 1.1.0 + * @author Florent Pruvost + * @date 2021-05-07 + * @precisions normal z -> s d c z + * + */ +#include "chameleon_openmp.h" +#include "chameleon/tasks_z.h" +#include "coreblas/coreblas_ztile.h" + +void INSERT_TASK_zcesca( const RUNTIME_option_t *options, + int center, int scale, cham_store_t axis, + int m, int n, int mt, int nt, + const CHAM_desc_t *Gi, int Gim, int Gin, + const CHAM_desc_t *Gj, int Gjm, int Gjn, + const CHAM_desc_t *G, int Gm, int Gn, + const CHAM_desc_t *Di, int Dim, int Din, + const CHAM_desc_t *Dj, int Djm, int Djn, + CHAM_desc_t *A, int Am, int An ) +{ + CHAM_tile_t *tileGi = Gi->get_blktile( Gi, Gim, Gin ); + CHAM_tile_t *tileGj = Gj->get_blktile( Gj, Gjm, Gjn ); + CHAM_tile_t *tileG = G->get_blktile( G, Gm, Gn ); + CHAM_tile_t *tileDi = Di->get_blktile( Di, Dim, Din ); + CHAM_tile_t *tileDj = Dj->get_blktile( Dj, Djm, Djn ); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + +#pragma omp task firstprivate( center, scale, axis, m, n, mt, nt, tileGi, tileGj, tileG, tileDi, tileDj, tileA ) depend( in:tileGi[0], tileGj[0], tileG[0], tileDi[0], tileDj[0] ) depend( inout:tileA[0] ) + TCORE_zcesca( center, scale, axis, m, n, mt, nt, + tileGi, tileGj, tileG, tileDi, tileDj, tileA ); +} diff --git a/runtime/openmp/codelets/codelet_zgesum.c b/runtime/openmp/codelets/codelet_zgesum.c new file mode 100644 index 0000000000000000000000000000000000000000..86b7bfd084fadee98c6218384e396597137e189f --- /dev/null +++ b/runtime/openmp/codelets/codelet_zgesum.c @@ -0,0 +1,31 @@ +/** + * + * @file openmp/codelet_zgesum.c + * + * @copyright 2012-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgesum OpenMP codelet + * + * @version 1.1.0 + * @author Florent Pruvost + * @date 2021-05-07 + * @precisions normal z -> c d s + * + */ +#include "chameleon_openmp.h" +#include "chameleon/tasks_z.h" +#include "coreblas/coreblas_ztile.h" + +void INSERT_TASK_zgesum( const RUNTIME_option_t *options, + cham_store_t storev, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *SUMS, int SUMSm, int SUMSn ) +{ + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileScaleSum = SUMS->get_blktile( SUMS, SUMSm, SUMSn ); +#pragma omp task firstprivate( storev, m, n, tileA, tileScaleSum ) depend( in:tileA[0] ) depend( inout:tileScaleSum[0] ) + TCORE_zgesum( storev, m, n, tileA, tileScaleSum ); +} diff --git a/runtime/parsec/codelets/codelet_zcesca.c b/runtime/parsec/codelets/codelet_zcesca.c new file mode 100644 index 0000000000000000000000000000000000000000..5018014eb6b28ec7ff7df6b02273cc98578d55d3 --- /dev/null +++ b/runtime/parsec/codelets/codelet_zcesca.c @@ -0,0 +1,95 @@ +/** + * + * @file parsec/codelet_zcesca.c + * + * @copyright 2012-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zcesca PaRSEC codelet + * + * @version 1.1.0 + * @author Florent Pruvost + * @date 2021-05-07 + * @precisions normal z -> c d s + * + */ +#include "chameleon_parsec.h" +#include "chameleon/tasks_z.h" +#include "coreblas/coreblas_z.h" + +static inline int +CORE_zcesca_parsec( parsec_execution_stream_t *context, + parsec_task_t *this_task ) +{ + int center, scale; + cham_store_t axis; + int m, n, mt, nt; + CHAMELEON_Complex64_t *Gi; + int ldgi; + CHAMELEON_Complex64_t *Gj; + int ldgj; + CHAMELEON_Complex64_t *G; + double *Di; + int lddi; + double *Dj; + int lddj; + CHAMELEON_Complex64_t *A; + int lda; + + parsec_dtd_unpack_args( + this_task, ¢er, &scale, &axis, &m, &n, &mt, &nt, &Gi, &ldgi, &Gj, &ldgj, &G, &Di, &lddi, &Dj, &lddj, &A, &lda ); + + CORE_zcesca( center, scale, axis, + m, n, mt, nt, + Gi, ldgi, + Gj, ldgj, + G, + Di, lddi, + Dj, lddj, + A, lda); + + (void)context; + return PARSEC_HOOK_RETURN_DONE; +} + + +void INSERT_TASK_zcesca( const RUNTIME_option_t *options, + int center, int scale, cham_store_t axis, + int m, int n, int mt, int nt, + const CHAM_desc_t *Gi, int Gim, int Gin, + const CHAM_desc_t *Gj, int Gjm, int Gjn, + const CHAM_desc_t *G, int Gm, int Gn, + const CHAM_desc_t *Di, int Dim, int Din, + const CHAM_desc_t *Dj, int Djm, int Djn, + CHAM_desc_t *A, int Am, int An) +{ + parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileGi = Gi->get_blktile( Gi, Gim, Gin ); + CHAM_tile_t *tileGj = Gj->get_blktile( Gj, Gjm, Gjn ); + CHAM_tile_t *tileDi = Di->get_blktile( Di, Dim, Din ); + CHAM_tile_t *tileDj = Dj->get_blktile( Dj, Djm, Djn ); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + parsec_dtd_taskpool_insert_task( + PARSEC_dtd_taskpool, CORE_zcesca_parsec, options->priority, "cesca", + sizeof(int), ¢er, VALUE, + sizeof(int), &scale, VALUE, + sizeof(int), &axis, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + sizeof(int), &mt, VALUE, + sizeof(int), &nt, VALUE, + PASSED_BY_REF, RTBLKADDR( Gi, CHAMELEON_Complex64_t, Gim, Gin ), chameleon_parsec_get_arena_index( Gi ) | INPUT, + sizeof(int), &(tileGi->ld), VALUE, + PASSED_BY_REF, RTBLKADDR( Gj, CHAMELEON_Complex64_t, Gjm, Gjn ), chameleon_parsec_get_arena_index( Gj ) | INPUT, + sizeof(int), &(tileGj->ld), VALUE, + PASSED_BY_REF, RTBLKADDR( G, CHAMELEON_Complex64_t, Gm, Gn ), chameleon_parsec_get_arena_index( G ) | INPUT, + PASSED_BY_REF, RTBLKADDR( Di, double, Dim, Din ), chameleon_parsec_get_arena_index( Di ) | INPUT, + sizeof(int), &(tileDi->ld), VALUE, + PASSED_BY_REF, RTBLKADDR( Dj, double, Djm, Djn ), chameleon_parsec_get_arena_index( Dj ) | INPUT, + sizeof(int), &(tileDj->ld), VALUE, + PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT | AFFINITY, + sizeof(int), &(tileA->ld), VALUE, + PARSEC_DTD_ARG_END ); +} diff --git a/runtime/parsec/codelets/codelet_zgesum.c b/runtime/parsec/codelets/codelet_zgesum.c new file mode 100644 index 0000000000000000000000000000000000000000..c0e1a10dd0539db947d5784ea3c13788d294cda4 --- /dev/null +++ b/runtime/parsec/codelets/codelet_zgesum.c @@ -0,0 +1,61 @@ +/** + * + * @file parsec/codelet_zgesum.c + * + * @copyright 2009-2015 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgesum PaRSEC codelet + * + * @version 1.1.0 + * @author Florent Pruvost + * @date 2021-05-07 + * @precisions normal z -> c d s + * + */ +#include "chameleon_parsec.h" +#include "chameleon/tasks_z.h" +#include "coreblas/coreblas_z.h" + +static inline int +CORE_zgesum_parsec( parsec_execution_stream_t *context, + parsec_task_t *this_task ) +{ + cham_store_t storev; + int m; + int n; + CHAMELEON_Complex64_t *A; + int lda; + CHAMELEON_Complex64_t *SUMS; + + parsec_dtd_unpack_args( + this_task, &storev, &m, &n, &A, &lda, &SUMS ); + + CORE_zgesum( storev, m, n, A, lda, SUMS ); + + (void)context; + return PARSEC_HOOK_RETURN_DONE; +} + +void INSERT_TASK_zgesum( const RUNTIME_option_t *options, + cham_store_t storev, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *SUMS, int SUMSm, int SUMSn ) +{ + parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + + parsec_dtd_taskpool_insert_task( + PARSEC_dtd_taskpool, CORE_zgesum_parsec, options->priority, "gessum", + sizeof(cham_store_t), &storev, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, + sizeof(int), &(tileA->ld), VALUE, + PASSED_BY_REF, RTBLKADDR( SUMS, CHAMELEON_Complex64_t, SUMSm, SUMSn ), chameleon_parsec_get_arena_index( SUMS ) | INOUT | AFFINITY, + PARSEC_DTD_ARG_END ); +} diff --git a/runtime/quark/codelets/codelet_zcesca.c b/runtime/quark/codelets/codelet_zcesca.c new file mode 100644 index 0000000000000000000000000000000000000000..6339719f74509d1b36b304a8ff9e086b59041d0a --- /dev/null +++ b/runtime/quark/codelets/codelet_zcesca.c @@ -0,0 +1,65 @@ +/** + * + * @file quark/codelet_zcesca.c + * + * @copyright 2012-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zcesca Quark codelet + * + * @version 1.1.0 + * @author Florent Pruvost + * @date 2021-05-07 + * @precisions normal z -> c d s + * + */ +#include "chameleon_quark.h" +#include "chameleon/tasks_z.h" +#include "coreblas/coreblas_ztile.h" + +void CORE_zcesca_quark(Quark *quark) +{ + int center, scale; + cham_store_t axis; + int m, n, mt, nt; + CHAM_tile_t *Gi; + CHAM_tile_t *Gj; + CHAM_tile_t *G; + CHAM_tile_t *Di; + CHAM_tile_t *Dj; + CHAM_tile_t *A; + + quark_unpack_args_13(quark, center, scale, axis, m, n, mt, nt, Gi, Gj, G, Di, Dj, A ); + TCORE_zcesca( center, scale, axis, m, n, mt, nt, Gi, Gj, G, Di, Dj, A ); +} + +void INSERT_TASK_zcesca( const RUNTIME_option_t *options, + int center, int scale, cham_store_t axis, + int m, int n, int mt, int nt, + const CHAM_desc_t *Gi, int Gim, int Gin, + const CHAM_desc_t *Gj, int Gjm, int Gjn, + const CHAM_desc_t *G, int Gm, int Gn, + const CHAM_desc_t *Di, int Dim, int Din, + const CHAM_desc_t *Dj, int Djm, int Djn, + CHAM_desc_t *A, int Am, int An ) +{ + quark_option_t *opt = (quark_option_t*)(options->schedopt); + DAG_CORE_CESCA; + QUARK_Insert_Task(opt->quark, CORE_zcesca_quark, (Quark_Task_Flags*)opt, + sizeof(int), ¢er, VALUE, + sizeof(int), &scale, VALUE, + sizeof(int), &axis, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + sizeof(int), &mt, VALUE, + sizeof(int), &nt, VALUE, + sizeof(void*), RTBLKADDR(Gi, CHAMELEON_Complex64_t, Gim, Gin), INPUT, + sizeof(void*), RTBLKADDR(Gj, CHAMELEON_Complex64_t, Gjm, Gjn), INPUT, + sizeof(void*), RTBLKADDR(G, CHAMELEON_Complex64_t, Gm, Gn ), INPUT, + sizeof(void*), RTBLKADDR(Di, double, Dim, Din), INPUT, + sizeof(void*), RTBLKADDR(Dj, double, Djm, Djn), INPUT, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An ), INOUT, + 0); +} diff --git a/runtime/quark/codelets/codelet_zgesum.c b/runtime/quark/codelets/codelet_zgesum.c new file mode 100644 index 0000000000000000000000000000000000000000..51b4e427e8a27e75a606e65065475abb0e2a72d3 --- /dev/null +++ b/runtime/quark/codelets/codelet_zgesum.c @@ -0,0 +1,48 @@ +/** + * + * @file quark/codelet_zgesum.c + * + * @copyright 2012-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgesum Quark codelet + * + * @version 1.1.0 + * @author Florent Pruvost + * @date 2021-05-07 + * @precisions normal z -> c d s + * + */ +#include "chameleon_quark.h" +#include "chameleon/tasks_z.h" +#include "coreblas/coreblas_ztile.h" + +void CORE_zgesum_quark(Quark *quark) +{ + cham_store_t storev; + int m; + int n; + CHAM_tile_t *tileA; + CHAM_tile_t *tileW; + + quark_unpack_args_5( quark, storev, m, n, tileA, tileW ); + TCORE_zgesum( storev, m, n, tileA, tileW ); +} + +void INSERT_TASK_zgesum( const RUNTIME_option_t *options, + cham_store_t storev, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *SUMS, int SUMSm, int SUMSn ) +{ + quark_option_t *opt = (quark_option_t*)(options->schedopt); + DAG_CORE_GESUM; + QUARK_Insert_Task(opt->quark, CORE_zgesum_quark, (Quark_Task_Flags*)opt, + sizeof(cham_store_t), &storev, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(void*), RTBLKADDR(SUMS, CHAMELEON_Complex64_t, SUMSm, SUMSn), INOUT, + 0); +} diff --git a/runtime/quark/include/core_blas_dag.h b/runtime/quark/include/core_blas_dag.h index ab5983727d1f532e8920c95f6be4da4f396227a1..cfc82f227b23cb8fb0edce1a1c42a0e34e928e66 100644 --- a/runtime/quark/include/core_blas_dag.h +++ b/runtime/quark/include/core_blas_dag.h @@ -97,6 +97,9 @@ #define DAG_CORE_TRSSQ DAG_SET_PROPERTIES( "TRSSQ" , "white" ) #define DAG_CORE_PLSSQ DAG_SET_PROPERTIES( "PLSSQ" , "white" ) #define DAG_CORE_PLSSQ2 DAG_SET_PROPERTIES( "PLSSQ2" , "white" ) + +#define DAG_CORE_GESUM DAG_SET_PROPERTIES( "GESUM" , "white" ) +#define DAG_CORE_CESCA DAG_SET_PROPERTIES( "CESCA" , "orange" ) #define DAG_CORE_GRAM DAG_SET_PROPERTIES( "GRAM" , "orange" ) #endif /* _core_blas_dag_h_ */ diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c index 369c4cd24ccd2966f80c9c1ccc9d13fccd0d6f25..58e6cb6bf1eabfcc0ecf91c8541a5657a1ac9f9d 100644 --- a/runtime/starpu/codelets/codelet_zcallback.c +++ b/runtime/starpu/codelets/codelet_zcallback.c @@ -39,7 +39,9 @@ CHAMELEON_CL_CB(zgessq, cti_handle_get_m(task->handles[0]), cti_handle_ge CHAMELEON_CL_CB(zgetrf, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), (2./3.)*M*N*K) CHAMELEON_CL_CB(zgetrf_incpiv, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), (2./3.)*M*N*K) CHAMELEON_CL_CB(zgetrf_nopiv, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), (2./3.)*M*N*K) -CHAMELEON_CL_CB(zgram, cti_handle_get_m(task->handles[3]), cti_handle_get_n(task->handles[3]), 0, M*N) +CHAMELEON_CL_CB(zgesum, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), 0, M*N) +CHAMELEON_CL_CB(zcesca, cti_handle_get_m(task->handles[5]), cti_handle_get_n(task->handles[5]), 0, 8.*M*N) +CHAMELEON_CL_CB(zgram, cti_handle_get_m(task->handles[3]), cti_handle_get_n(task->handles[3]), 0, 8.*M*N) CHAMELEON_CL_CB(zhe2ge, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, (1./2.0)*M*N) CHAMELEON_CL_CB(zherfb, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, 2. *M* M*M) #if defined(PRECISION_z) || defined(PRECISION_c) diff --git a/runtime/starpu/codelets/codelet_zcesca.c b/runtime/starpu/codelets/codelet_zcesca.c new file mode 100644 index 0000000000000000000000000000000000000000..5dbadec77774438ef701cb9d07b4f7af579094f1 --- /dev/null +++ b/runtime/starpu/codelets/codelet_zcesca.c @@ -0,0 +1,110 @@ +/** + * + * @file starpu/codelet_zcesca.c + * + * @copyright 2012-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zcesca StarPU codelet + * + * @version 1.1.0 + * @author Florent Pruvost + * @date 2021-05-07 + * @precisions normal z -> c d s + * + */ +#include "chameleon_starpu.h" +#include "runtime_codelet_z.h" + +struct cl_zcesca_args_s { + int center; + int scale; + cham_store_t axis; + int m; + int n; + int mt; + int nt; +}; + +#if !defined(CHAMELEON_SIMULATION) +static void cl_zcesca_cpu_func(void *descr[], void *cl_arg) +{ + struct cl_zcesca_args_s clargs; + CHAM_tile_t *Gi; + CHAM_tile_t *Gj; + CHAM_tile_t *G; + CHAM_tile_t *Di; + CHAM_tile_t *Dj; + CHAM_tile_t *A; + + Gi = cti_interface_get(descr[0]); + Gj = cti_interface_get(descr[1]); + G = cti_interface_get(descr[2]); + Di = cti_interface_get(descr[3]); + Dj = cti_interface_get(descr[4]); + A = cti_interface_get(descr[5]); + + starpu_codelet_unpack_args( cl_arg, &clargs ); + TCORE_zcesca( clargs.center, clargs.scale, clargs.axis, + clargs.m, clargs.n, clargs.mt, clargs.nt, + Gi, Gj, G, Di, Dj, A ); +} +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +CODELETS_CPU(zcesca, cl_zcesca_cpu_func) + +void INSERT_TASK_zcesca( const RUNTIME_option_t *options, + int center, int scale, cham_store_t axis, + int m, int n, int mt, int nt, + const CHAM_desc_t *Gi, int Gim, int Gin, + const CHAM_desc_t *Gj, int Gjm, int Gjn, + const CHAM_desc_t *G, int Gm, int Gn, + const CHAM_desc_t *Di, int Dim, int Din, + const CHAM_desc_t *Dj, int Djm, int Djn, + CHAM_desc_t *A, int Am, int An ) +{ + struct cl_zcesca_args_s clargs = { + .center = center, + .scale = scale, + .axis = axis, + .m = m, + .n = n, + .mt = mt, + .nt = nt + }; + struct starpu_codelet *codelet = &cl_zcesca; + void (*callback)(void*) = options->profiling ? cl_zcesca_callback : NULL; + starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); + int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_R(Gi, Gim, Gin); + CHAMELEON_ACCESS_R(Gj, Gjm, Gjn); + CHAMELEON_ACCESS_R(G, Gm, Gn); + CHAMELEON_ACCESS_R(Di, Dim, Din); + CHAMELEON_ACCESS_R(Dj, Djm, Djn); + CHAMELEON_ACCESS_RW(A, Am, An); + CHAMELEON_END_ACCESS_DECLARATION; + + rt_starpu_insert_task( + codelet, + STARPU_VALUE, &clargs, sizeof(struct cl_zcesca_args_s), + STARPU_R, RTBLKADDR(Gi, CHAMELEON_Complex64_t, Gim, Gin), + STARPU_R, RTBLKADDR(Gj, CHAMELEON_Complex64_t, Gjm, Gjn), + STARPU_R, RTBLKADDR(G, CHAMELEON_Complex64_t, Gm, Gn), + STARPU_R, RTBLKADDR(Di, double, Dim, Din), + STARPU_R, RTBLKADDR(Dj, double, Djm, Djn), + STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "zcesca", +#endif + 0); +} diff --git a/runtime/starpu/codelets/codelet_zgesum.c b/runtime/starpu/codelets/codelet_zgesum.c new file mode 100644 index 0000000000000000000000000000000000000000..31a6ca93e5c8f9fc74a12e656309ca3059cb731d --- /dev/null +++ b/runtime/starpu/codelets/codelet_zgesum.c @@ -0,0 +1,79 @@ +/** + * + * @file starpu/codelet_zgesum.c + * + * @copyright 2012-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgesum StarPU codelet + * + * @version 1.1.0 + * @author Florent Pruvost + * @date 2021-05-7 + * @precisions normal z -> c d s + * + */ +#include "chameleon_starpu.h" +#include "runtime_codelet_z.h" + +struct cl_zgesum_args_s { + cham_store_t storev; + int m; + int n; +}; + +#if !defined(CHAMELEON_SIMULATION) +static void cl_zgesum_cpu_func(void *descr[], void *cl_arg) +{ + struct cl_zgesum_args_s clargs; + CHAM_tile_t *tileA; + CHAM_tile_t *tileW; + + tileA = cti_interface_get(descr[0]); + tileW = cti_interface_get(descr[1]); + + starpu_codelet_unpack_args( cl_arg, &clargs ); + TCORE_zgesum( clargs.storev, clargs.m, clargs.n, tileA, tileW ); +} +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +CODELETS_CPU(zgesum, cl_zgesum_cpu_func) + +void INSERT_TASK_zgesum( const RUNTIME_option_t *options, + cham_store_t storev, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *SUMS, int SUMSm, int SUMSn ) +{ + struct cl_zgesum_args_s clargs = { + .storev = storev, + .m = m, + .n = n + }; + struct starpu_codelet *codelet = &cl_zgesum; + void (*callback)(void*) = options->profiling ? cl_zgesum_callback : NULL; + starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); + int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_R(A, Am, An); + CHAMELEON_ACCESS_RW(SUMS, SUMSm, SUMSn); + CHAMELEON_END_ACCESS_DECLARATION; + + rt_starpu_insert_task( + codelet, + STARPU_VALUE, &clargs, sizeof(struct cl_zgesum_args_s), + STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_RW, RTBLKADDR(SUMS, CHAMELEON_Complex64_t, SUMSm, SUMSn), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "zgesum", +#endif + 0); +} diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h index e246aabc62c07b62d417146963805ad1928acfdd..3b41ad7813d88b1d041b838d772e9f8a94a45ea5 100644 --- a/runtime/starpu/include/runtime_codelet_z.h +++ b/runtime/starpu/include/runtime_codelet_z.h @@ -122,6 +122,12 @@ CODELETS_HEADER(dzasum); */ CODELETS_HEADER(zplrnt); CODELETS_HEADER(zbuild); + +/* + * centered-scaled and gram functions + */ +CODELETS_HEADER(zgesum); +CODELETS_HEADER(zcesca); CODELETS_HEADER(zgram); #if defined(PRECISION_z) || defined(PRECISION_c) diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt index 1079e455cd9f8f17ecb33d4e5a3317efdb8c50b3..78f4f507ef8a896f92a48b8a3a9c50c6ccd359ea 100644 --- a/testing/CMakeLists.txt +++ b/testing/CMakeLists.txt @@ -101,6 +101,8 @@ set(ZSRC testing_zgepdf_qdwh.c testing_zgepdf_qr.c testing_zplrnk.c + testing_zcesca.c + testing_zgram.c ) diff --git a/testing/CTestLists.cmake b/testing/CTestLists.cmake index 9ca61431a485b6eb9251d208ddf5d2581b549f53..a749e7a47aaaa2209348add0131599af9eaae794 100644 --- a/testing/CTestLists.cmake +++ b/testing/CTestLists.cmake @@ -66,6 +66,8 @@ if (NOT CHAMELEON_SIMULATION) gels_hqr ) set( TESTS ${TESTS} genm2 gepdf_qr gepdf_qdwh ) + set( TESTS ${TESTS} + cesca gram ) foreach(cat ${TEST_CATEGORIES}) foreach(gpus ${N_GPUS}) diff --git a/testing/input/cesca.in b/testing/input/cesca.in new file mode 100644 index 0000000000000000000000000000000000000000..2f6f33fd9fa543c85c187523e2acffd2ae0cf1c1 --- /dev/null +++ b/testing/input/cesca.in @@ -0,0 +1,16 @@ +# You can enumerate each parameter's values as an explicit list separated by commas or by a range start:end[:step] +# Not given parameters will receive default values + +# CESCA +# nb: Tile size +# ib: Inner tile size +# M: Number of rows of matrix A +# N: Number of columns of matrix A +# LDA: Leading dimension of matrix A + +op = cesca +nb = 16, 17 +ib = 8 +m = 15, 17, 33 +n = 13, 19, 35 +lda = 37 diff --git a/testing/input/gram.in b/testing/input/gram.in new file mode 100644 index 0000000000000000000000000000000000000000..0088b7edee3139d80a23e84cb28dbde220336654 --- /dev/null +++ b/testing/input/gram.in @@ -0,0 +1,17 @@ +# You can enumerate each parameter's values as an explicit list separated by commas or by a range start:end[:step] +# Not given parameters will receive default values + +# GRAM + +# nb: Tile size +# ib: Inner tile size +# n: Order of the matrix A +# lda: Leading dimension of matrix A +# uplo: Matrix part to be considered (0: Upper, 1: Lower) + +op = gram +nb = 16, 17 +ib = 8 +n = 15, 19, 37 +lda = 41 +uplo = 0,1 diff --git a/testing/testing_zcesca.c b/testing/testing_zcesca.c new file mode 100644 index 0000000000000000000000000000000000000000..b252476e852d1e395ce87eb5fb662e3e56abeeb6 --- /dev/null +++ b/testing/testing_zcesca.c @@ -0,0 +1,98 @@ +/** + * + * @file testing_zcesca.c + * + * @copyright 2019-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zcesca testing + * + * @version 1.1.0 + * @author Florent Pruvost + * @date 2021-05-10 + * @precisions normal z -> c d s + * + */ +#include <chameleon.h> +#include "testings.h" +#include "testing_zcheck.h" +#include <chameleon/flops.h> + +static cham_fixdbl_t +flops_zcesca( int M, int N ) +{ + cham_fixdbl_t flops = 0.; +#if defined( PRECISION_z ) || defined( PRECISION_c ) + /* 2 multiplications and 5 addition per element */ + flops = ( 2. * 6. + 10. ) * M * N; +#else + flops = ( 2. + 5. ) * M * N; +#endif + + return flops; +} + +int +testing_zcesca( run_arg_list_t *args, int check ) +{ + int hres = 0; + CHAM_desc_t *descA; + + /* Read arguments */ + intptr_t mtxfmt = parameters_getvalue_int( "mtxfmt" ); + int nb = run_arg_get_int( args, "nb", 320 ); + int P = parameters_getvalue_int( "P" ); + int N = run_arg_get_int( args, "N", 1000 ); + int M = run_arg_get_int( args, "M", N ); + int LDA = run_arg_get_int( args, "LDA", M ); + int seedA = run_arg_get_int( args, "seedA", random() ); + int Q = parameters_compute_q( P ); + cham_fixdbl_t t, gflops; + cham_fixdbl_t flops = flops_zcesca( M, N ); + + CHAMELEON_Set( CHAMELEON_TILE_SIZE, nb ); + + /* Create the matrices */ + CHAMELEON_Desc_Create( + &descA, (void*)(-mtxfmt), ChamComplexDouble, nb, nb, nb * nb, LDA, N, 0, 0, M, N, P, Q ); + + /* Fill the matrix with random values */ + CHAMELEON_zplrnt_Tile( descA, seedA ); + + /* Compute the centered-scaled matrix transformation */ + START_TIMING( t ); + hres = CHAMELEON_zcesca_Tile( 1, 1, ChamColumnwise, descA ); + STOP_TIMING( t ); + gflops = flops * 1.e-9 / t; + run_arg_add_fixdbl( args, "time", t ); + run_arg_add_fixdbl( args, "gflops", ( hres == CHAMELEON_SUCCESS ) ? gflops : -1. ); + + CHAMELEON_Desc_Destroy( &descA ); + + return hres; +} + +testing_t test_zcesca; +const char *zcesca_params[] = { "mtxfmt", "nb", "trans", "m", "n", "lda", "seedA", NULL }; +const char *zcesca_output[] = { NULL }; +const char *zcesca_outchk[] = { "RETURN", NULL }; + +/** + * @brief Testing registration function + */ +void testing_zcesca_init( void ) __attribute__( ( constructor ) ); +void +testing_zcesca_init( void ) +{ + test_zcesca.name = "zcesca"; + test_zcesca.helper = "General centered-scaled matrix transformation"; + test_zcesca.params = zcesca_params; + test_zcesca.output = zcesca_output; + test_zcesca.outchk = zcesca_outchk; + test_zcesca.fptr = testing_zcesca; + test_zcesca.next = NULL; + + testing_register( &test_zcesca ); +} diff --git a/testing/testing_zgram.c b/testing/testing_zgram.c new file mode 100644 index 0000000000000000000000000000000000000000..abd27f011e5b5c659c16b9563fb6fb4bb4be10ac --- /dev/null +++ b/testing/testing_zgram.c @@ -0,0 +1,98 @@ +/** + * + * @file testing_zgram.c + * + * @copyright 2019-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgram testing + * + * @version 1.1.0 + * @author Florent Pruvost + * @date 2021-05-10 + * @precisions normal z -> c d s + * + */ +#include <chameleon.h> +#include "testings.h" +#include "testing_zcheck.h" +#include <chameleon/flops.h> + +static cham_fixdbl_t +flops_zgram( int N ) +{ + cham_fixdbl_t flops = 0.; +#if defined( PRECISION_z ) || defined( PRECISION_c ) + /* 5 multiplications and 3 addition per element */ + flops = ( 5. * 6. + 6. ) * N * N; +#else + flops = ( 5. + 3. ) * N * N; +#endif + + return flops; +} + +int +testing_zgram( run_arg_list_t *args, int check ) +{ + int hres = 0; + CHAM_desc_t *descA; + + /* Read arguments */ + intptr_t mtxfmt = parameters_getvalue_int( "mtxfmt" ); + int nb = run_arg_get_int( args, "nb", 320 ); + cham_uplo_t uplo = run_arg_get_uplo( args, "uplo", ChamUpper ); + int P = parameters_getvalue_int( "P" ); + int N = run_arg_get_int( args, "N", 1000 ); + int LDA = run_arg_get_int( args, "LDA", N ); + int seedA = run_arg_get_int( args, "seedA", random() ); + int Q = parameters_compute_q( P ); + cham_fixdbl_t t, gflops; + cham_fixdbl_t flops = flops_zgram( N ); + + CHAMELEON_Set( CHAMELEON_TILE_SIZE, nb ); + + /* Create the matrices */ + CHAMELEON_Desc_Create( + &descA, (void*)(-mtxfmt), ChamComplexDouble, nb, nb, nb * nb, LDA, N, 0, 0, N, N, P, Q ); + + /* Fill the matrix with random values */ + CHAMELEON_zplghe_Tile( (double)N, uplo, descA, seedA ); + + /* Compute the gram matrix transformation */ + START_TIMING( t ); + hres = CHAMELEON_zgram_Tile( uplo, descA ); + STOP_TIMING( t ); + gflops = flops * 1.e-9 / t; + run_arg_add_fixdbl( args, "time", t ); + run_arg_add_fixdbl( args, "gflops", ( hres == CHAMELEON_SUCCESS ) ? gflops : -1. ); + + CHAMELEON_Desc_Destroy( &descA ); + + return hres; +} + +testing_t test_zgram; +const char *zgram_params[] = { "mtxfmt", "nb", "uplo", "n", "n", "lda", "seedA", NULL }; +const char *zgram_output[] = { NULL }; +const char *zgram_outchk[] = { "RETURN", NULL }; + +/** + * @brief Testing registration function + */ +void testing_zgram_init( void ) __attribute__( ( constructor ) ); +void +testing_zgram_init( void ) +{ + test_zgram.name = "zgram"; + test_zgram.helper = "General Gram matrix transformation"; + test_zgram.params = zgram_params; + test_zgram.output = zgram_output; + test_zgram.outchk = zgram_outchk; + test_zgram.fptr = testing_zgram; + test_zgram.next = NULL; + + testing_register( &test_zgram ); +}