diff --git a/cmake_modules/local_subs.py b/cmake_modules/local_subs.py index d6908a87c2081353336d2615bbc040a0cddff030..18a79e3000ca3c1018d49ce7d34254c7baae3eae 100644 --- a/cmake_modules/local_subs.py +++ b/cmake_modules/local_subs.py @@ -20,6 +20,7 @@ _extra_blas = [ ('', 'slaran', 'dlaran', 'slaran', 'dlaran' ), ('', 'slatm1', 'dlatm1', 'clatm1', 'zlatm1' ), ('', 'slatm1', 'dlatm1', 'slatm1', 'dlatm1' ), + ('', 'sgenm2', 'dgenm2', 'cgenm2', 'zgenm2' ), ('', 'slag2c_fake', 'dlag2z_fake', 'slag2c', 'dlag2z' ), ] diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt index cdf6d90212e850c6ccfb3444d8e2ae1dbf9de42b..9f2c8ab6f22fddcca9a9eb7ecba132dc32a23e50 100644 --- a/compute/CMakeLists.txt +++ b/compute/CMakeLists.txt @@ -251,6 +251,9 @@ set(ZSRC ################## zgram.c pzgram.c + ################## + zgenm2.c + pzgenm2.c ) precisions_rules_py(CHAMELEON_SRCS_GENERATED "${ZSRC}" diff --git a/compute/pzgenm2.c b/compute/pzgenm2.c new file mode 100644 index 0000000000000000000000000000000000000000..e054fa2a0e515009b801c6f67382d65c9c9cff5a --- /dev/null +++ b/compute/pzgenm2.c @@ -0,0 +1,418 @@ +/** + * + * @file pzgenm2.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * @copyright 2016-2020 KAUST. All rights reserved. + * + *** + * + * @brief Chameleon parallel algorithm to compute 2-norm estimator. + * + * @version 1.0.0 + * @author Mathieu Faverge + * @author Dalal Sukkari + * @date 2020-10-06 + * @precisions normal z -> s d c + * + */ +#include "control/common.h" + +#define A(m, n) A, m, n +#define X(m, n) &X, m, n +#define SX(m, n) &SX, m, n +#define NRMX( m, n) &NRMX, m, n +#define NRMSX(m, n) &NRMSX, m, n +#define DROW(m, n) &DROW, m, n + +void +chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + RUNTIME_option_t options; + CHAM_desc_t X, SX, NRMX, NRMSX, DROW; + int m, n, k; + int myp = A->myrank / A->q; + int myq = A->myrank % A->q; + int tempmm, tempnn; + int cnt, maxiter; + double e0, normx, normsx, beta, scl; + + chamctxt = chameleon_context_self(); + if ( sequence->status != CHAMELEON_SUCCESS ) { + return; + } + RUNTIME_options_init( &options, chamctxt, sequence, request ); + + /* Initialize the result */ + *result = 0.0; + + /* Init workspace handle for the call to dlange but unused */ + RUNTIME_options_ws_alloc( &options, 1, 0 ); + + /** + * DROW must be allocated with GLOBAL to avoid destruction bewteen the first + * stage and the second one during Flush. + * This is the same issue for X and SX to be reused from one iteration to another. + */ + chameleon_desc_init( &DROW, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 1, A->nb, A->nb, + A->p, A->n, 0, 0, A->p, A->n, A->p, A->q, + NULL, NULL, NULL ); + /** + * NRMX must be allocated with GLOBAL to be able to access the norm value + * after flushing the descriptor. + * This is the same issue for NRMSX. + */ + chameleon_desc_init( &NRMX, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 2, 1, 2, + A->p * 2, A->q, 0, 0, A->p * 2, A->q, A->p, A->q, + NULL, NULL, NULL ); + + /** + * Start by computing the initial vector of the iterative process, and that + * is used to compute the initial norm. + * + * For j=0,n-1, drow[j] = sum( |A_{i,j}|, i=0..m-1 ) + * So drow[j] = sum( S_{p,j}, p=0..P-1 ) with S_{p,j} = sum( |A_{i,j}|, i=0..m-1 \ i%P = p ) + * + */ + for(n = myq; n < A->nt; n += A->q) { + tempnn = n == A->nt-1 ? A->n - n * A->nb : A->nb; + + /* Zeroes the local intermediate vector */ + INSERT_TASK_dlaset( + &options, + ChamUpperLower, 1, tempnn, + 0., 0., + DROW( myp, n ) ); + + /* Computes the sums of the local tiles into the local vector */ + for(m = myp; m < A->mt; m += A->p) { + tempmm = m == A->mt-1 ? A->m - m * A->mb : A->mb; + INSERT_TASK_dzasum( + &options, + ChamColumnwise, ChamUpperLower, tempmm, tempnn, + A(m, n), DROW( myp, n ) ); + } + + /* Reduce on first row of nodes */ + for(m = 1; m < A->p; m++) { + INSERT_TASK_daxpy( + &options, tempnn, 1., + DROW( m, n ), 1, + DROW( 0, n ), 1 ); + } + } + + /** + * Reduce now by columns on processes with (myp == 0) + */ + if ( myp == 0 ) + { + INSERT_TASK_dlaset( + &options, + ChamUpperLower, NRMX.mb, NRMX.nb, + 1., 0., + NRMX( myp, myq ) ); + + for( n = myq; n < A->nt; n += A->q ) { + tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + INSERT_TASK_dgessq( + &options, ChamEltwise, 1, tempnn, + DROW( myp, n ), + NRMX( myp, myq ) ); + } + + /* Reduce on first row of nodes */ + for(n = 1; n < A->q; n++) { + INSERT_TASK_dplssq( + &options, ChamEltwise, 1, 1, + NRMX( myp, n ), + NRMX( myp, 0 ) ); + } + + INSERT_TASK_dplssq2( + &options, 1, NRMX( myp, 0 ) ); + } + + /* Bcast norm over processes from node (0,0) */ + for(m = 0; m < A->p; m++) { + for(n = 0; n < A->q; n++) { + if ( (m != 0) || (n != 0) ) { + INSERT_TASK_dlacpy( + &options, + ChamUpperLower, 1, 1, 1, + NRMX(0, 0), + NRMX(m, n) ); + } + } + } + + CHAMELEON_Desc_Flush( &DROW, sequence ); + CHAMELEON_Desc_Flush( &NRMX, sequence ); + CHAMELEON_Desc_Flush( A, sequence ); + RUNTIME_sequence_wait( chamctxt, sequence ); + *result = *((double *)(NRMX.get_blkaddr( &NRMX, myp, myq ))); + + if ( (*result) == 0.0 ) { + chameleon_desc_destroy( &DROW ); + chameleon_desc_destroy( &NRMX ); + return; + } else { + normx = *result; + } + + chameleon_desc_init( &NRMSX, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 2, 1, 2, + A->p * 2, A->q, 0, 0, A->p * 2, A->q, A->p, A->q, + NULL, NULL, NULL ); + chameleon_desc_init( &X, CHAMELEON_MAT_ALLOC_GLOBAL, ChamComplexDouble, 1, A->nb, A->nb, + A->p, A->n, 0, 0, A->p, A->n, A->p, A->q, + NULL, NULL, NULL ); + chameleon_desc_init( &SX, CHAMELEON_MAT_ALLOC_GLOBAL, ChamComplexDouble, A->mb, 1, A->mb, + A->m, A->q, 0, 0, A->m, A->q, A->p, A->q, + NULL, NULL, NULL ); + + cnt = 0; + e0 = 0.; + maxiter = chameleon_min( 100, A->n ); + + while ( (cnt < maxiter) && + (fabs(*result - e0) > (tol * (*result))) ) + { + e0 = *result; + + /* Initialization of X in the first loop */ + if ( cnt == 0 ) + { + for (n = myq; n < A->nt; n += A->q) { + tempnn = n == A->nt-1 ? A->n - n * A->nb : A->nb; + + if ( myp == 0 ) { +#if defined(PRECISION_z) || defined(PRECISION_c) + INSERT_TASK_dlag2z( + &options, + ChamUpperLower, 1, tempnn, + DROW( 0, n ), + X( 0, n ) ); +#else + INSERT_TASK_zlacpy( + &options, + ChamUpperLower, 1, tempnn, tempnn, + DROW( 0, n ), + X( 0, n ) ); +#endif + } + + /* Broadcast X */ + for (m = 1; m < A->p; m++) { + INSERT_TASK_zlacpy( + &options, + ChamUpperLower, 1, tempnn, tempnn, + X( 0, n ), + X( m, n ) ); + } + } + CHAMELEON_Desc_Flush( &DROW, sequence ); + } + + /** + * Scale x = x / ||A|| + * + * We scale the P replica of X, so each row of Q processes possess one + * copy of the scaled X. + */ + scl = 1. / e0; + for (n = myq; n < A->nt; n += A->q) { + tempnn = n == A->nt-1 ? A->n - n * A->nb : A->nb; + + INSERT_TASK_zlascal( + &options, + ChamUpperLower, 1, tempnn, tempnn, + scl, X( myp, n ) ); + } + + /** + * Compute Sx = S * x + */ + for(m = myp; m < A->mt; m+=A->p) { + tempmm = m == A->mt-1 ? A->m - m * A->mb : A->mb; + + for (n = myq; n < A->nt; n += A->q ) { + tempnn = n == A->nt-1 ? A->n - n * A->nb : A->nb; + beta = n == myq ? 0. : 1.; + + INSERT_TASK_zgemv( + &options, + ChamNoTrans, tempmm, tempnn, + 1., A( m, n ), + X( myp, n ), 1, + beta, SX( m, myq ), 1 ); + } + + /* Reduce columns */ + for (k = 1; k < chameleon_min( A->q, A->nt ); k++) { + INSERT_TASK_zaxpy( + &options, tempmm, 1., + SX( m, k ), 1, + SX( m, 0 ), 1 ); + } + /* Broadcast SX to ease the following gemv */ + for (k = 1; k < A->q; k++) { + INSERT_TASK_zlacpy( + &options, + ChamUpperLower, tempmm, 1, tempmm, + SX( m, 0 ), + SX( m, k ) ); + } + } + + /** + * Compute x = S' * S * x = S' * Sx + */ + for ( n = myq; n < A->nt; n += A->q ) { + tempnn = n == A->nt-1 ? A->n - n * A->nb : A->nb; + + for( m = myp; m < A->mt; m += A->p ) { + tempmm = m == A->mt-1 ? A->m - m * A->mb : A->mb; + beta = m == myp ? 0. : 1.; + + INSERT_TASK_zgemv( + &options, + ChamConjTrans, tempmm, tempnn, + 1., A( m, n ), + SX( m, myq ), 1, + beta, X( myp, n ), 1 ); + } + + /* Reduce rows */ + for (k = 1; k < chameleon_min( A->p, A->mt ); k++) { + INSERT_TASK_zaxpy( + &options, tempnn, 1., + X( k, n ), 1, + X( 0, n ), 1 ); + } + /* Broadcast */ + for (k = 1; k < A->p; k++) { + INSERT_TASK_zlacpy( + &options, + ChamUpperLower, 1, tempnn, tempnn, + X( 0, n ), + X( k, n ) ); + } + } + + /** + * Compute ||x|| + * + * All rows of Q nodes compute the same thing in parallel due to replication. + */ + { + INSERT_TASK_dlaset( + &options, + ChamUpperLower, NRMX.mb, NRMX.nb, + 1., 0., + NRMX( myp, myq ) ); + + for( n = myq; n < A->nt; n += A->q ) { + tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + + INSERT_TASK_zgessq( + &options, ChamEltwise, 1, tempnn, + X( myp, n ), + NRMX( myp, myq ) ); + } + + /* Reduce columns */ + for(n = 1; n < chameleon_min( A->q, A->nt ); n++) { + INSERT_TASK_dplssq( + &options, ChamEltwise, 1, 1, + NRMX( myp, n ), + NRMX( myp, 0 ) ); + } + + INSERT_TASK_dplssq2( + &options, 1, NRMX( myp, 0 ) ); + + /* Broadcast the results to processes in the same row */ + for(n = 1; n < A->q; n++) { + INSERT_TASK_dlacpy( + &options, + ChamUpperLower, 1, 1, 1, + NRMX( myp, 0 ), + NRMX( myp, n ) ); + } + } + + /** + * Compute ||Sx|| + * + * All columns of P nodes compute the same thing in parallel due to replication. + */ + { + INSERT_TASK_dlaset( + &options, + ChamUpperLower, NRMSX.mb, NRMSX.nb, + 1., 0., + NRMSX( myp, myq ) ); + + for( m = myp; m < A->mt; m += A->p ) { + tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + INSERT_TASK_zgessq( + &options, ChamEltwise, tempmm, 1, + SX( m, myq ), + NRMSX( myp, myq ) ); + } + + /* Reduce rows */ + for( m = 1; m < chameleon_min( A->p, A->mt ); m++ ) { + INSERT_TASK_dplssq( + &options, ChamEltwise, 1, 1, + NRMSX( m, myq ), + NRMSX( 0, myq ) ); + } + + INSERT_TASK_dplssq2( + &options, 1, NRMSX( 0, myq ) ); + + /* Broadcast the results to processes in the same column */ + for(m = 1; m < A->p; m++) { + INSERT_TASK_dlacpy( + &options, + ChamUpperLower, 1, 1, 1, + NRMSX( 0, myq ), + NRMSX( m, myq ) ); + } + } + + CHAMELEON_Desc_Flush( &NRMX, sequence ); + CHAMELEON_Desc_Flush( &NRMSX, sequence ); + CHAMELEON_Desc_Flush( &X, sequence ); + CHAMELEON_Desc_Flush( &SX, sequence ); + CHAMELEON_Desc_Flush( A, sequence ); + RUNTIME_sequence_wait(chamctxt, sequence); + normx = *((double *)(NRMX.get_blkaddr( &NRMX, myp, myq ))); + normsx = *((double *)(NRMSX.get_blkaddr( &NRMSX, myp, myq ))); + + *result = normx / normsx; + cnt++; + } + + if ( (cnt >= maxiter) && + (fabs((*result) - e0) > (tol * (*result))) ) { + *result = INFINITY; + } + + RUNTIME_options_ws_free( &options ); + RUNTIME_options_finalize( &options, chamctxt ); + + chameleon_desc_destroy( &DROW ); + chameleon_desc_destroy( &NRMX ); + chameleon_desc_destroy( &NRMSX ); + chameleon_desc_destroy( &X ); + chameleon_desc_destroy( &SX ); + + return; +} diff --git a/compute/zgenm2.c b/compute/zgenm2.c new file mode 100644 index 0000000000000000000000000000000000000000..7f27a44d647a461d7e2e8bd76d4787310ec6d322 --- /dev/null +++ b/compute/zgenm2.c @@ -0,0 +1,266 @@ +/** + * + * @file zgenm2.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * @copyright 2016-2020 KAUST. All rights reserved. + * + *** + * + * @brief Chameleon zgenm2 wrappers + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2020-10-06 + * @precisions normal z -> s d c + * + */ +#include "control/common.h" + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * @brief Returns an estimate of the two-norm of A. + * + ******************************************************************************* + * + * @param[in] tol + * Specify the tolerance used in the iterative algorithm to converge to + * an estimation. Must be > 0. + * + * @param[in] M + * The number of rows of the matrix A. M >= 0. When M = 0, + * the returned value is set to zero. + * + * @param[in] N + * The number of columns of the matrix A. N >= 0. When N = 0, + * the returned value is set to zero. + * + * @param[in] A + * The M-by-N matrix A. + * + * @param[in] LDA + * The leading dimension of the array A. LDA >= max(1,M). + * + ******************************************************************************* + * + * @retval the two-norm estimate. + * + ******************************************************************************* + * + * @sa CHAMELEON_zgenm2_Tile + * @sa CHAMELEON_zgenm2_Tile_Async + * @sa CHAMELEON_clange + * @sa CHAMELEON_dlange + * @sa CHAMELEON_slange + * + */ +double CHAMELEON_zgenm2( double tol, int M, int N, + CHAMELEON_Complex64_t *A, int LDA ) +{ + int NB; + int status; + double value = -1.; + CHAM_context_t *chamctxt; + RUNTIME_sequence_t *sequence = NULL; + RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + CHAM_desc_t descAl, descAt; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zgenm2", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + /* Check input arguments */ + if (M < 0) { + chameleon_error("CHAMELEON_zgenm2", "illegal value of M"); + return -1; + } + if (N < 0) { + chameleon_error("CHAMELEON_zgenm2", "illegal value of N"); + return -2; + } + if (LDA < chameleon_max(1, M)) { + chameleon_error("CHAMELEON_zgenm2", "illegal value of LDA"); + return -4; + } + + /* Quick return */ + if (chameleon_min(N, M) == 0) { + return (double)0.0; + } + + /* Tune NB depending on M, N & NRHS; Set NBNB */ + status = chameleon_tune(CHAMELEON_FUNC_ZGEMM, M, N, 0); + if (status != CHAMELEON_SUCCESS) { + chameleon_error("CHAMELEON_zgenm2", "chameleon_tune() failed"); + return status; + } + + /* Set NT */ + NB = CHAMELEON_NB; + + chameleon_sequence_create( chamctxt, &sequence ); + + /* Submit the matrix conversion */ + chameleon_zlap2tile( chamctxt, &descAl, &descAt, ChamDescInput, ChamUpperLower, + A, NB, NB, LDA, N, M, N, sequence, &request ); + + /* Call the tile interface */ + CHAMELEON_zgenm2_Tile_Async( tol, &descAt, &value, sequence, &request ); + + /* Submit the matrix conversion back */ + chameleon_ztile2lap( chamctxt, &descAl, &descAt, + ChamDescInput, ChamUpperLower, sequence, &request ); + + chameleon_sequence_wait( chamctxt, sequence ); + + /* Cleanup the temporary data */ + chameleon_ztile2lap_cleanup( chamctxt, &descAl, &descAt ); + + chameleon_sequence_destroy( chamctxt, sequence ); + return value; +} + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile + * + * @brief Tile equivalent of CHAMELEON_zgenm2(). + * + * Operates on matrices stored by tiles. + * All matrices are passed through descriptors. + * All dimensions are taken from the descriptors. + * + ******************************************************************************* + * + * @param[in] tol + * Specify the tolerance used in the iterative algorithm to converge to + * an estimation. Must be > 0. + * + * @param[in] A + * On entry, the input matrix A. + * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * + ******************************************************************************* + * + * @sa CHAMELEON_zgenm2 + * @sa CHAMELEON_zgenm2_Tile_Async + * @sa CHAMELEON_clange_Tile + * @sa CHAMELEON_dlange_Tile + * @sa CHAMELEON_slange_Tile + * + */ +double CHAMELEON_zgenm2_Tile( double tol, CHAM_desc_t *A ) +{ + CHAM_context_t *chamctxt; + RUNTIME_sequence_t *sequence = NULL; + RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + int status; + double value = -1.; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zgenm2_Tile", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + chameleon_sequence_create( chamctxt, &sequence ); + + CHAMELEON_zgenm2_Tile_Async( tol, A, &value, sequence, &request ); + + CHAMELEON_Desc_Flush( A, sequence ); + + chameleon_sequence_wait( chamctxt, sequence ); + status = sequence->status; + chameleon_sequence_destroy( chamctxt, sequence ); + return ( status == CHAMELEON_SUCCESS ) ? value : (double)status; +} + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile_Async + * + * @brief Non-blocking equivalent of CHAMELEON_zgenm2_Tile(). + * + * May return before the computation is finished. + * Allows for pipelining of operations at runtime. + * + ******************************************************************************* + * + * @param[in] sequence + * Identifies the sequence of function calls that this call belongs to + * (for completion checks and exception handling purposes). + * + * @param[out] request + * Identifies this function call (for exception handling purposes). + * + ******************************************************************************* + * + * @sa CHAMELEON_zgenm2 + * @sa CHAMELEON_zgenm2_Tile + * @sa CHAMELEON_clange_Tile_Async + * @sa CHAMELEON_dlange_Tile_Async + * @sa CHAMELEON_slange_Tile_Async + * + */ +int CHAMELEON_zgenm2_Tile_Async( double tol, CHAM_desc_t *A, double *value, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zgenm2_Tile", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + if (sequence == NULL) { + chameleon_fatal_error("CHAMELEON_zgenm2_Tile", "NULL sequence"); + return CHAMELEON_ERR_UNALLOCATED; + } + if (request == NULL) { + chameleon_fatal_error("CHAMELEON_zgenm2_Tile", "NULL request"); + return CHAMELEON_ERR_UNALLOCATED; + } + /* Check sequence status */ + if (sequence->status == CHAMELEON_SUCCESS) { + request->status = CHAMELEON_SUCCESS; + } + else { + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_SEQUENCE_FLUSHED); + } + + /* Check descriptors for correctness */ + if (chameleon_desc_check(A) != CHAMELEON_SUCCESS) { + chameleon_error("CHAMELEON_zgenm2_Tile_Async", "invalid descriptor"); + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); + } + /* Check input arguments */ + if (A->nb != A->mb) { + chameleon_error("CHAMELEON_zgenm2_Tile_Async", "only square tiles supported"); + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); + } + if ( tol <= 0. ) { + chameleon_error("CHAMELEON_zgenm2_Tile_Async", "tol must be > 0"); + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); + } + + /* Quick return */ + if (chameleon_min(A->m, A->n) == 0) { + *value = 0.0; + return CHAMELEON_SUCCESS; + } + + chameleon_pzgenm2( tol, A, value, sequence, request ); + + return CHAMELEON_SUCCESS; +} diff --git a/control/compute_z.h b/control/compute_z.h index ae8596e16e6462080cac77ecd46e7314e8f1296f..c17553091863da09e097ff25d09f31dc4b63ddab 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -39,6 +39,7 @@ void chameleon_pzgebrd_gb2bd(cham_uplo_t uplo, CHAM_desc_t *A, double *D, double void chameleon_pzgebrd_ge2gb( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +void chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); diff --git a/include/chameleon/chameleon_z.h b/include/chameleon/chameleon_z.h index 1b7ca622e2309503e526520b3a9a2efb1ba8b02a..7ebcff28f37b4027a16421fa1302f37b718dc28e 100644 --- a/include/chameleon/chameleon_z.h +++ b/include/chameleon/chameleon_z.h @@ -39,6 +39,7 @@ int CHAMELEON_zgeadd(cham_trans_t trans, int M, int N, CHAMELEON_Complex64_t alp int CHAMELEON_zgelqf(int M, int N, CHAMELEON_Complex64_t *A, int LDA, CHAM_desc_t *descT); int CHAMELEON_zgelqs(int M, int N, int NRHS, CHAMELEON_Complex64_t *A, int LDA, CHAM_desc_t *descT, CHAMELEON_Complex64_t *B, int LDB); int CHAMELEON_zgels(cham_trans_t trans, int M, int N, int NRHS, CHAMELEON_Complex64_t *A, int LDA, CHAM_desc_t *descT, CHAMELEON_Complex64_t *B, int LDB); +double CHAMELEON_zgenm2( double tol, int M, int N, CHAMELEON_Complex64_t *A, int LDA ); int CHAMELEON_zgemm(cham_trans_t transA, cham_trans_t transB, int M, int N, int K, CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t *A, int LDA, CHAMELEON_Complex64_t *B, int LDB, CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t *C, int LDC); int CHAMELEON_zgeqrf(int M, int N, CHAMELEON_Complex64_t *A, int LDA, CHAM_desc_t *descT); int CHAMELEON_zgeqrs(int M, int N, int NRHS, CHAMELEON_Complex64_t *A, int LDA, CHAM_desc_t *descT, CHAMELEON_Complex64_t *B, int LDB); @@ -114,6 +115,7 @@ int CHAMELEON_zgeadd_Tile(cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_ int CHAMELEON_zgelqf_Tile(CHAM_desc_t *A, CHAM_desc_t *T); int CHAMELEON_zgelqs_Tile(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *B); int CHAMELEON_zgels_Tile(cham_trans_t trans, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *B); +double CHAMELEON_zgenm2_Tile( double tol, CHAM_desc_t *A ); int CHAMELEON_zgemm_Tile(cham_trans_t transA, cham_trans_t transB, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C); int CHAMELEON_zgeqrf_Tile(CHAM_desc_t *A, CHAM_desc_t *T); int CHAMELEON_zgeqrs_Tile(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *B); @@ -186,6 +188,7 @@ int CHAMELEON_zgeadd_Tile_Async(cham_trans_t trans, CHAMELEON_Complex64_t alpha, int CHAMELEON_zgelqf_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *T, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgelqs_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgels_Tile_Async(cham_trans_t trans, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +int CHAMELEON_zgenm2_Tile_Async( double tol, CHAM_desc_t *A, double *value, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); int CHAMELEON_zgemm_Tile_Async(cham_trans_t transA, cham_trans_t transB, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgeqrf_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *T, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgeqrs_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt index f6591433f46729ac378b23e979cb3d9ba2feb8c6..a233e6d909f773a50ce159e6990ed8b0516fb69c 100644 --- a/testing/CMakeLists.txt +++ b/testing/CMakeLists.txt @@ -74,6 +74,7 @@ set(ZSRC testing_zsytrf.c testing_zsytrs.c testing_zsysv.c + testing_zgenm2.c testing_zgetrf.c testing_zgetrs.c testing_zgesv.c diff --git a/testing/CTestLists.cmake b/testing/CTestLists.cmake index 986fd4faf5974bd2be4e62e8c80f632465f266af..82bef26df95796b6a2c9aa2708126666adf0d362 100644 --- a/testing/CTestLists.cmake +++ b/testing/CTestLists.cmake @@ -64,6 +64,8 @@ if (NOT CHAMELEON_SIMULATION) #geqrs_hqr gelqs_hqr gels gels_hqr ) + set( TESTS ${TESTS} + genm2 ) foreach(cat ${TEST_CATEGORIES}) foreach(gpus ${N_GPUS}) diff --git a/testing/chameleon_ztesting.c b/testing/chameleon_ztesting.c index 0d6d6e5d93d4ff1c1251f1994ad07ffcb04e28d3..82e3969202896be9c23196fc722c83c6ea182c24 100644 --- a/testing/chameleon_ztesting.c +++ b/testing/chameleon_ztesting.c @@ -69,6 +69,11 @@ static parameter_t parameters[] = { { "seedB", "Seed for the matrix B random generation", 'Y', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 11, TestValInt, {0}, NULL, pread_int, sprint_int }, { "seedC", "Seed for the matrix C random generation", 'Z', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 11, TestValInt, {0}, NULL, pread_int, sprint_int }, + { NULL, "Matrix generation numerical parameters", 0, PARAM_OPTION, 0, 0, 0, {0}, NULL, NULL, NULL }, + { "bump", "Bump value to make a matrix diagonal dominant", 'z', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 13, TestValComplex64, {0}, NULL, pread_complex64, sprint_complex64 }, + { "mode", "Mode that specifies the eigen/singular values in xlatms", -30, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 4, TestValInt, {0}, NULL, pread_int, sprint_int }, + { "cond", "Conditional number of the matrix used by xlatms", -31, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 13, TestValDouble, {0}, NULL, pread_double, sprint_double }, + { NULL, "Operation specific parameters", 0, PARAM_OPTION, 0, 0, 0, {0}, NULL, NULL, NULL }, { "trans", "Value of the trans parameter", -11, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 9, TestTrans, {0}, NULL, pread_trans, sprint_trans }, { "transA", "Value of the transA parameter", -12, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 9, TestTrans, {0}, NULL, pread_trans, sprint_trans }, @@ -79,9 +84,8 @@ static parameter_t parameters[] = { { "norm", "Value of the norm parameter", -17, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 4, TestNormtype, {0}, NULL, pread_norm, sprint_norm }, { NULL, "Operation specific scalar", 0, PARAM_OPTION, 0, 0, 0, {0}, NULL, NULL, NULL }, - { "alpha", "Value of the scalar alpha", 'x', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 13, TestValComplex64, {0}, NULL, pread_complex64, sprint_complex64 }, - { "beta", "Value of the scalar beta", 'y', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 13, TestValComplex64, {0}, NULL, pread_complex64, sprint_complex64 }, - { "bump", "Bump value to make a matrix diagonal dominant", 'z', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 13, TestValComplex64, {0}, NULL, pread_complex64, sprint_complex64 }, + { "alpha", "Value of the scalar alpha", 'x', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 13, TestValComplex64, {0}, NULL, pread_complex64, sprint_complex64 }, + { "beta", "Value of the scalar beta", 'y', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 13, TestValComplex64, {0}, NULL, pread_complex64, sprint_complex64 }, { NULL, "QR/LQ parameters", 0, PARAM_OPTION, 0, 0, 0, {0}, NULL, NULL, NULL }, { "qra", "Size of TS domain (=RH for householder trees)", -20, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 3, TestValInt, {0}, NULL, pread_int, sprint_int }, diff --git a/testing/input/genm2.in b/testing/input/genm2.in new file mode 100644 index 0000000000000000000000000000000000000000..dc29aa220c8af8dc0bb4a4e0b8e31c05d298f395 --- /dev/null +++ b/testing/input/genm2.in @@ -0,0 +1,20 @@ +# You can enumerate each parameter's values as an explicit list separated by commas or by a range start:end[:step] +# Not given parameters will receive default values + +# GENM2 +# mtxfmt +# nb: Tile size +# M: Number of rows of matrix A +# N: Number of columns of matrix A +# LDA: Leading dimension of matrix A +# cond: The condition number +# mode: the mode values for latms + +op = genm2 +mtxfmt = 0, 1 +nb = 16, 17 +m = 15, 25, 37 +n = 13, 23, 35 +lda = 41 +cond = 1., 1.e13 +mode = 1:6 diff --git a/testing/testing_zgenm2.c b/testing/testing_zgenm2.c new file mode 100644 index 0000000000000000000000000000000000000000..b396d9fee6ee2a4c51542b3bc2a80102b9d72710 --- /dev/null +++ b/testing/testing_zgenm2.c @@ -0,0 +1,140 @@ +/** + * + * @file testing_zgenm2.c + * + * @copyright 2019-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgenm2 testing + * + * @version 1.0.0 + * @author Lucas Barros de Assis + * @author Mathieu Faverge + * @date 2020-10-05 + * @precisions normal z -> c d s + * + */ +#include <chameleon.h> +#if !defined(CHAMELEON_SIMULATION) +#include <coreblas/lapacke.h> +#include <coreblas/cblas.h> +#include <coreblas.h> +#endif +#include "testings.h" +#include "testing_zcheck.h" +#include "flops.h" + +static cham_fixdbl_t +flops_zgenm2( int M, int N ) +{ + double coefabs = 1.; +#if defined( PRECISION_z ) || defined( PRECISION_c ) + coefabs = 3.; +#endif + + return coefabs * M * N; +} + +int +testing_zgenm2( run_arg_list_t *args, int check ) +{ + int hres = 0; + double norm; + CHAM_desc_t *descA; + double *D, dmax = 1.; + + /* Reads arguments */ + intptr_t mtxfmt = parameters_getvalue_int( "mtxfmt" ); + int nb = run_arg_get_int( args, "nb", 320 ); + int P = parameters_getvalue_int( "P" ); + int N = run_arg_get_int( args, "N", 1000 ); + int M = run_arg_get_int( args, "M", N ); + int LDA = run_arg_get_int( args, "LDA", M ); + int seedA = run_arg_get_int( args, "seedA", random() ); + int Q = parameters_compute_q( P ); + int minMN = chameleon_min( M, N ); + double cond = run_arg_get_double( args, "cond", 1.e16 ); + int mode = run_arg_get_int( args, "mode", 4 ); + double tol = 1.e-1; + cham_fixdbl_t t, gflops; + cham_fixdbl_t flops = flops_zgenm2( M, N ); + + CHAMELEON_Set( CHAMELEON_TILE_SIZE, nb ); + + /* Generate the diagonal of eigen/singular values */ + D = malloc( minMN * sizeof(double) ); +#if !defined(CHAMELEON_SIMULATION) + hres = CORE_dlatm1( mode, cond, 0, ChamDistUniform, seedA, D, minMN ); + if ( hres != 0 ) { + free( D ); + return hres; + } + + /* Save the largest absolute value */ + hres = cblas_idamax( minMN, D, 1 ); + dmax = fabs( D[hres] ); +#endif + + /* Creates the matrix */ + CHAMELEON_Desc_Create( + &descA, (void*)(-mtxfmt), ChamComplexDouble, nb, nb, nb * nb, LDA, N, 0, 0, M, N, P, Q ); + + /* Fills the matrix with random values */ + hres = CHAMELEON_zlatms_Tile( + ChamDistUniform, seedA, ChamNonsymPosv, D, 0, cond, 0., descA ); + free( D ); + if ( hres != 0 ) { + return hres; + } + + /* Calculates the norm */ + START_TIMING( t ); + norm = CHAMELEON_zgenm2_Tile( tol, descA ); + STOP_TIMING( t ); + gflops = flops * 1.e-9 / t; + run_arg_add_fixdbl( args, "time", t ); + run_arg_add_fixdbl( args, "gflops", ( norm >= 0. ) ? gflops : -1. ); + + /* Checks the solution */ + hres = 0; + if ( check ) { + double res = fabs(dmax - norm) / (dmax * tol); + + run_arg_add_double( args, "||A||", dmax ); + run_arg_add_double( args, "||B||", norm ); + run_arg_add_double( args, "||R||", res ); + + if ( isnan(res) || isinf(res) || (res > 10.0) ) { + hres = 1; + } + } + + CHAMELEON_Desc_Destroy( &descA ); + + return hres; +} + +testing_t test_zgenm2; +const char *zgenm2_params[] = { "mtxfmt", "nb", "m", "n", "lda", "seedA", "cond", "mode", NULL }; +const char *zgenm2_output[] = { NULL }; +const char *zgenm2_outchk[] = { "||A||", "||B||", "||R||", "RETURN", NULL }; + +/** + * @brief Testing registration function + */ +void testing_zgenm2_init( void ) __attribute__( ( constructor ) ); +void +testing_zgenm2_init( void ) +{ + test_zgenm2.name = "zgenm2"; + test_zgenm2.helper = "General matrix two-norm estimator"; + test_zgenm2.params = zgenm2_params; + test_zgenm2.output = zgenm2_output; + test_zgenm2.outchk = zgenm2_outchk; + test_zgenm2.fptr = testing_zgenm2; + test_zgenm2.next = NULL; + + testing_register( &test_zgenm2 ); +}