diff --git a/cmake_modules/local_subs.py b/cmake_modules/local_subs.py index a4cb1fc264233227fa012b5ce8f38907bdfb0a53..18a79e3000ca3c1018d49ce7d34254c7baae3eae 100644 --- a/cmake_modules/local_subs.py +++ b/cmake_modules/local_subs.py @@ -20,6 +20,8 @@ _extra_blas = [ ('', 'slaran', 'dlaran', 'slaran', 'dlaran' ), ('', 'slatm1', 'dlatm1', 'clatm1', 'zlatm1' ), ('', 'slatm1', 'dlatm1', 'slatm1', 'dlatm1' ), + ('', 'sgenm2', 'dgenm2', 'cgenm2', 'zgenm2' ), + ('', 'slag2c_fake', 'dlag2z_fake', 'slag2c', 'dlag2z' ), ] _extra_BLAS = [ [ x.upper() for x in row ] for row in _extra_blas ] diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt index cdf6d90212e850c6ccfb3444d8e2ae1dbf9de42b..9f2c8ab6f22fddcca9a9eb7ecba132dc32a23e50 100644 --- a/compute/CMakeLists.txt +++ b/compute/CMakeLists.txt @@ -251,6 +251,9 @@ set(ZSRC ################## zgram.c pzgram.c + ################## + zgenm2.c + pzgenm2.c ) precisions_rules_py(CHAMELEON_SRCS_GENERATED "${ZSRC}" diff --git a/compute/pzgenm2.c b/compute/pzgenm2.c new file mode 100644 index 0000000000000000000000000000000000000000..e054fa2a0e515009b801c6f67382d65c9c9cff5a --- /dev/null +++ b/compute/pzgenm2.c @@ -0,0 +1,418 @@ +/** + * + * @file pzgenm2.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * @copyright 2016-2020 KAUST. All rights reserved. + * + *** + * + * @brief Chameleon parallel algorithm to compute 2-norm estimator. + * + * @version 1.0.0 + * @author Mathieu Faverge + * @author Dalal Sukkari + * @date 2020-10-06 + * @precisions normal z -> s d c + * + */ +#include "control/common.h" + +#define A(m, n) A, m, n +#define X(m, n) &X, m, n +#define SX(m, n) &SX, m, n +#define NRMX( m, n) &NRMX, m, n +#define NRMSX(m, n) &NRMSX, m, n +#define DROW(m, n) &DROW, m, n + +void +chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + RUNTIME_option_t options; + CHAM_desc_t X, SX, NRMX, NRMSX, DROW; + int m, n, k; + int myp = A->myrank / A->q; + int myq = A->myrank % A->q; + int tempmm, tempnn; + int cnt, maxiter; + double e0, normx, normsx, beta, scl; + + chamctxt = chameleon_context_self(); + if ( sequence->status != CHAMELEON_SUCCESS ) { + return; + } + RUNTIME_options_init( &options, chamctxt, sequence, request ); + + /* Initialize the result */ + *result = 0.0; + + /* Init workspace handle for the call to dlange but unused */ + RUNTIME_options_ws_alloc( &options, 1, 0 ); + + /** + * DROW must be allocated with GLOBAL to avoid destruction bewteen the first + * stage and the second one during Flush. + * This is the same issue for X and SX to be reused from one iteration to another. + */ + chameleon_desc_init( &DROW, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 1, A->nb, A->nb, + A->p, A->n, 0, 0, A->p, A->n, A->p, A->q, + NULL, NULL, NULL ); + /** + * NRMX must be allocated with GLOBAL to be able to access the norm value + * after flushing the descriptor. + * This is the same issue for NRMSX. + */ + chameleon_desc_init( &NRMX, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 2, 1, 2, + A->p * 2, A->q, 0, 0, A->p * 2, A->q, A->p, A->q, + NULL, NULL, NULL ); + + /** + * Start by computing the initial vector of the iterative process, and that + * is used to compute the initial norm. + * + * For j=0,n-1, drow[j] = sum( |A_{i,j}|, i=0..m-1 ) + * So drow[j] = sum( S_{p,j}, p=0..P-1 ) with S_{p,j} = sum( |A_{i,j}|, i=0..m-1 \ i%P = p ) + * + */ + for(n = myq; n < A->nt; n += A->q) { + tempnn = n == A->nt-1 ? A->n - n * A->nb : A->nb; + + /* Zeroes the local intermediate vector */ + INSERT_TASK_dlaset( + &options, + ChamUpperLower, 1, tempnn, + 0., 0., + DROW( myp, n ) ); + + /* Computes the sums of the local tiles into the local vector */ + for(m = myp; m < A->mt; m += A->p) { + tempmm = m == A->mt-1 ? A->m - m * A->mb : A->mb; + INSERT_TASK_dzasum( + &options, + ChamColumnwise, ChamUpperLower, tempmm, tempnn, + A(m, n), DROW( myp, n ) ); + } + + /* Reduce on first row of nodes */ + for(m = 1; m < A->p; m++) { + INSERT_TASK_daxpy( + &options, tempnn, 1., + DROW( m, n ), 1, + DROW( 0, n ), 1 ); + } + } + + /** + * Reduce now by columns on processes with (myp == 0) + */ + if ( myp == 0 ) + { + INSERT_TASK_dlaset( + &options, + ChamUpperLower, NRMX.mb, NRMX.nb, + 1., 0., + NRMX( myp, myq ) ); + + for( n = myq; n < A->nt; n += A->q ) { + tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + INSERT_TASK_dgessq( + &options, ChamEltwise, 1, tempnn, + DROW( myp, n ), + NRMX( myp, myq ) ); + } + + /* Reduce on first row of nodes */ + for(n = 1; n < A->q; n++) { + INSERT_TASK_dplssq( + &options, ChamEltwise, 1, 1, + NRMX( myp, n ), + NRMX( myp, 0 ) ); + } + + INSERT_TASK_dplssq2( + &options, 1, NRMX( myp, 0 ) ); + } + + /* Bcast norm over processes from node (0,0) */ + for(m = 0; m < A->p; m++) { + for(n = 0; n < A->q; n++) { + if ( (m != 0) || (n != 0) ) { + INSERT_TASK_dlacpy( + &options, + ChamUpperLower, 1, 1, 1, + NRMX(0, 0), + NRMX(m, n) ); + } + } + } + + CHAMELEON_Desc_Flush( &DROW, sequence ); + CHAMELEON_Desc_Flush( &NRMX, sequence ); + CHAMELEON_Desc_Flush( A, sequence ); + RUNTIME_sequence_wait( chamctxt, sequence ); + *result = *((double *)(NRMX.get_blkaddr( &NRMX, myp, myq ))); + + if ( (*result) == 0.0 ) { + chameleon_desc_destroy( &DROW ); + chameleon_desc_destroy( &NRMX ); + return; + } else { + normx = *result; + } + + chameleon_desc_init( &NRMSX, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 2, 1, 2, + A->p * 2, A->q, 0, 0, A->p * 2, A->q, A->p, A->q, + NULL, NULL, NULL ); + chameleon_desc_init( &X, CHAMELEON_MAT_ALLOC_GLOBAL, ChamComplexDouble, 1, A->nb, A->nb, + A->p, A->n, 0, 0, A->p, A->n, A->p, A->q, + NULL, NULL, NULL ); + chameleon_desc_init( &SX, CHAMELEON_MAT_ALLOC_GLOBAL, ChamComplexDouble, A->mb, 1, A->mb, + A->m, A->q, 0, 0, A->m, A->q, A->p, A->q, + NULL, NULL, NULL ); + + cnt = 0; + e0 = 0.; + maxiter = chameleon_min( 100, A->n ); + + while ( (cnt < maxiter) && + (fabs(*result - e0) > (tol * (*result))) ) + { + e0 = *result; + + /* Initialization of X in the first loop */ + if ( cnt == 0 ) + { + for (n = myq; n < A->nt; n += A->q) { + tempnn = n == A->nt-1 ? A->n - n * A->nb : A->nb; + + if ( myp == 0 ) { +#if defined(PRECISION_z) || defined(PRECISION_c) + INSERT_TASK_dlag2z( + &options, + ChamUpperLower, 1, tempnn, + DROW( 0, n ), + X( 0, n ) ); +#else + INSERT_TASK_zlacpy( + &options, + ChamUpperLower, 1, tempnn, tempnn, + DROW( 0, n ), + X( 0, n ) ); +#endif + } + + /* Broadcast X */ + for (m = 1; m < A->p; m++) { + INSERT_TASK_zlacpy( + &options, + ChamUpperLower, 1, tempnn, tempnn, + X( 0, n ), + X( m, n ) ); + } + } + CHAMELEON_Desc_Flush( &DROW, sequence ); + } + + /** + * Scale x = x / ||A|| + * + * We scale the P replica of X, so each row of Q processes possess one + * copy of the scaled X. + */ + scl = 1. / e0; + for (n = myq; n < A->nt; n += A->q) { + tempnn = n == A->nt-1 ? A->n - n * A->nb : A->nb; + + INSERT_TASK_zlascal( + &options, + ChamUpperLower, 1, tempnn, tempnn, + scl, X( myp, n ) ); + } + + /** + * Compute Sx = S * x + */ + for(m = myp; m < A->mt; m+=A->p) { + tempmm = m == A->mt-1 ? A->m - m * A->mb : A->mb; + + for (n = myq; n < A->nt; n += A->q ) { + tempnn = n == A->nt-1 ? A->n - n * A->nb : A->nb; + beta = n == myq ? 0. : 1.; + + INSERT_TASK_zgemv( + &options, + ChamNoTrans, tempmm, tempnn, + 1., A( m, n ), + X( myp, n ), 1, + beta, SX( m, myq ), 1 ); + } + + /* Reduce columns */ + for (k = 1; k < chameleon_min( A->q, A->nt ); k++) { + INSERT_TASK_zaxpy( + &options, tempmm, 1., + SX( m, k ), 1, + SX( m, 0 ), 1 ); + } + /* Broadcast SX to ease the following gemv */ + for (k = 1; k < A->q; k++) { + INSERT_TASK_zlacpy( + &options, + ChamUpperLower, tempmm, 1, tempmm, + SX( m, 0 ), + SX( m, k ) ); + } + } + + /** + * Compute x = S' * S * x = S' * Sx + */ + for ( n = myq; n < A->nt; n += A->q ) { + tempnn = n == A->nt-1 ? A->n - n * A->nb : A->nb; + + for( m = myp; m < A->mt; m += A->p ) { + tempmm = m == A->mt-1 ? A->m - m * A->mb : A->mb; + beta = m == myp ? 0. : 1.; + + INSERT_TASK_zgemv( + &options, + ChamConjTrans, tempmm, tempnn, + 1., A( m, n ), + SX( m, myq ), 1, + beta, X( myp, n ), 1 ); + } + + /* Reduce rows */ + for (k = 1; k < chameleon_min( A->p, A->mt ); k++) { + INSERT_TASK_zaxpy( + &options, tempnn, 1., + X( k, n ), 1, + X( 0, n ), 1 ); + } + /* Broadcast */ + for (k = 1; k < A->p; k++) { + INSERT_TASK_zlacpy( + &options, + ChamUpperLower, 1, tempnn, tempnn, + X( 0, n ), + X( k, n ) ); + } + } + + /** + * Compute ||x|| + * + * All rows of Q nodes compute the same thing in parallel due to replication. + */ + { + INSERT_TASK_dlaset( + &options, + ChamUpperLower, NRMX.mb, NRMX.nb, + 1., 0., + NRMX( myp, myq ) ); + + for( n = myq; n < A->nt; n += A->q ) { + tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + + INSERT_TASK_zgessq( + &options, ChamEltwise, 1, tempnn, + X( myp, n ), + NRMX( myp, myq ) ); + } + + /* Reduce columns */ + for(n = 1; n < chameleon_min( A->q, A->nt ); n++) { + INSERT_TASK_dplssq( + &options, ChamEltwise, 1, 1, + NRMX( myp, n ), + NRMX( myp, 0 ) ); + } + + INSERT_TASK_dplssq2( + &options, 1, NRMX( myp, 0 ) ); + + /* Broadcast the results to processes in the same row */ + for(n = 1; n < A->q; n++) { + INSERT_TASK_dlacpy( + &options, + ChamUpperLower, 1, 1, 1, + NRMX( myp, 0 ), + NRMX( myp, n ) ); + } + } + + /** + * Compute ||Sx|| + * + * All columns of P nodes compute the same thing in parallel due to replication. + */ + { + INSERT_TASK_dlaset( + &options, + ChamUpperLower, NRMSX.mb, NRMSX.nb, + 1., 0., + NRMSX( myp, myq ) ); + + for( m = myp; m < A->mt; m += A->p ) { + tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + INSERT_TASK_zgessq( + &options, ChamEltwise, tempmm, 1, + SX( m, myq ), + NRMSX( myp, myq ) ); + } + + /* Reduce rows */ + for( m = 1; m < chameleon_min( A->p, A->mt ); m++ ) { + INSERT_TASK_dplssq( + &options, ChamEltwise, 1, 1, + NRMSX( m, myq ), + NRMSX( 0, myq ) ); + } + + INSERT_TASK_dplssq2( + &options, 1, NRMSX( 0, myq ) ); + + /* Broadcast the results to processes in the same column */ + for(m = 1; m < A->p; m++) { + INSERT_TASK_dlacpy( + &options, + ChamUpperLower, 1, 1, 1, + NRMSX( 0, myq ), + NRMSX( m, myq ) ); + } + } + + CHAMELEON_Desc_Flush( &NRMX, sequence ); + CHAMELEON_Desc_Flush( &NRMSX, sequence ); + CHAMELEON_Desc_Flush( &X, sequence ); + CHAMELEON_Desc_Flush( &SX, sequence ); + CHAMELEON_Desc_Flush( A, sequence ); + RUNTIME_sequence_wait(chamctxt, sequence); + normx = *((double *)(NRMX.get_blkaddr( &NRMX, myp, myq ))); + normsx = *((double *)(NRMSX.get_blkaddr( &NRMSX, myp, myq ))); + + *result = normx / normsx; + cnt++; + } + + if ( (cnt >= maxiter) && + (fabs((*result) - e0) > (tol * (*result))) ) { + *result = INFINITY; + } + + RUNTIME_options_ws_free( &options ); + RUNTIME_options_finalize( &options, chamctxt ); + + chameleon_desc_destroy( &DROW ); + chameleon_desc_destroy( &NRMX ); + chameleon_desc_destroy( &NRMSX ); + chameleon_desc_destroy( &X ); + chameleon_desc_destroy( &SX ); + + return; +} diff --git a/compute/zgenm2.c b/compute/zgenm2.c new file mode 100644 index 0000000000000000000000000000000000000000..7f27a44d647a461d7e2e8bd76d4787310ec6d322 --- /dev/null +++ b/compute/zgenm2.c @@ -0,0 +1,266 @@ +/** + * + * @file zgenm2.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * @copyright 2016-2020 KAUST. All rights reserved. + * + *** + * + * @brief Chameleon zgenm2 wrappers + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2020-10-06 + * @precisions normal z -> s d c + * + */ +#include "control/common.h" + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * @brief Returns an estimate of the two-norm of A. + * + ******************************************************************************* + * + * @param[in] tol + * Specify the tolerance used in the iterative algorithm to converge to + * an estimation. Must be > 0. + * + * @param[in] M + * The number of rows of the matrix A. M >= 0. When M = 0, + * the returned value is set to zero. + * + * @param[in] N + * The number of columns of the matrix A. N >= 0. When N = 0, + * the returned value is set to zero. + * + * @param[in] A + * The M-by-N matrix A. + * + * @param[in] LDA + * The leading dimension of the array A. LDA >= max(1,M). + * + ******************************************************************************* + * + * @retval the two-norm estimate. + * + ******************************************************************************* + * + * @sa CHAMELEON_zgenm2_Tile + * @sa CHAMELEON_zgenm2_Tile_Async + * @sa CHAMELEON_clange + * @sa CHAMELEON_dlange + * @sa CHAMELEON_slange + * + */ +double CHAMELEON_zgenm2( double tol, int M, int N, + CHAMELEON_Complex64_t *A, int LDA ) +{ + int NB; + int status; + double value = -1.; + CHAM_context_t *chamctxt; + RUNTIME_sequence_t *sequence = NULL; + RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + CHAM_desc_t descAl, descAt; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zgenm2", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + /* Check input arguments */ + if (M < 0) { + chameleon_error("CHAMELEON_zgenm2", "illegal value of M"); + return -1; + } + if (N < 0) { + chameleon_error("CHAMELEON_zgenm2", "illegal value of N"); + return -2; + } + if (LDA < chameleon_max(1, M)) { + chameleon_error("CHAMELEON_zgenm2", "illegal value of LDA"); + return -4; + } + + /* Quick return */ + if (chameleon_min(N, M) == 0) { + return (double)0.0; + } + + /* Tune NB depending on M, N & NRHS; Set NBNB */ + status = chameleon_tune(CHAMELEON_FUNC_ZGEMM, M, N, 0); + if (status != CHAMELEON_SUCCESS) { + chameleon_error("CHAMELEON_zgenm2", "chameleon_tune() failed"); + return status; + } + + /* Set NT */ + NB = CHAMELEON_NB; + + chameleon_sequence_create( chamctxt, &sequence ); + + /* Submit the matrix conversion */ + chameleon_zlap2tile( chamctxt, &descAl, &descAt, ChamDescInput, ChamUpperLower, + A, NB, NB, LDA, N, M, N, sequence, &request ); + + /* Call the tile interface */ + CHAMELEON_zgenm2_Tile_Async( tol, &descAt, &value, sequence, &request ); + + /* Submit the matrix conversion back */ + chameleon_ztile2lap( chamctxt, &descAl, &descAt, + ChamDescInput, ChamUpperLower, sequence, &request ); + + chameleon_sequence_wait( chamctxt, sequence ); + + /* Cleanup the temporary data */ + chameleon_ztile2lap_cleanup( chamctxt, &descAl, &descAt ); + + chameleon_sequence_destroy( chamctxt, sequence ); + return value; +} + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile + * + * @brief Tile equivalent of CHAMELEON_zgenm2(). + * + * Operates on matrices stored by tiles. + * All matrices are passed through descriptors. + * All dimensions are taken from the descriptors. + * + ******************************************************************************* + * + * @param[in] tol + * Specify the tolerance used in the iterative algorithm to converge to + * an estimation. Must be > 0. + * + * @param[in] A + * On entry, the input matrix A. + * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * + ******************************************************************************* + * + * @sa CHAMELEON_zgenm2 + * @sa CHAMELEON_zgenm2_Tile_Async + * @sa CHAMELEON_clange_Tile + * @sa CHAMELEON_dlange_Tile + * @sa CHAMELEON_slange_Tile + * + */ +double CHAMELEON_zgenm2_Tile( double tol, CHAM_desc_t *A ) +{ + CHAM_context_t *chamctxt; + RUNTIME_sequence_t *sequence = NULL; + RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + int status; + double value = -1.; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zgenm2_Tile", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + chameleon_sequence_create( chamctxt, &sequence ); + + CHAMELEON_zgenm2_Tile_Async( tol, A, &value, sequence, &request ); + + CHAMELEON_Desc_Flush( A, sequence ); + + chameleon_sequence_wait( chamctxt, sequence ); + status = sequence->status; + chameleon_sequence_destroy( chamctxt, sequence ); + return ( status == CHAMELEON_SUCCESS ) ? value : (double)status; +} + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile_Async + * + * @brief Non-blocking equivalent of CHAMELEON_zgenm2_Tile(). + * + * May return before the computation is finished. + * Allows for pipelining of operations at runtime. + * + ******************************************************************************* + * + * @param[in] sequence + * Identifies the sequence of function calls that this call belongs to + * (for completion checks and exception handling purposes). + * + * @param[out] request + * Identifies this function call (for exception handling purposes). + * + ******************************************************************************* + * + * @sa CHAMELEON_zgenm2 + * @sa CHAMELEON_zgenm2_Tile + * @sa CHAMELEON_clange_Tile_Async + * @sa CHAMELEON_dlange_Tile_Async + * @sa CHAMELEON_slange_Tile_Async + * + */ +int CHAMELEON_zgenm2_Tile_Async( double tol, CHAM_desc_t *A, double *value, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zgenm2_Tile", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + if (sequence == NULL) { + chameleon_fatal_error("CHAMELEON_zgenm2_Tile", "NULL sequence"); + return CHAMELEON_ERR_UNALLOCATED; + } + if (request == NULL) { + chameleon_fatal_error("CHAMELEON_zgenm2_Tile", "NULL request"); + return CHAMELEON_ERR_UNALLOCATED; + } + /* Check sequence status */ + if (sequence->status == CHAMELEON_SUCCESS) { + request->status = CHAMELEON_SUCCESS; + } + else { + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_SEQUENCE_FLUSHED); + } + + /* Check descriptors for correctness */ + if (chameleon_desc_check(A) != CHAMELEON_SUCCESS) { + chameleon_error("CHAMELEON_zgenm2_Tile_Async", "invalid descriptor"); + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); + } + /* Check input arguments */ + if (A->nb != A->mb) { + chameleon_error("CHAMELEON_zgenm2_Tile_Async", "only square tiles supported"); + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); + } + if ( tol <= 0. ) { + chameleon_error("CHAMELEON_zgenm2_Tile_Async", "tol must be > 0"); + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); + } + + /* Quick return */ + if (chameleon_min(A->m, A->n) == 0) { + *value = 0.0; + return CHAMELEON_SUCCESS; + } + + chameleon_pzgenm2( tol, A, value, sequence, request ); + + return CHAMELEON_SUCCESS; +} diff --git a/compute/zlange.c b/compute/zlange.c index 93f8208e1d85a80d6ac7faa30429346d18f22654..418d44bfbb9f3c6ead899c32f4b9c6e374a167c4 100644 --- a/compute/zlange.c +++ b/compute/zlange.c @@ -6,16 +6,16 @@ * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. + * @copyright 2016-2020 KAUST. All rights reserved. * *** * * @brief Chameleon zlange wrappers * * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge - * @date 2020-03-03 + * @author Hatem Ltaief + * @date 2020-10-14 * @precisions normal z -> s d c * */ @@ -26,8 +26,9 @@ * * @ingroup CHAMELEON_Complex64_t * - * CHAMELEON_zlange returns the value + * @brief Return the norm of A. * + * CHAMELEON_zlange returns the value * zlange = ( max(abs(A(i,j))), NORM = ChamMaxNorm * ( * ( norm1(A), NORM = ChamOneNorm @@ -35,12 +36,17 @@ * ( normI(A), NORM = ChamInfNorm * ( * ( normF(A), NORM = ChamFrobeniusNorm + * ( + * ( norm2(A), NORM = ChamTwoNorm + * + * where norm1 denotes the one norm of a matrix (maximum column sum), normI + * denotes the infinity norm of a matrix (maximum row sum), normF denotes the + * Frobenius norm of a matrix (square root of sum of squares) and norm2 denotes + * an estimator of the maximum singular value. Note that max(abs(A(i,j))) is + * not a consistent matrix norm. * - * where norm1 denotes the one norm of a matrix (maximum column sum), - * normI denotes the infinity norm of a matrix (maximum row sum) and - * normF denotes the Frobenius norm of a matrix (square root of sum - * of squares). Note that max(abs(A(i,j))) is not a consistent matrix - * norm. + * Note that ChamTwoNorm returns an estimate as computed by CHAMELEON_zgenm2() + * with a tolerance of 1.e-1. * ******************************************************************************* * @@ -49,6 +55,7 @@ * = ChamOneNorm: One norm * = ChamInfNorm: Infinity norm * = ChamFrobeniusNorm: Frobenius norm + * = ChamTwoNorm: Twonorm * * @param[in] M * The number of rows of the matrix A. M >= 0. When M = 0, @@ -77,8 +84,8 @@ * @sa CHAMELEON_slange * */ -double CHAMELEON_zlange(cham_normtype_t norm, int M, int N, - CHAMELEON_Complex64_t *A, int LDA ) +double CHAMELEON_zlange( cham_normtype_t norm, int M, int N, + CHAMELEON_Complex64_t *A, int LDA ) { int NB; int status; @@ -94,8 +101,12 @@ double CHAMELEON_zlange(cham_normtype_t norm, int M, int N, return CHAMELEON_ERR_NOT_INITIALIZED; } /* Check input arguments */ - if ( (norm != ChamMaxNorm) && (norm != ChamOneNorm) - && (norm != ChamInfNorm) && (norm != ChamFrobeniusNorm) ) { + if ( (norm != ChamMaxNorm) && + (norm != ChamOneNorm) && + (norm != ChamInfNorm) && + (norm != ChamFrobeniusNorm) && + (norm != ChamTwoNorm) ) + { chameleon_error("CHAMELEON_zlange", "illegal value of norm"); return -1; } @@ -130,14 +141,14 @@ double CHAMELEON_zlange(cham_normtype_t norm, int M, int N, /* Submit the matrix conversion */ chameleon_zlap2tile( chamctxt, &descAl, &descAt, ChamDescInput, ChamUpperLower, - A, NB, NB, LDA, N, M, N, sequence, &request ); + A, NB, NB, LDA, N, M, N, sequence, &request ); /* Call the tile interface */ CHAMELEON_zlange_Tile_Async( norm, &descAt, &value, sequence, &request ); /* Submit the matrix conversion back */ chameleon_ztile2lap( chamctxt, &descAl, &descAt, - ChamDescInput, ChamUpperLower, sequence, &request ); + ChamDescInput, ChamUpperLower, sequence, &request ); chameleon_sequence_wait( chamctxt, sequence ); @@ -153,10 +164,11 @@ double CHAMELEON_zlange(cham_normtype_t norm, int M, int N, * * @ingroup CHAMELEON_Complex64_t_Tile * - * CHAMELEON_zlange_Tile - Tile equivalent of CHAMELEON_zlange(). - * Operates on matrices stored by tiles. - * All matrices are passed through descriptors. - * All dimensions are taken from the descriptors. + * @brief Tile equivalent of CHAMELEON_zlange(). + * + * Operates on matrices stored by tiles. + * All matrices are passed through descriptors. + * All dimensions are taken from the descriptors. * ******************************************************************************* * @@ -216,9 +228,11 @@ double CHAMELEON_zlange_Tile( cham_normtype_t norm, CHAM_desc_t *A ) * * @ingroup CHAMELEON_Complex64_t_Tile_Async * - * CHAMELEON_zlange_Tile_Async - Non-blocking equivalent of CHAMELEON_zlange_Tile(). - * May return before the computation is finished. - * Allows for pipelining of operations at runtime. + * @brief Non-blocking equivalent of CHAMELEON_zlange_Tile(). + * + * @Warning Note that this algorithm includes a RUNTIME_Sequence_wait to cleanup + * workspaces and thus, will return only when the result is computed. It does + * not allow to do pipelining. * ******************************************************************************* * @@ -239,7 +253,7 @@ double CHAMELEON_zlange_Tile( cham_normtype_t norm, CHAM_desc_t *A ) * */ int CHAMELEON_zlange_Tile_Async( cham_normtype_t norm, CHAM_desc_t *A, double *value, - RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_context_t *chamctxt; @@ -285,7 +299,12 @@ int CHAMELEON_zlange_Tile_Async( cham_normtype_t norm, CHAM_desc_t *A, double *v return CHAMELEON_SUCCESS; } - chameleon_pzlange_generic( norm, ChamUpperLower, ChamNonUnit, A, value, sequence, request ); + if ( norm == ChamTwoNorm ) { + chameleon_pzgenm2( 1.e-1, A, value, sequence, request ); + } + else { + chameleon_pzlange_generic( norm, ChamUpperLower, ChamNonUnit, A, value, sequence, request ); + } return CHAMELEON_SUCCESS; } diff --git a/control/compute_z.h b/control/compute_z.h index ae8596e16e6462080cac77ecd46e7314e8f1296f..c17553091863da09e097ff25d09f31dc4b63ddab 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -39,6 +39,7 @@ void chameleon_pzgebrd_gb2bd(cham_uplo_t uplo, CHAM_desc_t *A, double *D, double void chameleon_pzgebrd_ge2gb( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +void chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); diff --git a/control/descriptor.c b/control/descriptor.c index 51d2687303b026655966e0232ecb7a3f74bacf4f..6c424bfcb8014a1037484abfba12585d7dc330e7 100644 --- a/control/descriptor.c +++ b/control/descriptor.c @@ -906,8 +906,8 @@ int CHAMELEON_Desc_Release (CHAM_desc_t *desc) { * @retval CHAMELEON_SUCCESS successful exit * */ -int CHAMELEON_Desc_Flush( CHAM_desc_t *desc, - RUNTIME_sequence_t *sequence ) +int CHAMELEON_Desc_Flush( const CHAM_desc_t *desc, + const RUNTIME_sequence_t *sequence ) { RUNTIME_desc_flush( desc, sequence ); return CHAMELEON_SUCCESS; diff --git a/coreblas/compute/CMakeLists.txt b/coreblas/compute/CMakeLists.txt index b774d7dad0cef277d2d6029963a7974c61089dda..f0d4976c2b52febd93b491e2890e48f62d0a94e2 100644 --- a/coreblas/compute/CMakeLists.txt +++ b/coreblas/compute/CMakeLists.txt @@ -30,12 +30,14 @@ set(COREBLAS_SRCS_GENERATED "") set(ZSRC + core_dlag2z.c core_dzasum.c core_zaxpy.c core_zgeadd.c core_zlascal.c core_zlatm1.c core_zgelqt.c + core_zgemv.c core_zgemm.c core_zgeqrt.c core_zgesplit.c diff --git a/coreblas/compute/core_dlag2z.c b/coreblas/compute/core_dlag2z.c new file mode 100644 index 0000000000000000000000000000000000000000..498a73c5fbb6f23cca1b74ebe6f3fc8604238e3d --- /dev/null +++ b/coreblas/compute/core_dlag2z.c @@ -0,0 +1,98 @@ +/** + * + * @file core_dlag2z.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon core_dlag2z CPU kernel + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2020-10-05 + * @precisions normal z -> c + * + */ +#include "coreblas.h" + +/** + * + * @ingroup CORE_CHAMELEON_Complex64_t + * + * @brief Converts a m-by-n matrix A from double real to double complex. + * + ******************************************************************************* + * + * @param[in] M + * The number of rows of the matrix A. + * m >= 0. + * + * @param[in] N + * The number of columns of the matrix A. + * n >= 0. + * + * @param[in] A + * The lda-by-n matrix in double complex precision to convert. + * + * @param[in] LDA + * The leading dimension of the matrix A. + * lda >= max(1,m). + * + * @param[out] B + * On exit, the converted LDB-by-n matrix in single complex precision. + * + * @param[in] LDB + * The leading dimension of the matrix As. + * ldas >= max(1,m). + * + */ +void +CORE_dlag2z( cham_uplo_t uplo, int M, int N, + const double *A, int lda, + CHAMELEON_Complex64_t *B, int ldb ) +{ + const double *Aptr; + CHAMELEON_Complex64_t *Bptr; + int i, j; + + if ( (uplo != ChamUpperLower) && + (uplo != ChamUpper) && + (uplo != ChamLower)) + { + coreblas_error(1, "illegal value of uplo"); + return; + } + + if (M < 0) { + coreblas_error(2, "Illegal value of m"); + return; + } + if (N < 0) { + coreblas_error(3, "Illegal value of n"); + return; + } + if ( (lda < chameleon_max(1,M)) && (M > 0) ) { + coreblas_error(5, "Illegal value of lda"); + return; + } + if ( (ldb < chameleon_max(1,M)) && (M > 0) ) { + coreblas_error(7, "Illegal value of ldb"); + return; + } + + for(j=0; j<N; j++) { + int mmin = ( uplo == ChamLower ) ? j : 0; + int mmax = ( uplo == ChamUpper ) ? chameleon_min(j+1, M) : M; + + Aptr = A + lda * j + mmin; + Bptr = B + ldb * j + mmin; + + for(i=mmin; i<mmax; i++, Aptr++, Bptr++) { + *Bptr = (CHAMELEON_Complex64_t)(*Aptr); + } + } +} diff --git a/coreblas/compute/core_zgemv.c b/coreblas/compute/core_zgemv.c new file mode 100644 index 0000000000000000000000000000000000000000..8c25d3dbab4ca76b80f9de2d73045cbf5b93ef56 --- /dev/null +++ b/coreblas/compute/core_zgemv.c @@ -0,0 +1,98 @@ +/** + * + * @file core_zgemv.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon core_zgemv CPU kernel + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2020-10-06 + * @precisions normal z -> c d s + * + */ +#include "coreblas.h" + +/** + ******************************************************************************* + * + * @ingroup CORE_CHAMELEON_Complex64_t + * + * Performs one of the matrix-vector operations + * + * \f[ y = \alpha [op( A ) \times x] + \beta y \f], + * + * where op( A ) is one of: + * \f[ op( A ) = A, \f] + * \f[ op( A ) = A^T, \f] + * \f[ op( A ) = A^H, \f] + * + * alpha and beta are scalars, op(A) an m-by-n matrix, and x and y are two vectors. + * + ******************************************************************************* + * + * @param[in] trans + * - ChamNoTrans: A is not transposed, + * - ChamTrans: A is transposed, + * - ChamConjTrans: A is conjugate transposed. + * + * @param[in] M + * The number of rows of the matrix A. M >= 0. + * + * @param[in] N + * The number of columns of the matrix A. N >= 0. + * + * @param[in] alpha + * The scalar alpha. + * + * @param[in] A + * An lda-by-n matrix, where only the m-by-n leading entries are references. + * + * @param[in] LDA + * The leading dimension of the array A. LDA >= max(1,M). + * + * @param[in] x + * X is COMPLEX*16 array, dimension at least + * ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' + * and at least + * ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. + * Before entry, the incremented array X must contain the + * vector x. + * + * @param[in] incX + * On entry, INCX specifies the increment for the elements of + * X. INCX must not be zero. + * + * @param[in] beta + * The scalar beta. + * + * @param[in] y + * Y is COMPLEX*16 array, dimension at least + * ( 1 + ( n - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' + * and at least + * ( 1 + ( m - 1 )*abs( INCY ) ) otherwise. + * Before entry, the incremented array Y must contain the vector y with + * beta != 0. On exit, Y is overwritten by the updated vector. + * + * @param[in] incY + * On entry, INCY specifies the increment for the elements of + * Y. INCY must not be zero. + * + */ +void CORE_zgemv( cham_trans_t trans, int M, int N, + CHAMELEON_Complex64_t alpha, const CHAMELEON_Complex64_t *A, int LDA, + const CHAMELEON_Complex64_t *x, int incX, + CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t *y, int incY ) +{ + cblas_zgemv( + CblasColMajor, (CBLAS_TRANSPOSE)trans, M, N, + CBLAS_SADDR(alpha), A, LDA, + x, incX, + CBLAS_SADDR(beta), y, incY ); +} diff --git a/coreblas/compute/core_ztile.c b/coreblas/compute/core_ztile.c index 173855a3b5bd858e80c0aaec4c823a692f2e3345..f27cebfd2eaa0221e41afeb05ea7c25f6f035a89 100644 --- a/coreblas/compute/core_ztile.c +++ b/coreblas/compute/core_ztile.c @@ -18,6 +18,18 @@ #include "coreblas.h" #include "coreblas/coreblas_ztile.h" +#if defined( PRECISION_z ) || defined( PRECISION_c ) +void +TCORE_dlag2z( cham_uplo_t uplo, int M, int N, + const CHAM_tile_t *A, + CHAM_tile_t *B ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( B->format & CHAMELEON_TILE_FULLRANK ); + CORE_dlag2z( uplo, M, N, A->mat, A->ld, B->mat, B->ld ); +} +#endif + void TCORE_dzasum( cham_store_t storev, cham_uplo_t uplo, @@ -71,6 +83,19 @@ TCORE_zgelqt( int M, return CORE_zgelqt( M, N, IB, A->mat, A->ld, T->mat, T->ld, TAU, WORK ); } +void +TCORE_zgemv( cham_trans_t trans, int M, int N, + CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, + const CHAM_tile_t *x, int incX, + CHAMELEON_Complex64_t beta, CHAM_tile_t *y, int incY ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( x->format & CHAMELEON_TILE_FULLRANK ); + assert( y->format & CHAMELEON_TILE_FULLRANK ); + CORE_zgemv( + trans, M, N, alpha, A->mat, A->ld, x->mat, incX, beta, y->mat, incY ); +} + void TCORE_zgemm( cham_trans_t transA, cham_trans_t transB, diff --git a/coreblas/include/coreblas/coreblas_z.h b/coreblas/include/coreblas/coreblas_z.h index 111e0b544259f045ddd7785379d7f8127ae483f8..9f317d4e589d85a23ef9b24260dbcb8f254e5b8c 100644 --- a/coreblas/include/coreblas/coreblas_z.h +++ b/coreblas/include/coreblas/coreblas_z.h @@ -30,6 +30,9 @@ /** * Declarations of serial kernels - alphabetical order */ +void CORE_dlag2z( cham_uplo_t uplo, int M, int N, + const double *A, int lda, + CHAMELEON_Complex64_t *B, int ldb ); void CORE_dzasum(cham_store_t storev, cham_uplo_t uplo, int M, int N, const CHAMELEON_Complex64_t *A, int lda, double *work); int CORE_zaxpy(int M, CHAMELEON_Complex64_t alpha, @@ -52,6 +55,10 @@ void CORE_zgemm(cham_trans_t transA, cham_trans_t transB, CHAMELEON_Complex64_t alpha, const CHAMELEON_Complex64_t *A, int LDA, const CHAMELEON_Complex64_t *B, int LDB, CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t *C, int LDC); +void CORE_zgemv(cham_trans_t trans, int M, int N, + CHAMELEON_Complex64_t alpha, const CHAMELEON_Complex64_t *A, int LDA, + const CHAMELEON_Complex64_t *x, int incX, + CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t *y, int incY); int CORE_zgeqrt(int M, int N, int IB, CHAMELEON_Complex64_t *A, int LDA, CHAMELEON_Complex64_t *T, int LDT, diff --git a/coreblas/include/coreblas/coreblas_ztile.h b/coreblas/include/coreblas/coreblas_ztile.h index b5de9f7667d56514c583919f94f52e6a2798ff3a..c754393a0b0e3cef7f906c95a8b051b2864fb43d 100644 --- a/coreblas/include/coreblas/coreblas_ztile.h +++ b/coreblas/include/coreblas/coreblas_ztile.h @@ -16,10 +16,14 @@ #ifndef _coreblas_ztile_h_ #define _coreblas_ztile_h_ +#if defined(PRECISION_z) || defined(PRECISION_c) +void TCORE_dlag2z( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *B ); +#endif void TCORE_dzasum( cham_store_t storev, cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, double *work ); int TCORE_zaxpy( int M, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, int incA, CHAM_tile_t *B, int incB ); int TCORE_zgeadd( cham_trans_t trans, int M, int N, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, CHAMELEON_Complex64_t beta, CHAM_tile_t *B ); int TCORE_zgelqt( int M, int N, int IB, CHAM_tile_t *A, CHAM_tile_t *T, CHAMELEON_Complex64_t *TAU, CHAMELEON_Complex64_t *WORK ); +void TCORE_zgemv( cham_trans_t trans, int M, int N, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, const CHAM_tile_t *x, int incx, CHAMELEON_Complex64_t beta, CHAM_tile_t *y, int incy ); void TCORE_zgemm( cham_trans_t transA, cham_trans_t transB, int M, int N, int K, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, const CHAM_tile_t *B, CHAMELEON_Complex64_t beta, CHAM_tile_t *C ); int TCORE_zgeqrt( int M, int N, int IB, CHAM_tile_t *A, CHAM_tile_t *T, CHAMELEON_Complex64_t *TAU, CHAMELEON_Complex64_t *WORK ); int TCORE_zgessm( int M, int N, int K, int IB, const int *IPIV, const CHAM_tile_t *L, CHAM_tile_t *A ); @@ -28,18 +32,18 @@ int TCORE_zgetrf( int M, int N, CHAM_tile_t *A, int *IPIV, int *INFO ); int TCORE_zgetrf_incpiv( int M, int N, int IB, CHAM_tile_t *A, int *IPIV, int *INFO ); int TCORE_zgetrf_nopiv( int M, int N, int IB, CHAM_tile_t *A, int *INFO ); void TCORE_zhe2ge( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *B ); -#if defined(PRECISION_z ) || defined(PRECISION_c) +#if defined(PRECISION_z) || defined(PRECISION_c) void TCORE_zhemm( cham_side_t side, cham_uplo_t uplo, int M, int N, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, const CHAM_tile_t *B, CHAMELEON_Complex64_t beta, CHAM_tile_t *C ); void TCORE_zherk( cham_uplo_t uplo, cham_trans_t trans, int N, int K, double alpha, const CHAM_tile_t *A, double beta, CHAM_tile_t *C ); void TCORE_zher2k( cham_uplo_t uplo, cham_trans_t trans, int N, int K, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, const CHAM_tile_t *B, double beta, CHAM_tile_t *C ); #endif int TCORE_zherfb( cham_uplo_t uplo, int N, int K, int IB, int NB, const CHAM_tile_t *A, const CHAM_tile_t *T, CHAM_tile_t *C, CHAMELEON_Complex64_t *WORK, int ldwork ); -#if defined(PRECISION_z ) || defined(PRECISION_c) +#if defined(PRECISION_z) || defined(PRECISION_c) int TCORE_zhessq( cham_store_t storev, cham_uplo_t uplo, int N, const CHAM_tile_t *A, CHAM_tile_t *sclssq ); #endif void TCORE_zlacpy( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *B ); void TCORE_zlange( cham_normtype_t norm, int M, int N, const CHAM_tile_t *A, double *work, double *normA ); -#if defined(PRECISION_z ) || defined(PRECISION_c) +#if defined(PRECISION_z) || defined(PRECISION_c) void TCORE_zlanhe( cham_normtype_t norm, cham_uplo_t uplo, int N, const CHAM_tile_t *A, double *work, double *normA ); #endif void TCORE_zlansy( cham_normtype_t norm, cham_uplo_t uplo, int N, const CHAM_tile_t *A, double *work, double *normA ); @@ -49,7 +53,7 @@ void TCORE_zlaset( cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha void TCORE_zlaset2( cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, CHAM_tile_t *A ); int TCORE_zlatro( cham_uplo_t uplo, cham_trans_t trans, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *B ); void TCORE_zlauum( cham_uplo_t uplo, int N, CHAM_tile_t *A ); -#if defined(PRECISION_z ) || defined(PRECISION_c) +#if defined(PRECISION_z) || defined(PRECISION_c) void TCORE_zplghe( double bump, int m, int n, CHAM_tile_t *tileA, int bigM, int m0, int n0, unsigned long long int seed ); #endif void TCORE_zplgsy( CHAMELEON_Complex64_t bump, int m, int n, CHAM_tile_t *tileA, int bigM, int m0, int n0, unsigned long long int seed ); diff --git a/include/chameleon.h b/include/chameleon.h index ac4c8a8ca521af9b5d01a9d1a07f61a10a0adf06..76849662c9a572668c84a6fa3d9526e4b348ec1a 100644 --- a/include/chameleon.h +++ b/include/chameleon.h @@ -128,7 +128,8 @@ CHAM_desc_t *CHAMELEON_Desc_SubMatrix( CHAM_desc_t *descA, int i, int j, int m, int CHAMELEON_Desc_Destroy( CHAM_desc_t **desc ); int CHAMELEON_Desc_Acquire( CHAM_desc_t *desc ); int CHAMELEON_Desc_Release( CHAM_desc_t *desc ); -int CHAMELEON_Desc_Flush ( CHAM_desc_t *desc, RUNTIME_sequence_t *sequence ); +int CHAMELEON_Desc_Flush ( const CHAM_desc_t *desc, + const RUNTIME_sequence_t *sequence ); /* Workspaces */ int CHAMELEON_Dealloc_Workspace (CHAM_desc_t **desc); diff --git a/include/chameleon/chameleon_z.h b/include/chameleon/chameleon_z.h index 1b7ca622e2309503e526520b3a9a2efb1ba8b02a..7ebcff28f37b4027a16421fa1302f37b718dc28e 100644 --- a/include/chameleon/chameleon_z.h +++ b/include/chameleon/chameleon_z.h @@ -39,6 +39,7 @@ int CHAMELEON_zgeadd(cham_trans_t trans, int M, int N, CHAMELEON_Complex64_t alp int CHAMELEON_zgelqf(int M, int N, CHAMELEON_Complex64_t *A, int LDA, CHAM_desc_t *descT); int CHAMELEON_zgelqs(int M, int N, int NRHS, CHAMELEON_Complex64_t *A, int LDA, CHAM_desc_t *descT, CHAMELEON_Complex64_t *B, int LDB); int CHAMELEON_zgels(cham_trans_t trans, int M, int N, int NRHS, CHAMELEON_Complex64_t *A, int LDA, CHAM_desc_t *descT, CHAMELEON_Complex64_t *B, int LDB); +double CHAMELEON_zgenm2( double tol, int M, int N, CHAMELEON_Complex64_t *A, int LDA ); int CHAMELEON_zgemm(cham_trans_t transA, cham_trans_t transB, int M, int N, int K, CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t *A, int LDA, CHAMELEON_Complex64_t *B, int LDB, CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t *C, int LDC); int CHAMELEON_zgeqrf(int M, int N, CHAMELEON_Complex64_t *A, int LDA, CHAM_desc_t *descT); int CHAMELEON_zgeqrs(int M, int N, int NRHS, CHAMELEON_Complex64_t *A, int LDA, CHAM_desc_t *descT, CHAMELEON_Complex64_t *B, int LDB); @@ -114,6 +115,7 @@ int CHAMELEON_zgeadd_Tile(cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_ int CHAMELEON_zgelqf_Tile(CHAM_desc_t *A, CHAM_desc_t *T); int CHAMELEON_zgelqs_Tile(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *B); int CHAMELEON_zgels_Tile(cham_trans_t trans, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *B); +double CHAMELEON_zgenm2_Tile( double tol, CHAM_desc_t *A ); int CHAMELEON_zgemm_Tile(cham_trans_t transA, cham_trans_t transB, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C); int CHAMELEON_zgeqrf_Tile(CHAM_desc_t *A, CHAM_desc_t *T); int CHAMELEON_zgeqrs_Tile(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *B); @@ -186,6 +188,7 @@ int CHAMELEON_zgeadd_Tile_Async(cham_trans_t trans, CHAMELEON_Complex64_t alpha, int CHAMELEON_zgelqf_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *T, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgelqs_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgels_Tile_Async(cham_trans_t trans, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +int CHAMELEON_zgenm2_Tile_Async( double tol, CHAM_desc_t *A, double *value, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); int CHAMELEON_zgemm_Tile_Async(cham_trans_t transA, cham_trans_t transB, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgeqrf_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *T, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgeqrs_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); diff --git a/include/chameleon/constants.h b/include/chameleon/constants.h index 962e7270f16e9a512b99167c2dc2416f131d4f7a..fce41282c595025d2cf8648c154422c91679f178 100644 --- a/include/chameleon/constants.h +++ b/include/chameleon/constants.h @@ -105,7 +105,7 @@ typedef enum chameleon_side_e { typedef enum chameleon_normtype_e { ChamOneNorm = 171, /**< One norm: max_j( sum_i( |a_{ij}| ) ) */ /* ChamRealOneNorm = 172, */ - /* ChamTwoNorm = 173, */ + ChamTwoNorm = 173, /**< Two norm: max( sigma_i ) */ ChamFrobeniusNorm = 174, /**< Frobenius norm: sqrt( sum_{i,j} (a_{ij}^2) ) */ ChamInfNorm = 175, /**< Inifinite norm: max_i( sum_j( |a_{ij}| ) ) */ /* ChamRealInfNorm = 176, */ diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index 6df71da4c29ef3c015f39feeace6e403738b064b..a78d30041de15dd80ce8f83d84b78395d007d538 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -30,6 +30,10 @@ /** * Declarations of QUARK wrappers (called by CHAMELEON) - alphabetical order */ +void INSERT_TASK_dlag2z( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_dzasum( const RUNTIME_option_t *options, cham_store_t storev, cham_uplo_t uplo, int M, int N, const CHAM_desc_t *A, int Am, int An, @@ -49,6 +53,11 @@ void INSERT_TASK_zgelqt( const RUNTIME_option_t *options, int m, int n, int ib, int nb, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *T, int Tm, int Tn ); +void INSERT_TASK_zgemv( const RUNTIME_option_t *options, + cham_trans_t trans, int m, int n, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *X, int Xm, int Xn, int incX, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *Y, int Ym, int Yn, int incY ); void INSERT_TASK_zgemm( const RUNTIME_option_t *options, cham_trans_t transA, cham_trans_t transB, int m, int n, int k, int nb, diff --git a/include/chameleon/types.h b/include/chameleon/types.h index 1cbedc7bae8fde7f8a0cf2d5bf989df47e6a558a..6f0acad809320d25d44e910caf3cdfc568a941ad 100644 --- a/include/chameleon/types.h +++ b/include/chameleon/types.h @@ -22,6 +22,7 @@ #define _chameleon_types_h_ #include "chameleon/config.h" +#include <math.h> /** * System requirements diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index 71724813e89887a857b5d54aa5f8c82146097361..1caa32fef3ec97692aa023fc59547cc44a6f2f61 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -28,12 +28,17 @@ # List of codelets required by all runtimes # ----------------------------------------- set(CODELETS_ZSRC + codelets/codelet_dlag2z.c codelets/codelet_dzasum.c ################## # BLAS 1 ################## codelets/codelet_zaxpy.c ################## + # BLAS 2 + ################## + codelets/codelet_zgemv.c + ################## # BLAS 3 ################## codelets/codelet_zgemm.c diff --git a/runtime/openmp/codelets/codelet_dlag2z.c b/runtime/openmp/codelets/codelet_dlag2z.c new file mode 100644 index 0000000000000000000000000000000000000000..077f541fef83a22f7e4d1b163002475df254db3a --- /dev/null +++ b/runtime/openmp/codelets/codelet_dlag2z.c @@ -0,0 +1,34 @@ +/** + * + * @file openmp/codelet_dlag2z.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon dlag2z OpenMP codelet + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2020-10-05 + * @precisions normal z -> c + * + */ +#include "chameleon_openmp.h" +#include "chameleon/tasks_z.h" +#include "coreblas/coreblas_ztile.h" + +void INSERT_TASK_dlag2z( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + +#pragma omp task firstprivate( uplo, m, n, tileA, tileB ) depend( in:tileA[0] ) depend( out:tileB[0] ) + TCORE_dlag2z( uplo, m, n, tileA, tileB ); +} diff --git a/runtime/openmp/codelets/codelet_zgemv.c b/runtime/openmp/codelets/codelet_zgemv.c new file mode 100644 index 0000000000000000000000000000000000000000..c3d70b0b4ab523994c7c946d791fb45efba115d3 --- /dev/null +++ b/runtime/openmp/codelets/codelet_zgemv.c @@ -0,0 +1,37 @@ +/** + * + * @file openmp/codelet_zgemv.c + * + * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgemv OpenMP codelet + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2020-10-06 + * @precisions normal z -> c d s + * + */ +#include "chameleon_openmp.h" +#include "chameleon/tasks_z.h" +#include "coreblas/coreblas_ztile.h" + +void +INSERT_TASK_zgemv( const RUNTIME_option_t *options, + cham_trans_t trans, int m, int n, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *X, int Xm, int Xn, int incX, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *Y, int Ym, int Yn, int incY ) +{ + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileX = X->get_blktile( X, Xm, Xn ); + CHAM_tile_t *tileY = Y->get_blktile( Y, Ym, Yn ); + +#pragma omp task firstprivate( trans, m, n, alpha, tileA, tileX, incX, beta, tileY, incY ) depend( in:tileA[0], tileX[0] ) depend( inout:tileY[0] ) + TCORE_zgemv( trans, m, n, + alpha, tileA, tileX, incX, + beta, tileY, incY ); +} diff --git a/runtime/parsec/codelets/codelet_dlag2z.c b/runtime/parsec/codelets/codelet_dlag2z.c new file mode 100644 index 0000000000000000000000000000000000000000..60868494412940b0d2397daa7c544756c941e4bc --- /dev/null +++ b/runtime/parsec/codelets/codelet_dlag2z.c @@ -0,0 +1,62 @@ +/** + * + * @file parsec/codelet_dlag2z.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon dlag2z PaRSEC codelet + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2020-10-05 + * @precisions normal z -> c + * + */ +#include "chameleon_parsec.h" +#include "chameleon/tasks_z.h" +#include "coreblas/coreblas_z.h" + +static inline int +CORE_dlag2z_parsec( parsec_execution_stream_t *context, + parsec_task_t *this_task ) +{ + cham_uplo_t uplo; + int m; + int n; + double *A; + int lda; + CHAMELEON_Complex64_t *B; + int ldb; + + parsec_dtd_unpack_args( this_task, &uplo, &m, &n, &A, &lda, &B, &ldb ); + CORE_dlag2z( uplo, m, n, A, lda, B, ldb ); + + (void)context; + return PARSEC_HOOK_RETURN_DONE; +} + +void INSERT_TASK_dlag2z( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + + parsec_dtd_taskpool_insert_task( + PARSEC_dtd_taskpool, CORE_dlag2z_parsec, options->priority, "dlag2z", + sizeof(cham_uplo_t), &uplo, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + PASSED_BY_REF, RTBLKADDR( A, double, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, + sizeof(int), &(tileA->ld), VALUE, + PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | OUTPUT, + sizeof(int), &(tileB->ld), VALUE, + PARSEC_DTD_ARG_END ); +} diff --git a/runtime/parsec/codelets/codelet_zgemv.c b/runtime/parsec/codelets/codelet_zgemv.c new file mode 100644 index 0000000000000000000000000000000000000000..0579743d54cdce1cf66ca6c8bbcc40e6fcca9ccc --- /dev/null +++ b/runtime/parsec/codelets/codelet_zgemv.c @@ -0,0 +1,76 @@ +/** + * + * @file parsec/codelet_zgemv.c + * + * @copyright 2009-2015 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgemv PaRSEC codelet + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2020-10-06 + * @precisions normal z -> c d s + * + */ +#include "chameleon_parsec.h" +#include "chameleon/tasks_z.h" +#include "coreblas/coreblas_z.h" + +static inline int +CORE_zgemv_parsec( parsec_execution_stream_t *context, + parsec_task_t *this_task ) +{ + cham_trans_t trans; + int m; + int n; + CHAMELEON_Complex64_t alpha; + CHAMELEON_Complex64_t *A; + int lda; + CHAMELEON_Complex64_t *X; + int incX; + CHAMELEON_Complex64_t beta; + CHAMELEON_Complex64_t *Y; + int incY; + + parsec_dtd_unpack_args( + this_task, &trans, &m, &n, &alpha, &A, &lda, &X, &incX, &beta, &Y, &incY ); + + CORE_zgemv( trans, m, n, + alpha, A, lda, + X, incX, + beta, Y, incY ); + + (void)context; + return PARSEC_HOOK_RETURN_DONE; +} + +void +INSERT_TASK_zgemv( const RUNTIME_option_t *options, + cham_trans_t trans, int m, int n, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *X, int Xm, int Xn, int incX, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *Y, int Ym, int Yn, int incY ) +{ + parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + + parsec_dtd_taskpool_insert_task( + PARSEC_dtd_taskpool, CORE_zgemv_parsec, options->priority, "zgemv", + sizeof(cham_trans_t), &trans, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, + PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, + sizeof(int), &(tileA->ld), VALUE, + PASSED_BY_REF, RTBLKADDR( X, CHAMELEON_Complex64_t, Xm, Xn ), chameleon_parsec_get_arena_index( X ) | INPUT, + sizeof(int), &incX, VALUE, + sizeof(CHAMELEON_Complex64_t), &beta, VALUE, + PASSED_BY_REF, RTBLKADDR( Y, CHAMELEON_Complex64_t, Ym, Yn ), chameleon_parsec_get_arena_index( Y ) | INOUT | AFFINITY, + sizeof(int), &incY, VALUE, + PARSEC_DTD_ARG_END ); +} diff --git a/runtime/quark/codelets/codelet_dlag2z.c b/runtime/quark/codelets/codelet_dlag2z.c new file mode 100644 index 0000000000000000000000000000000000000000..86e214347b028782f7aee4cda9dc426a78fe1b3c --- /dev/null +++ b/runtime/quark/codelets/codelet_dlag2z.c @@ -0,0 +1,53 @@ +/** + * + * @file quark/codelet_dlag2z.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon dlag2z Quark codelet + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2020-10-05 + * @precisions normal z -> c + * + */ +#include "chameleon_quark.h" +#include "chameleon/tasks_z.h" +#include "coreblas/coreblas_ztile.h" + +static inline void +CORE_dlag2z_quark( Quark *quark ) +{ + cham_uplo_t uplo; + int m; + int n; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + + quark_unpack_args_5( quark, uplo, m, n, tileA, tileB ); + TCORE_dlag2z( uplo, m, n, tileA, tileB ); +} + +void INSERT_TASK_dlag2z( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + quark_option_t *opt = (quark_option_t*)(options->schedopt); + + //DAG_CORE_DLAG2Z; + QUARK_Insert_Task( + opt->quark, CORE_dlag2z_quark, (Quark_Task_Flags*)opt, + sizeof(cham_uplo_t), &uplo, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + sizeof(void*), RTBLKADDR(A, double, Am, An), INPUT, + sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), OUTPUT, + 0); +} diff --git a/runtime/quark/codelets/codelet_zgemv.c b/runtime/quark/codelets/codelet_zgemv.c new file mode 100644 index 0000000000000000000000000000000000000000..4dcde9bf5a5bdb1be9da1592e80ac576b528ca2a --- /dev/null +++ b/runtime/quark/codelets/codelet_zgemv.c @@ -0,0 +1,66 @@ +/** + * + * @file quark/codelet_zgemv.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgemv Quark codelet + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2020-10-06 + * @precisions normal z -> c d s + * + */ +#include "chameleon_quark.h" +#include "chameleon/tasks_z.h" +#include "coreblas/coreblas_ztile.h" + +void CORE_zgemv_quark(Quark *quark) +{ + cham_trans_t trans; + int m; + int n; + CHAMELEON_Complex64_t alpha; + CHAM_tile_t *tileA; + CHAM_tile_t *tileX; + int incX; + CHAMELEON_Complex64_t beta; + CHAM_tile_t *tileY; + int incY; + + quark_unpack_args_10( quark, trans, m, n, alpha, tileA, tileX, incX, beta, tileY, incY ); + TCORE_zgemv( trans, m, n, + alpha, tileA, tileX, incX, + beta, tileY, incY ); +} + +void INSERT_TASK_zgemv( const RUNTIME_option_t *options, + cham_trans_t trans, int m, int n, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *X, int Xm, int Xn, int incX, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *Y, int Ym, int Yn, int incY ) +{ + quark_option_t *opt = (quark_option_t*)(options->schedopt); + int accessY = ( beta == 0. ) ? OUTPUT : INOUT; + + /* DAG_CORE_GEMV; */ + QUARK_Insert_Task( + opt->quark, CORE_zgemv_quark, (Quark_Task_Flags*)opt, + sizeof(cham_trans_t), &trans, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(void*), RTBLKADDR(X, CHAMELEON_Complex64_t, Xm, Xn), INPUT, + sizeof(int), &incX, VALUE, + sizeof(CHAMELEON_Complex64_t), &beta, VALUE, + sizeof(void*), RTBLKADDR(Y, CHAMELEON_Complex64_t, Ym, Yn), accessY, + sizeof(int), &incY, VALUE, + 0); +} diff --git a/runtime/starpu/codelets/codelet_dlag2z.c b/runtime/starpu/codelets/codelet_dlag2z.c new file mode 100644 index 0000000000000000000000000000000000000000..b7953cd977431f2b0599d5470ce4fbd6d7834ccb --- /dev/null +++ b/runtime/starpu/codelets/codelet_dlag2z.c @@ -0,0 +1,79 @@ +/** + * + * @file starpu/codelet_dlag2z.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon dlag2z StarPU codelet + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2020-10-05 + * @precisions normal z -> c + * + */ +#include "chameleon_starpu.h" +#include "runtime_codelet_z.h" + +#if !defined(CHAMELEON_SIMULATION) +static void cl_dlag2z_cpu_func(void *descr[], void *cl_arg) +{ + cham_uplo_t uplo; + int m; + int n; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + + starpu_codelet_unpack_args(cl_arg, &uplo, &m, &n); + TCORE_dlag2z( uplo, m, n, tileA, tileB ); +} +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +CODELETS_CPU(dlag2z, cl_dlag2z_cpu_func) + +/** + * + * @ingroup INSERT_TASK_Complex64_t + * + */ +void INSERT_TASK_dlag2z( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + struct starpu_codelet *codelet = &cl_dlag2z; + void (*callback)(void*) = options->profiling ? cl_dlag2z_callback : NULL; + starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); + int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_R(A, Am, An); + CHAMELEON_ACCESS_W(B, Bm, Bn); + CHAMELEON_END_ACCESS_DECLARATION; + + starpu_insert_task( + starpu_mpi_codelet(codelet), + STARPU_VALUE, &uplo, sizeof(uplo), + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_R, RTBLKADDR(A, double, Am, An), + STARPU_W, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "dlag2z", +#endif + 0); +} diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c index f810ab0f07d92c320459072022496d68906f1c30..cb381cfc153f21234fbc7b3fbea21f5663669627 100644 --- a/runtime/starpu/codelets/codelet_zcallback.c +++ b/runtime/starpu/codelets/codelet_zcallback.c @@ -22,11 +22,15 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" +#if defined(PRECISION_z) || defined(PRECISION_c) +CHAMELEON_CL_CB(dlag2z, cti_handle_get_m(task->handles[1]), cti_handle_get_n(task->handles[1]), 0, M*N) +#endif CHAMELEON_CL_CB(dzasum, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) CHAMELEON_CL_CB(zaxpy, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[1]), 0, M) CHAMELEON_CL_CB(zgeadd, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) CHAMELEON_CL_CB(zlascal, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) CHAMELEON_CL_CB(zgelqt, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, (4./3.)*M*N*K) +CHAMELEON_CL_CB(zgemv, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, 2. *M*N ) CHAMELEON_CL_CB(zgemm, cti_handle_get_m(task->handles[2]), cti_handle_get_n(task->handles[2]), cti_handle_get_n(task->handles[0]), 2. *M*N*K) /* If A^t, computation is wrong */ CHAMELEON_CL_CB(zgeqrt, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, (4./3.)*M*M*N) CHAMELEON_CL_CB(zgessm, cti_handle_get_m(task->handles[2]), cti_handle_get_m(task->handles[2]), cti_handle_get_m(task->handles[2]), 2. *M*N*K) diff --git a/runtime/starpu/codelets/codelet_zgemv.c b/runtime/starpu/codelets/codelet_zgemv.c new file mode 100644 index 0000000000000000000000000000000000000000..62b887247e58661abda06fcd9dc63238fe1e4c9d --- /dev/null +++ b/runtime/starpu/codelets/codelet_zgemv.c @@ -0,0 +1,133 @@ +/** + * + * @file starpu/codelet_zgemv.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgemv StarPU codelet + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2020-10-06 + * @precisions normal z -> c d s + * + */ +#include "chameleon_starpu.h" +#include "runtime_codelet_z.h" + +#if !defined(CHAMELEON_SIMULATION) +static void cl_zgemv_cpu_func(void *descr[], void *cl_arg) +{ + cham_trans_t trans; + int m; + int n; + CHAMELEON_Complex64_t alpha; + CHAM_tile_t *tileA; + CHAM_tile_t *tileX; + int incX; + CHAMELEON_Complex64_t beta; + CHAM_tile_t *tileY; + int incY; + + tileA = cti_interface_get(descr[0]); + tileX = cti_interface_get(descr[1]); + tileY = cti_interface_get(descr[2]); + + starpu_codelet_unpack_args(cl_arg, &trans, &m, &n, &alpha, &incX, &beta, &incY ); + TCORE_zgemv( trans, m, n, + alpha, tileA, tileX, incX, + beta, tileY, incY ); +} + +#if defined(CHAMELEON_USE_CUDA) & 0 +static void cl_zgemv_cuda_func(void *descr[], void *cl_arg) +{ + cham_trans_t transA; + cham_trans_t transB; + int m; + int n; + int k; + cuDoubleComplex alpha; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + cuDoubleComplex beta; + CHAM_tile_t *tileC; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); + + starpu_codelet_unpack_args(cl_arg, &transA, &transB, &m, &n, &k, &alpha, &beta); + + RUNTIME_getStream( stream ); + + CUDA_zgemv( + transA, transB, + m, n, k, + &alpha, tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + &beta, tileC->mat, tileC->ld, + stream); + +#ifndef STARPU_CUDA_ASYNC + cudaStreamSynchronize( stream ); +#endif + + return; +} +#endif /* defined(CHAMELEON_USE_CUDA) */ +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +CODELETS_CPU(zgemv, cl_zgemv_cpu_func) + +/** + * + * @ingroup INSERT_TASK_Complex64_t + * + */ +void INSERT_TASK_zgemv( const RUNTIME_option_t *options, + cham_trans_t trans, int m, int n, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *X, int Xm, int Xn, int incX, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *Y, int Ym, int Yn, int incY ) +{ + struct starpu_codelet *codelet = &cl_zgemv; + void (*callback)(void*) = options->profiling ? cl_zgemv_callback : NULL; + starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); + int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + int accessY = ( beta == 0. ) ? STARPU_W : STARPU_RW; + + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_R(A, Am, An); + CHAMELEON_ACCESS_R(X, Xm, Xn); + CHAMELEON_ACCESS_RW(Y, Ym, Yn); + CHAMELEON_END_ACCESS_DECLARATION; + + starpu_insert_task( + starpu_mpi_codelet(codelet), + STARPU_VALUE, &trans, sizeof(cham_trans_t), + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_VALUE, &alpha, sizeof(CHAMELEON_Complex64_t), + STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_R, RTBLKADDR(X, CHAMELEON_Complex64_t, Xm, Xn), + STARPU_VALUE, &incX, sizeof(int), + STARPU_VALUE, &beta, sizeof(CHAMELEON_Complex64_t), + accessY, RTBLKADDR(Y, CHAMELEON_Complex64_t, Ym, Yn), + STARPU_VALUE, &incY, sizeof(int), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "zgemv", +#endif + 0); +} diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h index 25a14085fb7579fd40f8bb1800b3029d98cf548d..e5ed6704c090d8efc3409e57c2f29ebbb8ac8455 100644 --- a/runtime/starpu/include/runtime_codelet_z.h +++ b/runtime/starpu/include/runtime_codelet_z.h @@ -39,6 +39,11 @@ */ CODELETS_HEADER(zaxpy); +/* + * BLAS 2 functions + */ +CODELETS_HEADER(zgemv); + /* * BLAS 3 functions */ @@ -106,6 +111,9 @@ CODELETS_HEADER(zlag2c); /* * DZ functions */ +#if defined(PRECISION_z) || defined(PRECISION_c) +CODELETS_HEADER(dlag2z); +#endif CODELETS_HEADER(dzasum); /* diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt index f6591433f46729ac378b23e979cb3d9ba2feb8c6..a233e6d909f773a50ce159e6990ed8b0516fb69c 100644 --- a/testing/CMakeLists.txt +++ b/testing/CMakeLists.txt @@ -74,6 +74,7 @@ set(ZSRC testing_zsytrf.c testing_zsytrs.c testing_zsysv.c + testing_zgenm2.c testing_zgetrf.c testing_zgetrs.c testing_zgesv.c diff --git a/testing/CTestLists.cmake b/testing/CTestLists.cmake index 986fd4faf5974bd2be4e62e8c80f632465f266af..82bef26df95796b6a2c9aa2708126666adf0d362 100644 --- a/testing/CTestLists.cmake +++ b/testing/CTestLists.cmake @@ -64,6 +64,8 @@ if (NOT CHAMELEON_SIMULATION) #geqrs_hqr gelqs_hqr gels gels_hqr ) + set( TESTS ${TESTS} + genm2 ) foreach(cat ${TEST_CATEGORIES}) foreach(gpus ${N_GPUS}) diff --git a/testing/chameleon_ztesting.c b/testing/chameleon_ztesting.c index 0d6d6e5d93d4ff1c1251f1994ad07ffcb04e28d3..82e3969202896be9c23196fc722c83c6ea182c24 100644 --- a/testing/chameleon_ztesting.c +++ b/testing/chameleon_ztesting.c @@ -69,6 +69,11 @@ static parameter_t parameters[] = { { "seedB", "Seed for the matrix B random generation", 'Y', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 11, TestValInt, {0}, NULL, pread_int, sprint_int }, { "seedC", "Seed for the matrix C random generation", 'Z', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 11, TestValInt, {0}, NULL, pread_int, sprint_int }, + { NULL, "Matrix generation numerical parameters", 0, PARAM_OPTION, 0, 0, 0, {0}, NULL, NULL, NULL }, + { "bump", "Bump value to make a matrix diagonal dominant", 'z', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 13, TestValComplex64, {0}, NULL, pread_complex64, sprint_complex64 }, + { "mode", "Mode that specifies the eigen/singular values in xlatms", -30, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 4, TestValInt, {0}, NULL, pread_int, sprint_int }, + { "cond", "Conditional number of the matrix used by xlatms", -31, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 13, TestValDouble, {0}, NULL, pread_double, sprint_double }, + { NULL, "Operation specific parameters", 0, PARAM_OPTION, 0, 0, 0, {0}, NULL, NULL, NULL }, { "trans", "Value of the trans parameter", -11, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 9, TestTrans, {0}, NULL, pread_trans, sprint_trans }, { "transA", "Value of the transA parameter", -12, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 9, TestTrans, {0}, NULL, pread_trans, sprint_trans }, @@ -79,9 +84,8 @@ static parameter_t parameters[] = { { "norm", "Value of the norm parameter", -17, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 4, TestNormtype, {0}, NULL, pread_norm, sprint_norm }, { NULL, "Operation specific scalar", 0, PARAM_OPTION, 0, 0, 0, {0}, NULL, NULL, NULL }, - { "alpha", "Value of the scalar alpha", 'x', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 13, TestValComplex64, {0}, NULL, pread_complex64, sprint_complex64 }, - { "beta", "Value of the scalar beta", 'y', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 13, TestValComplex64, {0}, NULL, pread_complex64, sprint_complex64 }, - { "bump", "Bump value to make a matrix diagonal dominant", 'z', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 13, TestValComplex64, {0}, NULL, pread_complex64, sprint_complex64 }, + { "alpha", "Value of the scalar alpha", 'x', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 13, TestValComplex64, {0}, NULL, pread_complex64, sprint_complex64 }, + { "beta", "Value of the scalar beta", 'y', PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 13, TestValComplex64, {0}, NULL, pread_complex64, sprint_complex64 }, { NULL, "QR/LQ parameters", 0, PARAM_OPTION, 0, 0, 0, {0}, NULL, NULL, NULL }, { "qra", "Size of TS domain (=RH for householder trees)", -20, PARAM_OPTION | PARAM_INPUT | PARAM_OUTPUT, 2, 3, TestValInt, {0}, NULL, pread_int, sprint_int }, diff --git a/testing/input/genm2.in b/testing/input/genm2.in new file mode 100644 index 0000000000000000000000000000000000000000..dc29aa220c8af8dc0bb4a4e0b8e31c05d298f395 --- /dev/null +++ b/testing/input/genm2.in @@ -0,0 +1,20 @@ +# You can enumerate each parameter's values as an explicit list separated by commas or by a range start:end[:step] +# Not given parameters will receive default values + +# GENM2 +# mtxfmt +# nb: Tile size +# M: Number of rows of matrix A +# N: Number of columns of matrix A +# LDA: Leading dimension of matrix A +# cond: The condition number +# mode: the mode values for latms + +op = genm2 +mtxfmt = 0, 1 +nb = 16, 17 +m = 15, 25, 37 +n = 13, 23, 35 +lda = 41 +cond = 1., 1.e13 +mode = 1:6 diff --git a/testing/testing_zcheck.c b/testing/testing_zcheck.c index 65b0ec3dddb7631085af86949e48840b237135a2..9263fe639f1139acbe945d6433846d276556ca4a 100644 --- a/testing/testing_zcheck.c +++ b/testing/testing_zcheck.c @@ -214,9 +214,6 @@ int check_znorm( run_arg_list_t *args, cham_mtxtype_t matrix_type, cham_normtype result = fabs( norm_cham - norm_lapack ) / ( norm_lapack * eps ); switch(norm_type) { - case ChamMaxNorm: - /* result should be perfectly equal */ - break; case ChamInfNorm: /* Sum order on the line can differ */ result = result / (double)N; @@ -229,6 +226,10 @@ int check_znorm( run_arg_list_t *args, cham_mtxtype_t matrix_type, cham_normtype /* Sum order on every element can differ */ result = result / ((double)M * (double)N); break; + case ChamMaxNorm: + default: + /* result should be perfectly equal */ + ; } info_solution = ( result < 1 ) ? 0 : 1; diff --git a/testing/testing_zgenm2.c b/testing/testing_zgenm2.c new file mode 100644 index 0000000000000000000000000000000000000000..b396d9fee6ee2a4c51542b3bc2a80102b9d72710 --- /dev/null +++ b/testing/testing_zgenm2.c @@ -0,0 +1,140 @@ +/** + * + * @file testing_zgenm2.c + * + * @copyright 2019-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgenm2 testing + * + * @version 1.0.0 + * @author Lucas Barros de Assis + * @author Mathieu Faverge + * @date 2020-10-05 + * @precisions normal z -> c d s + * + */ +#include <chameleon.h> +#if !defined(CHAMELEON_SIMULATION) +#include <coreblas/lapacke.h> +#include <coreblas/cblas.h> +#include <coreblas.h> +#endif +#include "testings.h" +#include "testing_zcheck.h" +#include "flops.h" + +static cham_fixdbl_t +flops_zgenm2( int M, int N ) +{ + double coefabs = 1.; +#if defined( PRECISION_z ) || defined( PRECISION_c ) + coefabs = 3.; +#endif + + return coefabs * M * N; +} + +int +testing_zgenm2( run_arg_list_t *args, int check ) +{ + int hres = 0; + double norm; + CHAM_desc_t *descA; + double *D, dmax = 1.; + + /* Reads arguments */ + intptr_t mtxfmt = parameters_getvalue_int( "mtxfmt" ); + int nb = run_arg_get_int( args, "nb", 320 ); + int P = parameters_getvalue_int( "P" ); + int N = run_arg_get_int( args, "N", 1000 ); + int M = run_arg_get_int( args, "M", N ); + int LDA = run_arg_get_int( args, "LDA", M ); + int seedA = run_arg_get_int( args, "seedA", random() ); + int Q = parameters_compute_q( P ); + int minMN = chameleon_min( M, N ); + double cond = run_arg_get_double( args, "cond", 1.e16 ); + int mode = run_arg_get_int( args, "mode", 4 ); + double tol = 1.e-1; + cham_fixdbl_t t, gflops; + cham_fixdbl_t flops = flops_zgenm2( M, N ); + + CHAMELEON_Set( CHAMELEON_TILE_SIZE, nb ); + + /* Generate the diagonal of eigen/singular values */ + D = malloc( minMN * sizeof(double) ); +#if !defined(CHAMELEON_SIMULATION) + hres = CORE_dlatm1( mode, cond, 0, ChamDistUniform, seedA, D, minMN ); + if ( hres != 0 ) { + free( D ); + return hres; + } + + /* Save the largest absolute value */ + hres = cblas_idamax( minMN, D, 1 ); + dmax = fabs( D[hres] ); +#endif + + /* Creates the matrix */ + CHAMELEON_Desc_Create( + &descA, (void*)(-mtxfmt), ChamComplexDouble, nb, nb, nb * nb, LDA, N, 0, 0, M, N, P, Q ); + + /* Fills the matrix with random values */ + hres = CHAMELEON_zlatms_Tile( + ChamDistUniform, seedA, ChamNonsymPosv, D, 0, cond, 0., descA ); + free( D ); + if ( hres != 0 ) { + return hres; + } + + /* Calculates the norm */ + START_TIMING( t ); + norm = CHAMELEON_zgenm2_Tile( tol, descA ); + STOP_TIMING( t ); + gflops = flops * 1.e-9 / t; + run_arg_add_fixdbl( args, "time", t ); + run_arg_add_fixdbl( args, "gflops", ( norm >= 0. ) ? gflops : -1. ); + + /* Checks the solution */ + hres = 0; + if ( check ) { + double res = fabs(dmax - norm) / (dmax * tol); + + run_arg_add_double( args, "||A||", dmax ); + run_arg_add_double( args, "||B||", norm ); + run_arg_add_double( args, "||R||", res ); + + if ( isnan(res) || isinf(res) || (res > 10.0) ) { + hres = 1; + } + } + + CHAMELEON_Desc_Destroy( &descA ); + + return hres; +} + +testing_t test_zgenm2; +const char *zgenm2_params[] = { "mtxfmt", "nb", "m", "n", "lda", "seedA", "cond", "mode", NULL }; +const char *zgenm2_output[] = { NULL }; +const char *zgenm2_outchk[] = { "||A||", "||B||", "||R||", "RETURN", NULL }; + +/** + * @brief Testing registration function + */ +void testing_zgenm2_init( void ) __attribute__( ( constructor ) ); +void +testing_zgenm2_init( void ) +{ + test_zgenm2.name = "zgenm2"; + test_zgenm2.helper = "General matrix two-norm estimator"; + test_zgenm2.params = zgenm2_params; + test_zgenm2.output = zgenm2_output; + test_zgenm2.outchk = zgenm2_outchk; + test_zgenm2.fptr = testing_zgenm2; + test_zgenm2.next = NULL; + + testing_register( &test_zgenm2 ); +}