diff --git a/CMakeLists.txt b/CMakeLists.txt index 3f897fcdd6718df2df7fb1caef63200e90c20644..fd7753a578be36e5de3e01ee56c77e9759605d27 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -196,6 +196,7 @@ set(SOURCES src/z_spm_dof_extend.c src/z_spm_norm.c src/z_spm_scal.c + src/z_spm_gather_rhs.c src/z_spm_reduce_rhs.c src/z_spm_convert_to_csc.c src/z_spm_convert_to_csr.c diff --git a/include/spm.h b/include/spm.h index 0f46e18fe5ad5b21c348a7b901521eacc578eb53..ed42fa9f644dbc1beb2a09a63c03262e7d7790c9 100644 --- a/include/spm.h +++ b/include/spm.h @@ -207,7 +207,7 @@ int spmParseLaplacianInfo( const char * filename, */ void * spm2Dense( const spmatrix_t *spm ); void spmPrint( const spmatrix_t *spm, FILE *f ); -void spmPrintRHS( const spmatrix_t *spm, int n, const void *x, spm_int_t ldx, FILE *stream ); +void spmPrintRHS( const spmatrix_t *spm, int nrhs, const void *x, spm_int_t ldx, FILE *stream ); void spmPrintInfo( const spmatrix_t *spm, FILE *f ); void spmExpand( const spmatrix_t *spm_in, spmatrix_t *spm_out ); spmatrix_t *spmDofExtend( const spmatrix_t *spm, const int type, const int dof ); diff --git a/include/spm/const.h b/include/spm/const.h index 01f9de534e2cc089ad918008cc6f264b52115e4b..16815f39fef98ac83f366cd84d19d300b6b69897 100644 --- a/include/spm/const.h +++ b/include/spm/const.h @@ -32,6 +32,12 @@ BEGIN_C_DECLS #define CBLAS_SADDR( a_ ) (&(a_)) #endif +/** + * @brief Distribution of the matrix storage + */ +#define SpmDistByColumn (0x1 << 0) /**< Storage in column distributed */ +#define SpmDistByRow (0x1 << 1) /**< Storage in row distributed */ + /** * @brief Verbose modes */ diff --git a/src/common.h b/src/common.h index 4abadf79404831a5b937fe2bdc12b88c20345196..bce115b64f362c01230a373e47fc5caab30c30f3 100644 --- a/src/common.h +++ b/src/common.h @@ -54,6 +54,7 @@ spm_get_datatype( const spmatrix_t *spm ) #endif spm_int_t *spm_get_glob2loc( spmatrix_t *spm, spm_int_t baseval ); +int spm_get_distribution( const spmatrix_t *spm ); /******************************************************************** * Conjuguate/Id functions diff --git a/src/spm.c b/src/spm.c index b492e714e77512bd4855c37cc02ef549288872b8..b42538caa3fe60be2fe64bfe19f4876f18cada57 100644 --- a/src/spm.c +++ b/src/spm.c @@ -848,6 +848,10 @@ spmCopy( const spmatrix_t *spm ) newspm->loc2glob = (spm_int_t*)malloc( spm->n * sizeof(spm_int_t) ); memcpy( newspm->loc2glob, spm->loc2glob, spm->n * sizeof(spm_int_t) ); } + if(spm->glob2loc != NULL) { + newspm->glob2loc = (spm_int_t*)malloc( spm->gN * sizeof(spm_int_t) ); + memcpy( newspm->glob2loc, spm->glob2loc, spm->gN * sizeof(spm_int_t) ); + } if(spm->dofs != NULL) { newspm->dofs = (spm_int_t*)malloc( dofsize * sizeof(spm_int_t) ); memcpy( newspm->dofs, spm->dofs, dofsize * sizeof(spm_int_t) ); @@ -1018,7 +1022,7 @@ spmPrint( const spmatrix_t *spm, *******************************************************************************/ void spmPrintRHS( const spmatrix_t *spm, - int n, + int nrhs, const void *x, spm_int_t ldx, FILE *stream ) @@ -1033,16 +1037,16 @@ spmPrintRHS( const spmatrix_t *spm, /* Not handled for now */ break; case SpmFloat: - s_spmPrintRHS( stream, spm, n, x, ldx ); + s_spmPrintRHS( stream, spm, nrhs, x, ldx ); break; case SpmComplex32: - c_spmPrintRHS( stream, spm, n, x, ldx ); + c_spmPrintRHS( stream, spm, nrhs, x, ldx ); break; case SpmComplex64: - z_spmPrintRHS( stream, spm, n, x, ldx ); + z_spmPrintRHS( stream, spm, nrhs, x, ldx ); break; case SpmDouble: - d_spmPrintRHS( stream, spm, n, x, ldx ); + d_spmPrintRHS( stream, spm, nrhs, x, ldx ); } return; @@ -1240,10 +1244,6 @@ spmMatMat( spm_trans_t trans, spmatrix_t *espm = (spmatrix_t*)A; int rc = SPM_SUCCESS; - if ( A->dof != 1 ) { - espm = malloc( sizeof(spmatrix_t) ); - spmExpand( A, espm ); - } switch (A->flttype) { case SpmFloat: rc = spm_sspmm( SpmLeft, trans, SpmNoTrans, n, alpha, espm, B, ldb, beta, C, ldc ); @@ -1614,3 +1614,76 @@ spm_get_glob2loc( spmatrix_t *spm, (void) baseval; return spm->glob2loc; } + +/** + ******************************************************************************* + * + * @ingroup spm_mpi_dev + * + * @brief Search the distribution pattern used in the spm structure. + * + ******************************************************************************* + * + * @param[in] spm + * The sparse matrix structure. + * + ******************************************************************************** + * + * @return SpmDistByColumn if the distribution is column based. + * SpmDistByRow if the distribution is row based. + * (SpmDistByColumn|SpmDistByRow) if the matrix is not distributed. + * + *******************************************************************************/ +int +spm_get_distribution( const spmatrix_t *spm ) +{ + int distribution = 0; + + if( (spm->loc2glob == NULL) || (spm->n == spm->gN) ) { + distribution = ( SpmDistByColumn | SpmDistByRow ); + } + else { + if( spm->fmttype == SpmCSC ){ + distribution = SpmDistByColumn; + } + else if ( spm->fmttype == SpmCSR ) { + distribution = SpmDistByRow; + } + else { + spm_int_t i, baseval; + spm_int_t *colptr = spm->colptr; + spm_int_t *glob2loc = spm->glob2loc; + + baseval = spmFindBase( spm ); + distribution = 1; + assert( glob2loc != NULL ); + for ( i = 0; i < spm->nnz; i++, colptr++ ) + { + /* + * If the global index is not in the local colptr + * -> row distribution + */ + if( glob2loc[ *colptr - baseval ] < 0 ) { + distribution = SpmDistByRow; + break; + } + } + + #if defined(SPM_WITH_MPI) + { + int check = 0; + MPI_Allreduce( &distribution, &check, 1, MPI_INT, + MPI_BOR, spm->comm ); + /* + * If a matrix is distributed + * it cannot be distributed by row AND column + */ + assert( check != ( SpmDistByColumn | SpmDistByRow ) ); + assert( distribution == check ); + } + #endif + } + } + assert(distribution > 0); + return distribution; +} diff --git a/src/z_spm.h b/src/z_spm.h index 4fbe1b06122721419c0bff646b3df1ccce6f075b..cb2089c27e16b63d466b673de7d26f851aa0f166 100644 --- a/src/z_spm.h +++ b/src/z_spm.h @@ -75,17 +75,28 @@ spm_int_t z_spmSymmetrize( spmatrix_t *spm ); int z_spmGenRHS(spm_rhstype_t type, int nrhs, const spmatrix_t *spm, void *x, int ldx, void *b, int ldb ); int z_spmCheckAxb( spm_fixdbl_t eps, int nrhs, const spmatrix_t *spm, void *x0, int ldx0, void *b, int ldb, const void *x, int ldx ); -spm_complex64_t *z_spmReduceRHS( const spmatrix_t *spm, int nrhs, const spm_complex64_t *x, spm_int_t ldx, int root ); +spm_complex64_t *z_spmGatherRHS( const spmatrix_t *spm, int nrhs, const spm_complex64_t *x, spm_int_t ldx, int root ); +void z_spmReduceRHS( const spmatrix_t *spm, int nrhs, spm_complex64_t *bglob, spm_int_t ldbglob, spm_complex64_t *b, spm_int_t ldb ); /** * Output routines */ void z_spmDensePrint( FILE *f, spm_int_t m, spm_int_t n, const spm_complex64_t *A, spm_int_t lda ); void z_spmPrint( FILE *f, const spmatrix_t *spm ); -void z_spmPrintRHS( FILE *f, const spmatrix_t *spm, int n, const void *x, spm_int_t ldx ); +void z_spmPrintRHS( FILE *f, const spmatrix_t *spm, int nrhs, const void *x, spm_int_t ldx ); void z_spmExpand( const spmatrix_t *spm_in, spmatrix_t *spm_out ); void z_spmDofExtend( spmatrix_t *spm ); void z_spmScal( const double alpha, spmatrix_t *spm ); +/** + * Subroutines for random vector generation to be used in testings + */ +void z_spmRhsGenRndShm( const spmatrix_t *spm, spm_int_t baseval, spm_complex64_t scale, + spm_int_t n, spm_complex64_t *A, spm_int_t lda, + int shift, unsigned long long int seed ); +void z_spmRhsGenRndDist( const spmatrix_t *spm, spm_int_t baseval, spm_complex64_t scale, + spm_int_t n, spm_complex64_t *A, spm_int_t lda, + int shift, unsigned long long int seed ); + #endif /* _z_spm_h_ */ diff --git a/src/z_spm_convert_to_csc.c b/src/z_spm_convert_to_csc.c index 9ea0d28eeb401afd4798df3d5af822c289949daf..d4efbc3f1cd0a16be77e683010234cf5201a03c4 100644 --- a/src/z_spm_convert_to_csc.c +++ b/src/z_spm_convert_to_csc.c @@ -65,30 +65,13 @@ z_spmConvertIJV2CSC( spmatrix_t *spm ) #if defined(SPM_WITH_MPI) if ( spm->loc2glob != NULL ) { - /* - * Check if the distribution is by column or row by exploiting the fact - * that the array is sorted. - * This is not completely safe, but that avoids going through the full - * matrix. - */ const spm_int_t *glob2loc; - spm_int_t m = spm->rowptr[spm->nnz-1] - spm->rowptr[0] + 1; /* This may be not correct */ - spm_int_t n = spm->colptr[spm->nnz-1] - spm->colptr[0] + 1; spm_int_t jg; - int distribution = 0; + int distribution = spm_get_distribution( spm ); - if ( m <= spm->n ) { /* By row */ - distribution |= 1; - } - if ( n <= spm->n ) { /* By column */ - distribution |= 2; - } - MPI_Allreduce( MPI_IN_PLACE, &distribution, 1, MPI_INT, - MPI_BAND, spm->comm ); - - if ( !(distribution & 2) ) { - //fprintf( stderr, "spmConvert: Conversion of column distributed matrices to CSC is not yet implemented\n"); - return SPM_ERR_NOTIMPLEMENTED; + if ( !(distribution & SpmDistByColumn) ) { + fprintf( stderr, "spmConvert: Conversion of non column distributed matrices to CSC is not yet implemented\n"); + return SPM_ERR_BADPARAMETER; } /* Allocate and compute the glob2loc array */ diff --git a/src/z_spm_convert_to_csr.c b/src/z_spm_convert_to_csr.c index 1f5f390c5d109bd6ce91217778589d6e89da8b14..3d7a247f104ccf4db72a769cc135c7cfdd663db0 100644 --- a/src/z_spm_convert_to_csr.c +++ b/src/z_spm_convert_to_csr.c @@ -70,29 +70,12 @@ z_spmConvertIJV2CSR( spmatrix_t *spm ) #if defined(SPM_WITH_MPI) if ( spm->loc2glob != NULL ) { - /* - * Check if the distribution is by column or row by exploiting the fact - * that the array is sorted. - * This is not completely safe, but that avoids going through the full - * matrix. - */ const spm_int_t *glob2loc; - spm_int_t m = spm->rowptr[spm->nnz-1] - spm->rowptr[0] + 1; /* This may be not correct */ - spm_int_t n = spm->colptr[spm->nnz-1] - spm->colptr[0] + 1; spm_int_t ig; - int distribution = 0; + int distribution = spm_get_distribution( spm ); - if ( m <= spm->n ) { /* By row */ - distribution |= 1; - } - if ( n <= spm->n ) { /* By column */ - distribution |= 2; - } - MPI_Allreduce( MPI_IN_PLACE, &distribution, 1, MPI_INT, - MPI_BAND, spm->comm ); - - if ( !(distribution & 1) ) { - //fprintf( stderr, "spmConvert: Conversion of column distributed matrices to CSC is not yet implemented\n"); + if ( !(distribution & SpmDistByRow) ) { + fprintf( stderr, "spmConvert: Conversion of non row distributed matrices to CSR is not yet implemented\n"); return SPM_ERR_NOTIMPLEMENTED; } diff --git a/src/z_spm_gather_rhs.c b/src/z_spm_gather_rhs.c new file mode 100644 index 0000000000000000000000000000000000000000..ca4b44b1c12116e6d6645a2abf45fc75106a494e --- /dev/null +++ b/src/z_spm_gather_rhs.c @@ -0,0 +1,166 @@ +/** + * + * @file z_spm_gather_rhs.c + * + * SParse Matrix package right hand side gather routine. + * + * @copyright 2020-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + * @version 1.0.0 + * @author Delarue Tony + * @author Faverge Mathieu + * @date 2020-06-11 + * + * @precisions normal z -> c d s + * + **/ +#include "common.h" +#include "z_spm.h" + +/** + ******************************************************************************* + * + * @brief Gather all the global C coefficients and store the good ones in local + * + ******************************************************************************* + * + * @param[in] spm + * The sparse matrix spm + * + * @param[in] nrhs + * Number of rhs vectors. + * + * @param[in] x + * Local vector + * + * @param[in] ldx + * Leading dimension of this vector + * + * @param[in] root + * Clustnum whre the complete vector will be gathered. + * -1 if you want to gather the data on all nodes. + * + ******************************************************************************** + * + * @return The gathered right hand side vector. + * + *******************************************************************************/ +spm_complex64_t * +z_spmGatherRHS( const spmatrix_t *spm, + int nrhs, + const spm_complex64_t *x, + spm_int_t ldx, + int root ) +{ + int local = ( ( root == -1 ) || (root == spm->clustnum) ); + spm_complex64_t *out = NULL; + + /* We do not handle cases where ldx is different from spm->n */ + assert( ldx == spm->nexp ); + + if ( spm->loc2glob == NULL ) { + if ( local ) { + out = malloc( spm->gNexp * nrhs * sizeof( spm_complex64_t ) ); + memcpy( out, x, spm->gNexp * nrhs * sizeof( spm_complex64_t ) ); + } + (void)ldx; + return out; + } + +#if defined(SPM_WITH_MPI) + int c; + int n, nmax = 0; + int nexp, nexpmax = 0; + spm_complex64_t *tmp_out, *current_out, *outptr; + spm_int_t *tmp_l2g, *current_l2g; + spm_int_t current_n, current_nexp; + spm_int_t i, j, k, ig, dofi, row; + spm_int_t baseval = spmFindBase( spm ); + + n = spm->n; + nexp = spm->nexp; + if ( root == -1 ) { + MPI_Allreduce( &n, &nmax, 1, MPI_INT, MPI_MAX, spm->comm ); + MPI_Allreduce( &nexp, &nexpmax, 1, MPI_INT, MPI_MAX, spm->comm ); + } + else { + MPI_Reduce( &n, &nmax, 1, MPI_INT, MPI_MAX, root, spm->comm ); + MPI_Reduce( &nexp, &nexpmax, 1, MPI_INT, MPI_MAX, root, spm->comm ); + } + + if ( local ) { + out = malloc( spm->gNexp * nrhs * sizeof( spm_complex64_t ) ); + tmp_out = malloc( nexpmax * nrhs * sizeof( spm_complex64_t ) ); + tmp_l2g = malloc( nmax * sizeof( spm_int_t ) ); + } + + for( c=0; c<spm->clustnbr; c++ ) { + MPI_Status status; + + if ( c == spm->clustnum ) { + current_n = spm->n; + current_nexp = spm->nexp; + current_l2g = spm->loc2glob; + current_out = (spm_complex64_t*)x; + } + else { + current_n = 0; + current_nexp = 0; + current_l2g = tmp_l2g; + current_out = tmp_out; + } + + if ( root == -1 ) { + MPI_Bcast( ¤t_n, 1, SPM_MPI_INT, c, spm->comm ); + MPI_Bcast( ¤t_nexp, 1, SPM_MPI_INT, c, spm->comm ); + if ( current_n > 0 ) { + MPI_Bcast( current_l2g, current_n, SPM_MPI_INT, c, spm->comm ); + MPI_Bcast( current_out, current_nexp * nrhs, SPM_MPI_COMPLEX64, c, spm->comm ); + } + } + else if ( root != c ) { /* No communication if c == root */ + if ( c == spm->clustnum ) { + MPI_Send( ¤t_n, 1, SPM_MPI_INT, root, 0, spm->comm ); + MPI_Send( ¤t_nexp, 1, SPM_MPI_INT, root, 1, spm->comm ); + if ( current_n > 0 ) { + MPI_Send( current_l2g, current_n, SPM_MPI_INT, root, 2, spm->comm ); + MPI_Send( current_out, current_nexp * nrhs, SPM_MPI_COMPLEX64, root, 3, spm->comm ); + } + } + if ( root == spm->clustnum ) { + MPI_Recv( ¤t_n, 1, SPM_MPI_INT, c, 0, spm->comm, &status ); + MPI_Recv( ¤t_nexp, 1, SPM_MPI_INT, c, 1, spm->comm, &status ); + if ( current_n > 0 ) { + MPI_Recv( current_l2g, current_n, SPM_MPI_INT, c, 2, spm->comm, &status ); + MPI_Recv( current_out, current_nexp * nrhs, SPM_MPI_COMPLEX64, c, 3, spm->comm, &status ); + } + } + } + + if ( !local ) { + continue; + } + + outptr = out; + for( i=0; i<current_n; i++, current_l2g++ ) { + ig = *current_l2g - baseval; + dofi = ( spm->dof > 0 ) ? spm->dof : spm->dofs[ig+1] - spm->dofs[ig]; + row = ( spm->dof > 0 ) ? spm->dof * ig : spm->dofs[ig] - baseval; + for( j=0; j<nrhs; j++ ) { + for( k=0; k<dofi; k++ ) { + outptr[ row + j * spm->gNexp + k ] = current_out[ j * current_nexp + k ]; + } + } + current_out += dofi; + } + } + + if ( local ) { + free( tmp_out ); + free( tmp_l2g ); + } + +#endif + return out; +} diff --git a/src/z_spm_genrhs.c b/src/z_spm_genrhs.c index f466b8bef72ff76945146d70a3d89cb370954f68..54f213cb71f700a12da9b7ba9d40347d71d01c36 100644 --- a/src/z_spm_genrhs.c +++ b/src/z_spm_genrhs.c @@ -94,7 +94,7 @@ Rnd64_jump(unsigned long long int n, unsigned long long int seed ) { * ******************************************************************************/ static inline void -z_updateRndVal( double scale, +z_updateRndVal( spm_complex64_t scale, spm_complex64_t *val, unsigned long long int *ran ) { @@ -265,7 +265,7 @@ z_spmRhsGenI( const spmatrix_t *spm, spm_int_t baseval, * all tiles initialized with this routine. * ******************************************************************************/ -static inline void +void z_spmRhsGenRndShm( const spmatrix_t *spm, spm_int_t baseval, spm_complex64_t scale, spm_int_t n, spm_complex64_t *A, spm_int_t lda, @@ -329,7 +329,7 @@ z_spmRhsGenRndShm( const spmatrix_t *spm, spm_int_t baseval, ******************************************************************************/ void z_spmRhsGenRndDist( const spmatrix_t *spm, spm_int_t baseval, - double scale, + spm_complex64_t scale, spm_int_t n, spm_complex64_t *A, spm_int_t lda, int shift, unsigned long long int seed ) { @@ -368,6 +368,7 @@ z_spmRhsGenRndDist( const spmatrix_t *spm, spm_int_t baseval, } } } + (void)lda; } /** diff --git a/src/z_spm_matrixvector.c b/src/z_spm_matrixvector.c index cf6078b08780acbb26adaf8c770896aa8664fee2..32c766ea94f6014582099a72f8772aeabe5b7333 100644 --- a/src/z_spm_matrixvector.c +++ b/src/z_spm_matrixvector.c @@ -37,15 +37,23 @@ __fct_conj( spm_complex64_t val ) { } #endif +/** + * @brief Store all the data necessary to do a matrix-matrix product + * for all cases. + */ struct __spm_zmatvec_s { int follow_x; - spm_int_t baseval, n, nnz; + spm_int_t baseval, n, nnz, gN; spm_complex64_t alpha; const spm_int_t *rowptr; const spm_int_t *colptr; const spm_complex64_t *values; + const spm_int_t *loc2glob; + + spm_int_t dof; + const spm_int_t *dofs; const spm_complex64_t *x; spm_int_t incx; @@ -58,43 +66,77 @@ struct __spm_zmatvec_s { __loop_fct_t loop_fct; }; -static inline int -__spm_zmatvec_sy_csr( const __spm_zmatvec_t *args ) +/** + * @brief Compute the dof loop for a general block + */ +static inline void +__spm_zmatvec_dof_loop( spm_int_t row, spm_int_t dofi, + spm_int_t col, spm_int_t dofj, + spm_complex64_t *y, spm_int_t incy, + const spm_complex64_t *x, spm_int_t incx, + const spm_complex64_t *values, + const __conj_fct_t conjA_fct, + spm_complex64_t alpha ) { - spm_int_t baseval = args->baseval; - spm_int_t n = args->n; - spm_complex64_t alpha = args->alpha; - const spm_int_t *rowptr = args->rowptr; - const spm_int_t *colptr = args->colptr; - const spm_complex64_t *values = args->values; - const spm_complex64_t *x = args->x; - spm_int_t incx = args->incx; - spm_complex64_t *y = args->y; - spm_int_t incy = args->incy; - const __conj_fct_t conjA_fct = args->conjA_fct; - const __conj_fct_t conjAt_fct = args->conjAt_fct; - spm_int_t col, row, i; + spm_int_t ii, jj; - for( col=0; col<n; col++, colptr++ ) + for(jj=0; jj<dofj; jj++) { - for( i=colptr[0]; i<colptr[1]; i++, rowptr++, values++ ) + for(ii=0; ii<dofi; ii++, values++) { - row = *rowptr - baseval; + y[ row + (ii * incy) ] += alpha * conjA_fct( *values ) * x[ col +(jj * incx) ]; + } + } +} - if ( row != col ) { - y[ row * incy ] += alpha * conjAt_fct( *values ) * x[ col * incx ]; - y[ col * incy ] += alpha * conjA_fct( *values ) * x[ row * incx ]; - } - else { - y[ col * incy ] += alpha * conjA_fct( *values ) * x[ row * incx ]; - } +/** + * @brief Compute the dof loop for a symmetric off diagonal block + */ +static inline void +__spm_zmatvec_dof_loop_sy( spm_int_t row, spm_int_t dofi, + spm_int_t col, spm_int_t dofj, + spm_complex64_t *y, spm_int_t incy, + const spm_complex64_t *x, spm_int_t incx, + const spm_complex64_t *values, + const __conj_fct_t conjA_fct, + const __conj_fct_t conjAt_fct, + spm_complex64_t alpha ) +{ + spm_int_t ii, jj; + + for(jj=0; jj<dofj; jj++) + { + for(ii=0; ii<dofi; ii++, values++) + { + y[ row + (ii * incy) ] += alpha * conjA_fct( *values ) * x[ col +(jj * incx) ]; + y[ col + (jj * incy) ] += alpha * conjAt_fct( *values ) * x[ row +(ii * incx) ]; } } - return SPM_SUCCESS; } +/** + * @brief Compute the dof loop for a symmetric CSR matrix + * Allow code factorization. + */ +static inline void +__spm_zmatvec_dof_loop_sy_csr( spm_int_t row, spm_int_t dofi, + spm_int_t col, spm_int_t dofj, + spm_complex64_t *y, spm_int_t incy, + const spm_complex64_t *x, spm_int_t incx, + const spm_complex64_t *values, + const __conj_fct_t conjA_fct, + const __conj_fct_t conjAt_fct, + spm_complex64_t alpha ) +{ + return __spm_zmatvec_dof_loop_sy( row, dofi, col, dofj, y, incy, x, incx, values, conjAt_fct, conjA_fct, alpha ); +} + +/** + * @brief Compute A*x[i:, j] = y[i:, j] + * for a CSX symmetric matrix + */ static inline int -__spm_zmatvec_sy_csc( const __spm_zmatvec_t *args ) +__spm_zmatvec_sy_csx( const __spm_zmatvec_t *args ) { spm_int_t baseval = args->baseval; spm_int_t n = args->n; @@ -102,34 +144,53 @@ __spm_zmatvec_sy_csc( const __spm_zmatvec_t *args ) const spm_int_t *rowptr = args->rowptr; const spm_int_t *colptr = args->colptr; const spm_complex64_t *values = args->values; + const spm_int_t *loc2glob = args->loc2glob; + const spm_int_t *dofs = args->dofs; + spm_int_t dof = args->dof; const spm_complex64_t *x = args->x; spm_int_t incx = args->incx; spm_complex64_t *y = args->y; spm_int_t incy = args->incy; const __conj_fct_t conjA_fct = args->conjA_fct; const __conj_fct_t conjAt_fct = args->conjAt_fct; - spm_int_t col, row, i; + spm_int_t row, col, dofj, dofi; + spm_int_t i, ig, j, jg; - for( col=0; col<n; col++, colptr++ ) + /* If(args->follow_x) -> CSR. We need to change exchange the conj functions in the symmetric dof loop */ + void (*dof_loop_sy)( spm_int_t, spm_int_t, spm_int_t, spm_int_t, + spm_complex64_t *, spm_int_t, + const spm_complex64_t *, spm_int_t, const spm_complex64_t *, + const __conj_fct_t, const __conj_fct_t, spm_complex64_t ) + = ( args->follow_x ) ? __spm_zmatvec_dof_loop_sy_csr : __spm_zmatvec_dof_loop_sy; + + for( j=0; j<n; j++, colptr++ ) { - for( i=colptr[0]; i<colptr[1]; i++, rowptr++, values++ ) + jg = (loc2glob == NULL) ? j : loc2glob[j] - baseval ; + dofj = ( dof > 0 ) ? dof : dofs[jg+1] - dofs[jg]; + col = ( dof > 0 ) ? dof * jg : dofs[jg] - baseval; + for( i=colptr[0]; i<colptr[1]; i++, rowptr++ ) { - row = *rowptr - baseval; - + ig = *rowptr - baseval; + dofi = ( dof > 0 ) ? dof : dofs[ig+1] - dofs[ig]; + row = ( dof > 0 ) ? dof * ig : dofs[ig] - baseval; if ( row != col ) { - y[ row * incy ] += alpha * conjA_fct( *values ) * x[ col * incx ]; - y[ col * incy ] += alpha * conjAt_fct( *values ) * x[ row * incx ]; + dof_loop_sy( row, dofi, col, dofj, y, incy, x, incx, values, conjA_fct, conjAt_fct, alpha ); } else { - y[ col * incy ] += alpha * conjA_fct( *values ) * x[ row * incx ]; + __spm_zmatvec_dof_loop( col, dofj, row, dofi, y, incy, x, incx, values, conjA_fct, alpha ); } + values += dofi*dofj; } } return SPM_SUCCESS; } +/** + * @brief Compute A*x[i:, j] = y[i:, j] + * for a CSC/CSR general matrix + */ static inline int -__spm_zmatvec_ge_csc( const __spm_zmatvec_t *args ) +__spm_zmatvec_ge_csx( const __spm_zmatvec_t *args ) { spm_int_t baseval = args->baseval; spm_int_t n = args->n; @@ -137,36 +198,56 @@ __spm_zmatvec_ge_csc( const __spm_zmatvec_t *args ) const spm_int_t *rowptr = args->rowptr; const spm_int_t *colptr = args->colptr; const spm_complex64_t *values = args->values; + const spm_int_t *loc2glob = args->loc2glob; + const spm_int_t *dofs = args->dofs; + spm_int_t dof = args->dof; const spm_complex64_t *x = args->x; spm_int_t incx = args->incx; spm_complex64_t *y = args->y; spm_int_t incy = args->incy; const __conj_fct_t conjA_fct = args->conjA_fct; - spm_int_t col, row, i; + spm_int_t row, dofj, dofi; + spm_int_t i, ig, j, jg; if ( args->follow_x ) { - for( col=0; col<n; col++, colptr++, x+=incx ) + for( j = 0; j < n; j++, colptr++ ) { - for( i=colptr[0]; i<colptr[1]; i++, rowptr++, values++ ) + jg = (loc2glob == NULL) ? j : loc2glob[j] - baseval; + dofj = ( dof > 0 ) ? dof : dofs[jg+1] - dofs[jg]; + for( i=colptr[0]; i<colptr[1]; i++, rowptr++ ) { - row = *rowptr - baseval; - y[ row * incy ] += alpha * conjA_fct( *values ) * (*x); + ig = *rowptr - baseval; + dofi = ( dof > 0 ) ? dof : dofs[ig+1] - dofs[ig]; + row = ( dof > 0 ) ? dof * ig : dofs[ig] - baseval; + __spm_zmatvec_dof_loop( row, dofi, 0, dofj, y, incy, x, 1, values, conjA_fct, alpha ); + values += dofi * dofj; } + x += dofj * incx; } } else { - for( col=0; col<n; col++, colptr++, y+=incy ) + for( j=0; j<n; j++, colptr++ ) { - for( i=colptr[0]; i<colptr[1]; i++, rowptr++, values++ ) + jg = (loc2glob == NULL) ? j : loc2glob[j] - baseval; + dofj = ( dof > 0 ) ? dof : dofs[jg+1] - dofs[jg]; + for( i=colptr[0]; i<colptr[1]; i++, rowptr++ ) { - row = *rowptr - baseval; - *y += alpha * conjA_fct( *values ) * x[ row * incx ]; + ig = *rowptr - baseval; + dofi = ( dof > 0 ) ? dof : dofs[ig+1] - dofs[ig]; + row = ( dof > 0 ) ? dof * ig : dofs[ig] - baseval; + __spm_zmatvec_dof_loop( 0, dofj, row, dofi, y, 1, x, incx, values, conjA_fct, alpha ); + values += dofi * dofj; } + y += dofj * incy; } } return SPM_SUCCESS; } +/** + * @brief Compute A*x[i:, j] = y[i:, j] + * for a IJV symmetric matrix + */ static inline int __spm_zmatvec_sy_ijv( const __spm_zmatvec_t *args ) { @@ -176,30 +257,68 @@ __spm_zmatvec_sy_ijv( const __spm_zmatvec_t *args ) const spm_int_t *rowptr = args->rowptr; const spm_int_t *colptr = args->colptr; const spm_complex64_t *values = args->values; + const spm_int_t *dofs = args->dofs; + spm_int_t dof = args->dof; const spm_complex64_t *x = args->x; spm_int_t incx = args->incx; spm_complex64_t *y = args->y; spm_int_t incy = args->incy; const __conj_fct_t conjA_fct = args->conjA_fct; const __conj_fct_t conjAt_fct = args->conjAt_fct; - spm_int_t col, row, i; + spm_int_t row, col, dofj, dofi; + spm_int_t i, ig, jg; - for( i=0; i<nnz; i++, colptr++, rowptr++, values++ ) + for( i=0; i<nnz; i++, colptr++, rowptr++ ) { - row = *rowptr - baseval; - col = *colptr - baseval; + ig = *rowptr - baseval; + jg = *colptr - baseval; + + dofj = ( dof > 0 ) ? dof : dofs[jg+1] - dofs[jg]; + dofi = ( dof > 0 ) ? dof : dofs[ig+1] - dofs[ig]; + + row = ( dof > 0 ) ? dof * ig : dofs[ig] - baseval; + col = ( dof > 0 ) ? dof * jg : dofs[jg] - baseval; if ( row != col ) { - y[ row * incy ] += alpha * conjA_fct( *values ) * x[ col * incx ]; - y[ col * incy ] += alpha * conjAt_fct( *values ) * x[ row * incx ]; + __spm_zmatvec_dof_loop_sy( row, dofi, col, dofj, y, incy, x, incx, values, conjA_fct, conjAt_fct, alpha ); } else { - y[ row * incy ] += alpha * conjA_fct( *values ) * x[ col * incx ]; + __spm_zmatvec_dof_loop( row, dofi, col, dofj, y, incy, x, incx, values, conjA_fct, alpha ); } + values += dofi*dofj; } return SPM_SUCCESS; } +/** + * @brief Build a local dofs array which corresponds + * to the local numerotation of a global index for + * variadic dofs. + */ +static inline spm_int_t * +__spm_zmatvec_dofs_local( const spm_int_t *dofs, + const spm_int_t *glob2loc, + spm_int_t gN) +{ + spm_int_t i, acc = 0; + spm_int_t *result, *resptr; + + result = calloc( gN , sizeof(spm_int_t) ); + resptr = result; + for ( i = 0; i < gN; i++, glob2loc++, resptr++ ) + { + if( *glob2loc >= 0 ) { + *resptr = acc; + acc += dofs[i+1] - dofs[i]; + } + } + return result; +} + +/** + * @brief Compute A*x[i:, j] = y[i:, j] + * for a IJV general matrix + */ static inline int __spm_zmatvec_ge_ijv( const __spm_zmatvec_t *args ) { @@ -209,20 +328,68 @@ __spm_zmatvec_ge_ijv( const __spm_zmatvec_t *args ) const spm_int_t *rowptr = args->rowptr; const spm_int_t *colptr = args->colptr; const spm_complex64_t *values = args->values; + const spm_int_t *glob2loc = args->loc2glob; + const spm_int_t *dofs = args->dofs; + spm_int_t dof = args->dof; const spm_complex64_t *x = args->x; spm_int_t incx = args->incx; spm_complex64_t *y = args->y; spm_int_t incy = args->incy; const __conj_fct_t conjA_fct = args->conjA_fct; - spm_int_t col, row, i; + spm_int_t row, col, dofj, dofi; + spm_int_t i, ig, jg; - for( i=0; i<nnz; i++, colptr++, rowptr++, values++ ) - { - row = *rowptr - baseval; - col = *colptr - baseval; + spm_int_t *dof_local = NULL; - y[ row * incy ] += alpha * conjA_fct( *values ) * x[ col * incx ]; + if( (dofs != NULL) && (glob2loc != NULL) ) { + dof_local = __spm_zmatvec_dofs_local( dofs, glob2loc, args->gN ); } + + if( args->follow_x ) { + for( i=0; i<nnz; i++, colptr++, rowptr++ ) + { + ig = *rowptr - baseval; + jg = *colptr - baseval; + + dofj = ( dof > 0 ) ? dof : dofs[jg+1] - dofs[jg]; + dofi = ( dof > 0 ) ? dof : dofs[ig+1] - dofs[ig]; + + row = ( dof > 0 ) ? dof * ig : dofs[ig] - baseval; + if (glob2loc == NULL) { + col = ( dof > 0 ) ? dof * jg : dofs[jg] - baseval; + } + else { + col = ( dof > 0 ) ? dof * glob2loc[jg] : dof_local[jg]; + } + __spm_zmatvec_dof_loop( row, dofi, col, dofj, y, incy, x, incx, values, conjA_fct, alpha ); + values += dofi*dofj; + } + } + else { + for( i=0; i<nnz; i++, colptr++, rowptr++ ) + { + ig = *rowptr - baseval; + jg = *colptr - baseval; + + dofj = ( dof > 0 ) ? dof : dofs[jg+1] - dofs[jg]; + dofi = ( dof > 0 ) ? dof : dofs[ig+1] - dofs[ig]; + + col = ( dof > 0 ) ? dof * jg : dofs[jg] - baseval; + if ( glob2loc == NULL ) { + row = ( dof > 0 ) ? dof * ig : dofs[ig] - baseval; + } + else { + row = ( dof > 0 ) ? dof * glob2loc[ig] : dof_local[ig]; + } + __spm_zmatvec_dof_loop( row, dofi, col, dofj, y, incy, x, incx, values, conjA_fct, alpha ); + values += dofi*dofj; + } + } + + if(dof_local != NULL) { + free(dof_local); + } + return SPM_SUCCESS; } @@ -276,10 +443,14 @@ __spm_zmatvec_args_init( __spm_zmatvec_t *args, args->baseval = spmFindBase( A ); args->n = A->n; args->nnz = A->nnz; + args->gN = A->gN; args->alpha = alpha; args->rowptr = A->rowptr; args->colptr = A->colptr; args->values = A->values; + args->loc2glob = A->loc2glob; + args->dof = A->dof; + args->dofs = A->dofs; args->x = B; args->incx = incx; args->y = C; @@ -312,31 +483,39 @@ __spm_zmatvec_args_init( __spm_zmatvec_t *args, case SpmCSC: { /* Switch pointers and side to get the correct behaviour */ - if ( ((side == SpmLeft) && (transA == SpmNoTrans)) || - ((side == SpmRight) && (transA != SpmNoTrans)) ) - { - args->follow_x = 1; - } - else { - args->follow_x = 0; + if( A->mtxtype == SpmGeneral ) { + if ( ((side == SpmLeft) && (transA == SpmNoTrans)) || + ((side == SpmRight) && (transA != SpmNoTrans)) ) + { + args->follow_x = 1; + } + else { + args->follow_x = 0; + } } - args->loop_fct = (A->mtxtype == SpmGeneral) ? __spm_zmatvec_ge_csc : __spm_zmatvec_sy_csc; + args->loop_fct = (A->mtxtype == SpmGeneral) ? __spm_zmatvec_ge_csx : __spm_zmatvec_sy_csx; } break; case SpmCSR: { /* Switch pointers and side to get the correct behaviour */ - if ( ((side == SpmLeft) && (transA != SpmNoTrans)) || - ((side == SpmRight) && (transA == SpmNoTrans)) ) - { - args->follow_x = 1; + if( A->mtxtype == SpmGeneral ) { + if ( ((side == SpmLeft) && (transA != SpmNoTrans)) || + ((side == SpmRight) && (transA == SpmNoTrans)) ) + { + args->follow_x = 1; + } + else { + args->follow_x = 0; + } } else { - args->follow_x = 0; + args->follow_x = 1; } + args->colptr = A->rowptr; args->rowptr = A->colptr; - args->loop_fct = (A->mtxtype == SpmGeneral) ? __spm_zmatvec_ge_csc : __spm_zmatvec_sy_csr; + args->loop_fct = (A->mtxtype == SpmGeneral) ? __spm_zmatvec_ge_csx : __spm_zmatvec_sy_csx; } break; case SpmIJV: @@ -349,7 +528,11 @@ __spm_zmatvec_args_init( __spm_zmatvec_t *args, args->conjAt_fct = tmp_fct; args->colptr = A->rowptr; args->rowptr = A->colptr; + args->follow_x = 0; + } else { + args->follow_x = 1; } + args->loc2glob = A->glob2loc; args->loop_fct = (A->mtxtype == SpmGeneral) ? __spm_zmatvec_ge_ijv : __spm_zmatvec_sy_ijv; } break; @@ -360,6 +543,105 @@ __spm_zmatvec_args_init( __spm_zmatvec_t *args, return SPM_SUCCESS; } +/** + ******************************************************************************* + * + * @ingroup spm_dev_matvec + * + * @brief Build a global C RHS, set to 0 for remote datas. + * + ******************************************************************************* + * + * @param[in] spm + * The pointer to the sparse matrix structure. + * + * @param[in] Cloc + * The local C vector. + * + * @param[inout] ldc + * The leading dimension of the local C vector. + * Will be updated to corresponds to the global one. + * + * @param[in] nrhs + * The number of RHS. + * + ******************************************************************************* + * + * @return A global C vector which stores local datas and set remote datas to 0. + * + *******************************************************************************/ +static inline spm_complex64_t * +z_spmm_build_Ctmp( const spmatrix_t *spm, + const spm_complex64_t *Cloc, + spm_int_t *ldc, + int nrhs ) +{ + spm_complex64_t *Ctmp; + spm_complex64_t *Cptr = (spm_complex64_t *)Cloc; + spm_int_t i, j, idx; + spm_int_t ig, dof, baseval, *loc2glob; + + Ctmp = calloc(spm->gNexp * nrhs, sizeof(spm_complex64_t)); + *ldc = spm->gNexp; + + baseval = spmFindBase(spm); + for ( j = 0; j < nrhs; j++ ) + { + loc2glob = spm->loc2glob; + for ( i = 0; i < spm->n; i++, loc2glob++ ) + { + ig = *loc2glob - baseval; + dof = (spm->dof > 0) ? spm->dof : spm->dofs[ig+1] - spm->dofs[ig]; + idx = (spm->dof > 0) ? spm->dof * ig : spm->dofs[ig] - baseval; + memcpy( (Ctmp + j * spm->gNexp + idx), + Cptr, + dof * sizeof(spm_complex64_t) ); + Cptr += dof; + } + } + return Ctmp; +} + +/** + ******************************************************************************* + * + * @ingroup spm_dev_matvec + * + * @brief Build a global B vector by gathering datas from all nodes. + * + ******************************************************************************* + * + * @param[in] spm + * The pointer to the sparse matrix structure. + * + * @param[in] Bloc + * The local B vector. + * + * @param[inout] ldb + * The leading dimension of the local B vector. + * Will be updated to corresponds to the global one. + * + * @param[in] nrhs + * The number of RHS. + * + ******************************************************************************* + * + * @return The gathered Btmp vector. + * + *******************************************************************************/ +static inline spm_complex64_t * +z_spmm_build_Btmp( const spmatrix_t *spm, + const spm_complex64_t *Bloc, + spm_int_t *ldb, + int nrhs ) +{ + spm_complex64_t *Btmp; + + Btmp = z_spmGatherRHS( spm, nrhs, Bloc, *ldb, -1 ); + *ldb = spm->gNexp; + return Btmp; +} + /** ******************************************************************************* * @@ -457,8 +739,10 @@ spm_zspmm( spm_side_t side, spm_int_t ldc ) { int rc = SPM_SUCCESS; + int distribution; spm_int_t M, N, ldx, ldy, r; __spm_zmatvec_t args; + spm_complex64_t *Ctmp, *Btmp; if ( transB != SpmNoTrans ) { fprintf(stderr, "transB != SpmNoTrans not supported yet in spmv computations\n"); @@ -467,18 +751,18 @@ spm_zspmm( spm_side_t side, } if ( side == SpmLeft ) { - M = A->n; + M = A->nexp; N = K; - ldx = ldb; - ldy = ldc; + ldx = ldb; + ldy = ldc; } else { M = K; - N = A->n; + N = A->nexp; - ldx = 1; - ldy = 1; + ldx = 1; + ldy = 1; } if ( beta == 0. ) { @@ -492,15 +776,45 @@ spm_zspmm( spm_side_t side, return SPM_SUCCESS; } + Btmp = (spm_complex64_t*)B; + Ctmp = C; + distribution = spm_get_distribution(A); + if ( distribution != ( SpmDistByColumn | SpmDistByRow ) ) { + + if ( A->mtxtype != SpmGeneral ) { + Btmp = z_spmm_build_Btmp( A, B, &ldb, N ); + Ctmp = z_spmm_build_Ctmp( A, C, &ldc, N ); + } + else { + if( ( (transA != SpmNoTrans) && (distribution == 1) ) || + ( (transA == SpmNoTrans) && (distribution == 2) ) ) { + Btmp = z_spmm_build_Btmp( A, B, &ldb, N ); + } + if( ( (transA == SpmNoTrans) && (distribution == 1) ) || + ( (transA != SpmNoTrans) && (distribution == 2) ) ) { + Ctmp = z_spmm_build_Ctmp( A, C, &ldc, N ); + } + } + } + __spm_zmatvec_args_init( &args, side, transA, - alpha, A, B, ldb, C, ldc ); + alpha, A, Btmp, ldb, Ctmp, ldc ); for( r=0; (r < N) && (rc == SPM_SUCCESS); r++ ) { - args.x = B + r * ldx; - args.y = C + r * ldy; + args.x = Btmp + r * ldx; + args.y = Ctmp + r * ldy; rc = args.loop_fct( &args ); } + if ( Ctmp != C ) { + z_spmReduceRHS( A, N, Ctmp, A->gNexp, C, A->nexp ); + free( Ctmp ); + } + + if ( Btmp != B ) { + free( Btmp ); + } + return rc; } @@ -558,23 +872,54 @@ spm_zspmv( spm_trans_t trans, spm_int_t incy ) { int rc = SPM_SUCCESS; + int distribution; __spm_zmatvec_t args; + spm_complex64_t *ytmp, *xtmp; if ( beta == 0. ) { - memset( y, 0, A->n * sizeof(spm_complex64_t) ); + memset( y, 0, A->nexp * sizeof(spm_complex64_t) ); } else { - cblas_zscal( A->n, CBLAS_SADDR(beta), y, incy ); + cblas_zscal( A->nexp, CBLAS_SADDR(beta), y, incy ); } if ( alpha == 0. ) { return SPM_SUCCESS; } - __spm_zmatvec_args_init( &args, SpmLeft, trans, - alpha, A, x, incx, y, incy ); + xtmp = (spm_complex64_t*)x; + ytmp = y; + distribution = spm_get_distribution(A); + if ( distribution != ( SpmDistByColumn | SpmDistByRow ) ){ + + if ( A->mtxtype != SpmGeneral ) { + xtmp = z_spmm_build_Btmp( A, x, &incx, 1 ); + ytmp = z_spmm_build_Ctmp( A, y, &incy, 1 ); + } + else { + if( ( (trans != SpmNoTrans) && (distribution == 1) ) || + ( (trans == SpmNoTrans) && (distribution == 2) ) ) { + xtmp = z_spmm_build_Btmp( A, x, &incx, 1 ); + } + if( ( (trans == SpmNoTrans) && (distribution == 1) ) || + ( (trans != SpmNoTrans) && (distribution == 2) ) ) { + ytmp = z_spmm_build_Ctmp( A, y, &incy, 1 ); + } + } + } + __spm_zmatvec_args_init( &args, SpmLeft, trans, + alpha, A, xtmp, incx, ytmp, incy ); rc = args.loop_fct( &args ); + if ( ytmp != y ) { + z_spmReduceRHS( A, 1, ytmp, A->gNexp, y, A->nexp ); + free( ytmp ); + } + + if ( xtmp != x ) { + free( xtmp ); + } + return rc; } diff --git a/src/z_spm_print.c b/src/z_spm_print.c index 7ba3c73e5059470899c4840bfc6df03207278ffb..a462f0ac257bd5b580ad933a46ad36bda32ce00e 100644 --- a/src/z_spm_print.c +++ b/src/z_spm_print.c @@ -460,15 +460,15 @@ z_spmPrint( FILE *f, const spmatrix_t *spm ) *******************************************************************************/ void z_spmPrintRHS( FILE *f, const spmatrix_t *spm, - int n, const void *x, spm_int_t ldx ) + int nrhs, const void *x, spm_int_t ldx ) { const spm_complex64_t *xptr = (const spm_complex64_t *)x; spm_int_t i, j, ig, baseval; baseval = spmFindBase( spm ); - for( j=0; j<n; j++) { - for( i=0; i<spm->n; i++, xptr++ ) { + for( j=0; j<nrhs; j++) { + for( i=0; i < spm->nexp; i++, xptr++ ) { ig = (spm->loc2glob == NULL) ? i : spm->loc2glob[i] - baseval; z_spmPrintElt( f, ig, j, *xptr ); diff --git a/src/z_spm_reduce_rhs.c b/src/z_spm_reduce_rhs.c index 69e96fb3dfd0df5e1cec86b81c2a51500659c993..2a331d97c8a08d9f7dea4e986d56973776b3d28b 100644 --- a/src/z_spm_reduce_rhs.c +++ b/src/z_spm_reduce_rhs.c @@ -9,8 +9,7 @@ * * @version 1.0.0 * @author Delarue Tony - * @author Faverge Mathieu - * @date 2020-06-11 + * @date 2020-03-08 * * @precisions normal z -> c d s * @@ -21,135 +20,65 @@ /** ******************************************************************************* * - * @brief Reduce all the global C coefficients and store the good ones in local + * @brief Reduce all the global coefficients of a rhs and store the local ones * ******************************************************************************* * * @param[in] spm * The sparse matrix spm * - * @param[inout] glob - * The global vector to reduce + * @param[in] nrhs + * Number of rhs vectors. * - * @param[inout] loc - * Local vector + * @param[inout] bglob + * The global rhs vector to reduce. + * + * @param[inout] bloc + * Local rhs vector. + * + * @param[inout] ldb + * Leading dimension of the global b vector. * *******************************************************************************/ -spm_complex64_t * +void z_spmReduceRHS( const spmatrix_t *spm, - int nrhs, - const spm_complex64_t *x, - spm_int_t ldx, - int root ) + int nrhs, + spm_complex64_t *bglob, + spm_int_t ldbglob, + spm_complex64_t *b, + spm_int_t ldb ) { - spm_complex64_t *outptr, *out = NULL; - int c; - int n, nmax = 0; - int nexp, nexpmax = 0; - spm_complex64_t *tmp_out, *current_out; - spm_int_t *tmp_l2g, *current_l2g; - spm_int_t current_n, current_nexp; - spm_int_t i, j, k, ig, dofi, row; - int local = ( ( root == -1 ) || (root == spm->clustnum) ); - spm_int_t baseval = spmFindBase( spm ); - - /* We do not handle cases where ldx is different from spm->n */ - assert( ldx == spm->nexp ); +#if defined(SPM_WITH_MPI) + spm_int_t i, j, k; + spm_int_t ig, dofi, row, baseval; + spm_int_t *loc2glob; + spm_complex64_t *rhs = b; if ( spm->loc2glob == NULL ) { - if ( local ) { - out = malloc( spm->gNexp * nrhs * sizeof( spm_complex64_t ) ); - memcpy( out, x, spm->gNexp * nrhs * sizeof( spm_complex64_t ) ); - } - return out; + return; } -#if !defined(SPM_WITH_MPI) - assert( 0 ); -#else - n = spm->n; - nexp = spm->nexp; - if ( root == -1 ) { - MPI_Allreduce( &n, &nmax, 1, MPI_INT, MPI_MAX, spm->comm ); - MPI_Allreduce( &nexp, &nexpmax, 1, MPI_INT, MPI_MAX, spm->comm ); - } - else { - MPI_Reduce( &n, &nmax, 1, MPI_INT, MPI_MAX, root, spm->comm ); - MPI_Reduce( &nexp, &nexpmax, 1, MPI_INT, MPI_MAX, root, spm->comm ); - } - - if ( local ) { - out = malloc( spm->gNexp * nrhs * sizeof( spm_complex64_t ) ); - tmp_out = malloc( nexpmax * nrhs * sizeof( spm_complex64_t ) ); - tmp_l2g = malloc( nmax * sizeof( spm_int_t ) ); - } - - for( c=0; c<spm->clustnbr; c++ ) { - MPI_Status status; - - if ( c == spm->clustnum ) { - current_n = spm->n; - current_nexp = spm->nexp; - current_l2g = spm->loc2glob; - current_out = (spm_complex64_t*)x; - } - else { - current_n = 0; - current_nexp = 0; - current_l2g = tmp_l2g; - current_out = tmp_out; - } + MPI_Allreduce( MPI_IN_PLACE, bglob, ldbglob * nrhs, SPM_MPI_COMPLEX64, MPI_SUM, spm->comm ); - if ( root == -1 ) { - MPI_Bcast( ¤t_n, 1, SPM_MPI_INT, c, spm->comm ); - MPI_Bcast( ¤t_nexp, 1, SPM_MPI_INT, c, spm->comm ); - if ( current_n > 0 ) { - MPI_Bcast( current_l2g, current_n, SPM_MPI_INT, c, spm->comm ); - MPI_Bcast( current_out, current_nexp * nrhs, SPM_MPI_COMPLEX64, c, spm->comm ); + baseval = spmFindBase( spm ); + loc2glob = spm->loc2glob; + for( i=0; i<spm->n; i++, loc2glob++ ) { + ig = *loc2glob - baseval; + dofi = ( spm->dof > 0 ) ? spm->dof : spm->dofs[ig+1] - spm->dofs[ig]; + row = ( spm->dof > 0 ) ? spm->dof * ig : spm->dofs[ig] - baseval; + for( j=0; j<nrhs; j++ ) { + for( k=0; k<dofi; k++ ) { + rhs[ j * ldb + k ] = bglob[ row + j * ldbglob + k ]; } } - else if ( root != c ) { /* No communication if c == root */ - if ( c == spm->clustnum ) { - MPI_Send( ¤t_n, 1, SPM_MPI_INT, root, 0, spm->comm ); - MPI_Send( ¤t_nexp, 1, SPM_MPI_INT, root, 1, spm->comm ); - if ( current_n > 0 ) { - MPI_Send( current_l2g, current_n, SPM_MPI_INT, root, 2, spm->comm ); - MPI_Send( current_out, current_nexp * nrhs, SPM_MPI_COMPLEX64, root, 3, spm->comm ); - } - } - if ( root == spm->clustnum ) { - MPI_Recv( ¤t_n, 1, SPM_MPI_INT, c, 0, spm->comm, &status ); - MPI_Recv( ¤t_nexp, 1, SPM_MPI_INT, c, 1, spm->comm, &status ); - if ( current_n > 0 ) { - MPI_Recv( current_l2g, current_n, SPM_MPI_INT, c, 2, spm->comm, &status ); - MPI_Recv( current_out, current_nexp * nrhs, SPM_MPI_COMPLEX64, c, 3, spm->comm, &status ); - } - } - } - - if ( !local ) { - continue; - } - - outptr = out; - for( i=0; i<current_n; i++, current_l2g++ ) { - ig = *current_l2g - baseval; - dofi = ( spm->dof > 0 ) ? spm->dof : spm->dofs[ig+1] - spm->dofs[ig]; - row = ( spm->dof > 0 ) ? spm->dof * ig : spm->dofs[ig] - baseval; - for( j=0; j<nrhs; j++ ) { - for( k=0; k<dofi; k++ ) { - outptr[ row + j * spm->gNexp + k ] = current_out[ j * current_nexp + k ]; - } - } - current_out += dofi; - } - } - - if ( local ) { - free( tmp_out ); - free( tmp_l2g ); + rhs += dofi; } - +#else + (void)spm; + (void)nrhs; + (void)bglob; + (void)ldbglob; + (void)b; + (void)ldb; #endif - return out; -} +} \ No newline at end of file diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 311bff7fd1adae3dd94ea4dede2e4db8d473189d..815e11c4c9d80872f9f773bf36822a3314b6a07b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -53,6 +53,7 @@ if ( SPM_WITH_MPI ) spm_scatter_gather_tests.c spm_dist_norm_tests.c spm_dist_genrhs_tests.c + spm_dist_matvec_tests.c ) endif() @@ -70,7 +71,9 @@ set( SPM_DOF_TESTS set( SPM_MPI_TESTS spm_scatter_gather_tests spm_dist_norm_tests - spm_dist_genrhs_tests ) + spm_dist_genrhs_tests + spm_dist_matvec_tests + ) # List of run types set( RUNTYPE shm ) diff --git a/tests/spm_compare.c b/tests/spm_compare.c index d16adad322d211742633ae52c8f427b121da4db7..123c6b071cf9685118cb593233083bf49b5a0243 100644 --- a/tests/spm_compare.c +++ b/tests/spm_compare.c @@ -138,7 +138,9 @@ int spmCompare( spmatrix_t *spm1, spmatrix_t *spm2 ) { - spm_int_t i, base1, base2; + spm_int_t i; + spm_int_t base1 = -1; + spm_int_t base2 = -2; int rc = 0; /* I don't have one of the matrices, I jump to the findbase computations */ diff --git a/tests/spm_dist_genrhs_tests.c b/tests/spm_dist_genrhs_tests.c index a0fb79f6b2319868748f97640024983d74a94058..26d0c46e15bd70a66b94d585e88ff56b653e5740 100644 --- a/tests/spm_dist_genrhs_tests.c +++ b/tests/spm_dist_genrhs_tests.c @@ -149,7 +149,7 @@ int main (int argc, char **argv) printf( "SUCCESS\n" ); } else { - printf( "FAILURE\n" ); + printf( "FAILED\n" ); } } err = (rc != 0) ? err+1 : err; diff --git a/tests/spm_dist_matvec_tests.c b/tests/spm_dist_matvec_tests.c new file mode 100644 index 0000000000000000000000000000000000000000..7d2e270bdbe19e0a1f85bed61850c39642ef5b32 --- /dev/null +++ b/tests/spm_dist_matvec_tests.c @@ -0,0 +1,204 @@ +/** + * + * @file spm_dist_matvec_tests.c + * + * Tests and validate the spm_matvec routines. + * + * @copyright 2015-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + * @version 6.0.0 + * @author Mathieu Faverge + * @author Theophile Terraz + * @author Tony Delarue + * @date 2020-05-19 + * + **/ +#include <stdint.h> +#include <stdlib.h> +#include <stdio.h> +#include <math.h> +#include <string.h> +#include <assert.h> +#include <time.h> +#include "spm_tests.h" + +#if !defined(SPM_WITH_MPI) +#error "This test should not be compiled in non distributed version" +#endif + +#define PRINT_RES(_ret_) \ + if(_ret_) { \ + printf("FAILED(%d)\n", _ret_); \ + err++; \ + } \ + else { \ + printf("SUCCESS\n"); \ + } + +int main (int argc, char **argv) +{ + char *filename; + spmatrix_t original, *origdist, *spm; + spm_driver_t driver; + int clustnbr = 1; + int clustnum = 0; + spm_mtxtype_t mtxtype; + spm_fmttype_t fmttype; + spm_trans_t trans; + int baseval, distbycol; + int rc = SPM_SUCCESS; + int err = 0; + int dof, dofmax = 4; + int to_free = 0; + + MPI_Init( &argc, &argv ); + + /** + * Get options from command line + */ + spmGetOptions( argc, argv, + &driver, &filename ); + + rc = spmReadDriver( driver, filename, &original ); + free(filename); + + if ( rc != SPM_SUCCESS ) { + fprintf(stderr, "ERROR: Could not read the file, stop the test !!!\n"); + return EXIT_FAILURE; + } + + MPI_Comm_size( MPI_COMM_WORLD, &clustnbr ); + MPI_Comm_rank( MPI_COMM_WORLD, &clustnum ); + + if ( original.flttype == SpmPattern ) { + spmGenFakeValues( &original ); + } + + spmPrintInfo( &original, stdout ); + + if ( clustnum == 0 ) { + printf(" -- SPM Matrix-Vector Test --\n"); + } + + for( fmttype=SpmCSC; fmttype<=SpmIJV; fmttype++ ) { + + distbycol = (fmttype == SpmCSR) ? 0 : 1; + if ( spmConvert( fmttype, &original ) != SPM_SUCCESS ) { + fprintf( stderr, "Issue to convert to %s format\n", fmtnames[fmttype] ); + continue; + } + + origdist = spmScatter( &original, -1, NULL, distbycol, -1, MPI_COMM_WORLD ); + if ( origdist == NULL ) { + fprintf( stderr, "Failed to scatter the spm\n" ); + continue; + } + + for( dof=-1; dof<2; dof++ ) + { + if ( dof >= 0 ) { + spm = spmDofExtend( origdist, dof, dofmax ); + to_free = 1; + } + else { + spm = malloc(sizeof(spmatrix_t)); + memcpy( spm, origdist, sizeof(spmatrix_t) ); + to_free = 0; + } + + if ( spm == NULL ) { + fprintf( stderr, "Issue to extend the matrix\n" ); + continue; + } + + for( baseval=0; baseval<2; baseval++ ) + { + spmBase( spm, baseval ); + + for( mtxtype=SpmGeneral; mtxtype<=SpmHermitian; mtxtype++ ) + { + if ( (mtxtype == SpmHermitian) && + ( ((original.flttype != SpmComplex64) && + (original.flttype != SpmComplex32)) || + (original.mtxtype != SpmHermitian) ) ) + { + continue; + } + + if ( (mtxtype != SpmGeneral) && + (original.mtxtype == SpmGeneral) ) + { + continue; + } + + spm->mtxtype = mtxtype; + + for( trans=SpmNoTrans; trans<=SpmConjTrans; trans++ ) + { + if ( (trans == SpmConjTrans) && + ((spm->flttype != SpmComplex64) && (spm->flttype != SpmComplex32))) + { + continue; + } + + if(clustnum == 0) { + printf( " Case: %s / %s / dof(%8s) / base(%d) / %10s / %9s : ", + fltnames[spm->flttype], + fmtnames[spm->fmttype], + dofname[dof+1], + baseval, + mtxnames[mtxtype - SpmGeneral], + transnames[trans - SpmNoTrans] ); + } + + switch( spm->flttype ){ + case SpmComplex64: + rc = z_spm_dist_matvec_check( baseval, trans, spm ); + break; + + case SpmComplex32: + rc = c_spm_dist_matvec_check( baseval, trans, spm ); + break; + + case SpmFloat: + rc = s_spm_dist_matvec_check( baseval, trans, spm ); + break; + + case SpmDouble: + default: + rc = d_spm_dist_matvec_check( baseval, trans, spm ); + } + err = (rc != 0) ? err+1 : err; + } + } + } + + if ( spm != origdist ) { + if( to_free ){ + spmExit( spm ); + } + free( spm ); + } + } + + spmExit( origdist ); + free( origdist ); + } + + spmExit(&original); + + MPI_Finalize(); + + if( err == 0 ) { + if(clustnum == 0) { + printf(" -- All tests PASSED --\n"); + } + return EXIT_SUCCESS; + } + else + { + printf(" -- %d tests FAILED --\n", err); + return EXIT_FAILURE; + } +} diff --git a/tests/spm_dist_norm_tests.c b/tests/spm_dist_norm_tests.c index 8c7c7dd36f2fbefdf07874a06784e2eed71439ba..1369f92e1e4ed3d26b56b61aefe1a7860b4b5bcc 100644 --- a/tests/spm_dist_norm_tests.c +++ b/tests/spm_dist_norm_tests.c @@ -67,7 +67,9 @@ int main (int argc, char **argv) spmPrintInfo( &original, stdout ); - printf(" -- SPM Norms Test --\n"); + if ( clustnum == 0 ) { + printf(" -- SPM Norms Test --\n"); + } for( fmttype=SpmCSC; fmttype<=SpmIJV; fmttype++ ) { @@ -128,7 +130,7 @@ int main (int argc, char **argv) } spmdist->mtxtype = mtxtype; - if(clustnum == 0) { + if ( clustnum == 0 ) { printf( " Case: %s / %s / %d / %s / %d\n", fltnames[spmdist->flttype], fmtnames[spmdist->fmttype], baseval, diff --git a/tests/spm_tests.h b/tests/spm_tests.h index 7c6a2e496c4585bfb04eb89b51c7c8c5f7fbe006..85dd17ab26f7f2bd503d492d906a5476f3e62ce6 100644 --- a/tests/spm_tests.h +++ b/tests/spm_tests.h @@ -85,6 +85,7 @@ int z_spm_norm_check( const spmatrix_t *spm ); int z_spm_dist_norm_check( const spmatrix_t *spm, const spmatrix_t *spmdist ); int z_spm_dist_genrhs_check( const spmatrix_t *spm, spm_int_t nrhs, const spm_complex64_t *bloc, const spm_complex64_t *bdst, int root ); +int z_spm_dist_matvec_check( spm_int_t baseval, spm_trans_t trans, const spmatrix_t *spm ); void c_spm_print_check( char *filename, const spmatrix_t *spm ); int c_spm_matvec_check( spm_trans_t trans, const spmatrix_t *spm ); @@ -92,6 +93,7 @@ int c_spm_norm_check( const spmatrix_t *spm ); int c_spm_dist_norm_check( const spmatrix_t *spm, const spmatrix_t *spmdist ); int c_spm_dist_genrhs_check( const spmatrix_t *spm, spm_int_t nrhs, const spm_complex32_t *bloc, const spm_complex32_t *bdst, int root ); +int c_spm_dist_matvec_check( spm_int_t baseval, spm_trans_t trans, const spmatrix_t *spm ); void d_spm_print_check( char *filename, const spmatrix_t *spm ); int d_spm_matvec_check( spm_trans_t trans, const spmatrix_t *spm ); @@ -99,6 +101,7 @@ int d_spm_norm_check( const spmatrix_t *spm ); int d_spm_dist_norm_check( const spmatrix_t *spm, const spmatrix_t *spmdist ); int d_spm_dist_genrhs_check( const spmatrix_t *spm, spm_int_t nrhs, const double *bloc, const double *bdst, int root ); +int d_spm_dist_matvec_check( spm_int_t baseval, spm_trans_t trans, const spmatrix_t *spm ); void s_spm_print_check( char *filename, const spmatrix_t *spm ); int s_spm_matvec_check( spm_trans_t trans, const spmatrix_t *spm ); @@ -106,6 +109,7 @@ int s_spm_norm_check( const spmatrix_t *spm ); int s_spm_dist_norm_check( const spmatrix_t *spm, const spmatrix_t *spmdist ); int s_spm_dist_genrhs_check( const spmatrix_t *spm, spm_int_t nrhs, const float *bloc, const float *bdst, int root ); +int s_spm_dist_matvec_check( spm_int_t baseval, spm_trans_t trans, const spmatrix_t *spm ); void p_spm_print_check( char *filename, const spmatrix_t *spm ); diff --git a/tests/z_spm_tests.c b/tests/z_spm_tests.c index dcbfebc3f47ebacb3497a118d3ebe0871eb0c9d1..c53a0900c9c845312b8f437c1456d24b152c6b0f 100644 --- a/tests/z_spm_tests.c +++ b/tests/z_spm_tests.c @@ -353,7 +353,7 @@ z_spm_dist_genrhs_check( const spmatrix_t *spm, norm = LAPACKE_zlange( LAPACK_COL_MAJOR, 'M', M, nrhs, bloc, M ); /* Let's gather the distributed RHS */ - tmp = z_spmReduceRHS( spm, nrhs, bdst, spm->nexp, root ); + tmp = z_spmGatherRHS( spm, nrhs, bdst, spm->nexp, root ); rc = 0; if ( tmp != NULL ) { @@ -385,4 +385,116 @@ z_spm_dist_genrhs_check( const spmatrix_t *spm, return ret; } + +/*------------------------------------------------------------------------ + * Check the accuracy of the solution + */ +int +z_spm_dist_matvec_check( spm_int_t baseval, spm_trans_t trans, const spmatrix_t *spm ) +{ + spmatrix_t *spmloc; + unsigned long long int seed = 35469; + unsigned long long int seedx = 24632; + unsigned long long int seedy = 73246; + spm_complex64_t *x, *y, *yd; + /* + * Alpha and beta are complex for cblas, but only the real part is used for + * matvec/matmat subroutines + */ + double dalpha = 0.; + double dbeta = 0.; + + double Anorm, Xnorm, Ynorm, Ylnorm, Ydnorm, Rnorm; + double eps, result; + int rc, info_solution, start = 1; + spm_int_t ldl, ldd, nrhs = 1; + + eps = LAPACKE_dlamch_work('e'); + + core_dplrnt( 1, 1, &dalpha, 1, 1, start, 0, seed ); start++; + core_dplrnt( 1, 1, &dbeta, 1, 1, start, 0, seed ); start++; + + ldd = spm->nexp; + ldl = spm->gNexp; + + /* Generate random x and y in distributed */ + x = (spm_complex64_t*)malloc( ldd * nrhs * sizeof(spm_complex64_t) ); + z_spmRhsGenRndDist( spm, baseval, 1., nrhs, x, spm->nexp, 1, seedx ); + + y = (spm_complex64_t*)malloc( ldd * nrhs * sizeof(spm_complex64_t) ); + z_spmRhsGenRndDist( spm, baseval, 1., nrhs, y, spm->nexp, 1, seedy ); + + /* Compute the distributed sparse matrix-vector product */ + rc = spmMatMat( trans, nrhs, dalpha, spm, x, ldd, dbeta, y, ldd ); + if ( rc != SPM_SUCCESS ) { + info_solution = 1; + goto end; + } + + /* Compute matrix norm and gather info */ + Anorm = spmNorm( SpmInfNorm, spm ); + spmloc = spmGather( spm, 0 ); + yd = z_spmGatherRHS( spm, nrhs, y, ldd, 0 ); + + /* Compute the local sparse matrix-vector product */ + if ( spm->clustnum == 0 ) { + spm_complex64_t *xl, *yl; + + /* Generate xl and yl as x and y locally on 0 */ + xl = (spm_complex64_t*)malloc( ldl * nrhs * sizeof(spm_complex64_t) ); + z_spmRhsGenRndShm( spmloc, baseval, 1., nrhs, xl, spm->nexp, 1, seedx ); + + yl = (spm_complex64_t*)malloc( ldl * nrhs * sizeof(spm_complex64_t) ); + z_spmRhsGenRndShm( spmloc, baseval, 1., nrhs, yl, spm->nexp, 1, seedy ); + + /* Compute the original norms */ + Xnorm = LAPACKE_zlange( LAPACK_COL_MAJOR, 'I', ldl, nrhs, xl, ldl ); + Ynorm = LAPACKE_zlange( LAPACK_COL_MAJOR, 'I', ldl, nrhs, yl, ldl ); + + rc = spmMatMat( trans, nrhs, dalpha, spmloc, xl, ldl, dbeta, yl, ldl ); + if ( rc != SPM_SUCCESS ) { + info_solution = 1; + goto end; + } + + /* Compute the final norm in shared memory */ + Ylnorm = LAPACKE_zlange( LAPACK_COL_MAJOR, 'I', ldl, nrhs, yl, ldl ); + Ydnorm = LAPACKE_zlange( LAPACK_COL_MAJOR, 'I', ldl, nrhs, yd, ldl ); + + core_zgeadd( SpmNoTrans, ldl, nrhs, + -1., yl, ldl, + 1., yd, ldl ); + Rnorm = LAPACKE_zlange( LAPACK_COL_MAJOR, 'M', ldl, nrhs, yd, ldl ); + + + result = Rnorm / ( (Anorm + Xnorm + Ynorm) * spm->gNexp * eps ); + if ( isinf(Ydnorm) || isinf(Ylnorm) || + isnan(result) || isinf(result) || (result > 10.0) ) + { + info_solution = 1; + printf( "FAILED !\n" ); + /* " ||A||_inf = %e, ||x||_inf = %e, ||y||_inf = %e\n" */ + /* " ||shm(a*A*x+b*y)||_inf = %e, ||dist(a*A*x+b*y)||_inf = %e, ||R||_m = %e\n", */ + /* Anorm, Xnorm, Ynorm, Ylnorm, Ydnorm, Rnorm ); */ + } + else { + info_solution = 0; + printf("SUCCESS !\n"); + } + + free( xl ); + free( yl ); + free( yd ); + spmExit( spmloc ); + free( spmloc ); + } + + MPI_Bcast( &info_solution, 1, MPI_INT, 0, spm->comm ); + + end: + free( x ); + free( y ); + + return info_solution; +} #endif