From 89ef88bb082c943dc4702b419f8e05fcd7aaf6a6 Mon Sep 17 00:00:00 2001
From: tdelarue <tony.delarue@inria.fr>
Date: Wed, 25 Mar 2020 17:42:08 +0100
Subject: [PATCH] spmm and spmv are now distributed. Dofless

---
 CMakeLists.txt                |   1 +
 include/spm.h                 |   3 +-
 src/spm.c                     |  78 ++++++++++-
 src/z_spm.h                   |  15 ++-
 src/z_spm_gather_rhs.c        | 166 +++++++++++++++++++++++
 src/z_spm_genrhs.c            |   7 +-
 src/z_spm_matrixvector.c      | 241 ++++++++++++++++++++++++++++------
 src/z_spm_print.c             |   6 +-
 src/z_spm_reduce_rhs.c        | 157 ++++++----------------
 tests/CMakeLists.txt          |   5 +-
 tests/spm_compare.c           |   4 +-
 tests/spm_dist_genrhs_tests.c |   2 +-
 tests/spm_dist_matvec_tests.c | 204 ++++++++++++++++++++++++++++
 tests/spm_dist_norm_tests.c   |   6 +-
 tests/spm_tests.h             |   4 +
 tests/z_spm_tests.c           | 114 +++++++++++++++-
 16 files changed, 837 insertions(+), 176 deletions(-)
 create mode 100644 src/z_spm_gather_rhs.c
 create mode 100644 tests/spm_dist_matvec_tests.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3f897fcd..fd7753a5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -196,6 +196,7 @@ set(SOURCES
   src/z_spm_dof_extend.c
   src/z_spm_norm.c
   src/z_spm_scal.c
+  src/z_spm_gather_rhs.c
   src/z_spm_reduce_rhs.c
   src/z_spm_convert_to_csc.c
   src/z_spm_convert_to_csr.c
diff --git a/include/spm.h b/include/spm.h
index 0f46e18f..5d52866c 100644
--- a/include/spm.h
+++ b/include/spm.h
@@ -116,6 +116,7 @@ spmatrix_t *spmScatter( const spmatrix_t *spm,
                         SPM_Comm          comm );
 spmatrix_t *spmGather ( const spmatrix_t *spm,
                               int         root );
+int spmGetDistribution( const spmatrix_t *spm );
 
 /**
  * @}
@@ -207,7 +208,7 @@ int spmParseLaplacianInfo( const char *    filename,
  */
 void *      spm2Dense( const spmatrix_t *spm );
 void        spmPrint( const spmatrix_t *spm, FILE *f );
-void        spmPrintRHS( const spmatrix_t *spm, int n, const void *x, spm_int_t ldx, FILE *stream );
+void        spmPrintRHS( const spmatrix_t *spm, int nrhs, const void *x, spm_int_t ldx, FILE *stream );
 void        spmPrintInfo( const spmatrix_t *spm, FILE *f );
 void        spmExpand( const spmatrix_t *spm_in, spmatrix_t *spm_out );
 spmatrix_t *spmDofExtend( const spmatrix_t *spm, const int type, const int dof );
diff --git a/src/spm.c b/src/spm.c
index b492e714..31ab2b36 100644
--- a/src/spm.c
+++ b/src/spm.c
@@ -128,6 +128,70 @@ spmInitDist( spmatrix_t *spm, SPM_Comm comm )
 #endif /* defined(SPM_WITH_MPI) */
 }
 
+/**
+ *******************************************************************************
+ *
+ * @brief Search the distribution pattern used in the spm structure.
+ *
+ *******************************************************************************
+ *
+ * @param[in] spm
+ *          The sparse matrix structure.
+ *
+ ********************************************************************************
+ *
+ * @return  1 if the distribution is column based.
+ *          0 otherwise.
+ *
+ *******************************************************************************/
+int
+spmGetDistribution( const spmatrix_t *spm )
+{
+    int distribution = 1;
+
+    if( spm->fmttype == SpmCSC ){
+        distribution = 1;
+    }
+    else if ( spm->fmttype == SpmCSR ) {
+        distribution = 0;
+    }
+    else {
+        spm_int_t  i, baseval;
+        spm_int_t *colptr   = spm->colptr;
+        spm_int_t *glob2loc = spm->glob2loc;
+
+        baseval = spmFindBase( spm );
+        assert( glob2loc != NULL );
+        for ( i = 0; i < spm->nnz; i++, colptr++ )
+        {
+            /*
+             * If the global index is not in the local colptr
+             * -> row distribution
+             */
+            if( glob2loc[ *colptr - baseval  ] < 0 ) {
+                distribution = 0;
+                break;
+            }
+        }
+
+#if defined(SPM_WITH_MPI)
+        {
+            int check = 0;
+            MPI_Allreduce( &distribution, &check, 1, MPI_INT,
+                           MPI_SUM, spm->comm );
+            if( distribution == 0) {
+                assert( check == 0 );
+            }
+            else {
+                assert( check == spm->clustnbr );
+            }
+        }
+#endif
+    }
+
+    return distribution;
+}
+
 /**
  *******************************************************************************
  *
@@ -848,6 +912,10 @@ spmCopy( const spmatrix_t *spm )
         newspm->loc2glob = (spm_int_t*)malloc( spm->n * sizeof(spm_int_t) );
         memcpy( newspm->loc2glob, spm->loc2glob, spm->n * sizeof(spm_int_t) );
     }
+    if(spm->glob2loc != NULL) {
+        newspm->glob2loc = (spm_int_t*)malloc( spm->gN * sizeof(spm_int_t) );
+        memcpy( newspm->glob2loc, spm->glob2loc, spm->gN * sizeof(spm_int_t) );
+    }
     if(spm->dofs != NULL) {
         newspm->dofs = (spm_int_t*)malloc( dofsize * sizeof(spm_int_t) );
         memcpy( newspm->dofs, spm->dofs, dofsize * sizeof(spm_int_t) );
@@ -1018,7 +1086,7 @@ spmPrint( const spmatrix_t *spm,
  *******************************************************************************/
 void
 spmPrintRHS( const spmatrix_t *spm,
-             int               n,
+             int               nrhs,
              const void       *x,
              spm_int_t         ldx,
              FILE             *stream )
@@ -1033,16 +1101,16 @@ spmPrintRHS( const spmatrix_t *spm,
         /* Not handled for now */
         break;
     case SpmFloat:
-        s_spmPrintRHS( stream, spm, n, x, ldx );
+        s_spmPrintRHS( stream, spm, nrhs, x, ldx );
         break;
     case SpmComplex32:
-        c_spmPrintRHS( stream, spm, n, x, ldx );
+        c_spmPrintRHS( stream, spm, nrhs, x, ldx );
         break;
     case SpmComplex64:
-        z_spmPrintRHS( stream, spm, n, x, ldx );
+        z_spmPrintRHS( stream, spm, nrhs, x, ldx );
         break;
     case SpmDouble:
-        d_spmPrintRHS( stream, spm, n, x, ldx );
+        d_spmPrintRHS( stream, spm, nrhs, x, ldx );
     }
 
     return;
diff --git a/src/z_spm.h b/src/z_spm.h
index 4fbe1b06..5e0f6860 100644
--- a/src/z_spm.h
+++ b/src/z_spm.h
@@ -75,17 +75,28 @@ spm_int_t z_spmSymmetrize( spmatrix_t *spm );
 
 int              z_spmGenRHS(spm_rhstype_t type, int nrhs, const spmatrix_t *spm, void *x, int ldx, void *b, int ldb );
 int              z_spmCheckAxb( spm_fixdbl_t eps, int nrhs, const spmatrix_t *spm, void *x0, int ldx0, void *b, int ldb, const void *x, int ldx );
-spm_complex64_t *z_spmReduceRHS( const spmatrix_t *spm, int nrhs, const spm_complex64_t *x, spm_int_t ldx, int root );
+spm_complex64_t *z_spmGatherRHS( const spmatrix_t *spm, int nrhs, const spm_complex64_t *x, spm_int_t ldx, int root );
+void             z_spmReduceRhs( const spmatrix_t *spm, int nrhs, spm_complex64_t *bglob, spm_complex64_t *b, spm_int_t ldb );
 
 /**
  * Output routines
  */
 void z_spmDensePrint( FILE *f, spm_int_t m, spm_int_t n, const spm_complex64_t *A, spm_int_t lda );
 void z_spmPrint( FILE *f, const spmatrix_t *spm );
-void z_spmPrintRHS( FILE *f, const spmatrix_t *spm, int n, const void *x, spm_int_t ldx );
+void z_spmPrintRHS( FILE *f, const spmatrix_t *spm, int nrhs, const void *x, spm_int_t ldx );
 
 void z_spmExpand( const spmatrix_t *spm_in, spmatrix_t *spm_out );
 void z_spmDofExtend( spmatrix_t *spm );
 void z_spmScal( const double alpha, spmatrix_t *spm );
 
+/**
+ * Subroutines for random vector generation to be used in testings
+ */
+void z_spmRhsGenRndShm( const spmatrix_t *spm, spm_int_t baseval, spm_complex64_t scale,
+                        spm_int_t n, spm_complex64_t *A, spm_int_t lda,
+                        int shift, unsigned long long int seed );
+void z_spmRhsGenRndDist( const spmatrix_t *spm, spm_int_t baseval, spm_complex64_t scale,
+                         spm_int_t n, spm_complex64_t *A, spm_int_t lda,
+                         int shift, unsigned long long int seed );
+
 #endif /* _z_spm_h_ */
diff --git a/src/z_spm_gather_rhs.c b/src/z_spm_gather_rhs.c
new file mode 100644
index 00000000..ca4b44b1
--- /dev/null
+++ b/src/z_spm_gather_rhs.c
@@ -0,0 +1,166 @@
+/**
+ *
+ * @file z_spm_gather_rhs.c
+ *
+ * SParse Matrix package right hand side gather routine.
+ *
+ * @copyright 2020-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ * @version 1.0.0
+ * @author Delarue Tony
+ * @author Faverge Mathieu
+ * @date 2020-06-11
+ *
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "common.h"
+#include "z_spm.h"
+
+/**
+ *******************************************************************************
+ *
+ * @brief Gather all the global C coefficients and store the good ones in local
+ *
+ *******************************************************************************
+ *
+ * @param[in] spm
+ *          The sparse matrix spm
+ *
+ * @param[in] nrhs
+ *          Number of rhs vectors.
+ *
+ * @param[in] x
+ *          Local vector
+ *
+ * @param[in] ldx
+ *          Leading dimension of this vector
+ *
+ * @param[in] root
+ *          Clustnum whre the complete vector will be gathered.
+ *          -1 if you want to gather the data on all nodes.
+ *
+ ********************************************************************************
+ *
+ * @return The gathered right hand side vector.
+ *
+ *******************************************************************************/
+spm_complex64_t *
+z_spmGatherRHS( const spmatrix_t      *spm,
+                int                    nrhs,
+                const spm_complex64_t *x,
+                spm_int_t              ldx,
+                int                    root )
+{
+    int local = ( ( root == -1 ) || (root == spm->clustnum) );
+    spm_complex64_t *out = NULL;
+
+    /* We do not handle cases where ldx is different from spm->n */
+    assert( ldx == spm->nexp );
+
+    if ( spm->loc2glob == NULL ) {
+        if ( local ) {
+            out = malloc( spm->gNexp * nrhs * sizeof( spm_complex64_t ) );
+            memcpy( out, x, spm->gNexp * nrhs * sizeof( spm_complex64_t ) );
+        }
+        (void)ldx;
+        return out;
+    }
+
+#if defined(SPM_WITH_MPI)
+    int              c;
+    int              n, nmax = 0;
+    int              nexp, nexpmax = 0;
+    spm_complex64_t *tmp_out, *current_out, *outptr;
+    spm_int_t       *tmp_l2g, *current_l2g;
+    spm_int_t        current_n, current_nexp;
+    spm_int_t        i, j, k, ig, dofi, row;
+    spm_int_t        baseval = spmFindBase( spm );
+
+    n    = spm->n;
+    nexp = spm->nexp;
+    if ( root == -1 ) {
+        MPI_Allreduce( &n,    &nmax,    1, MPI_INT, MPI_MAX, spm->comm );
+        MPI_Allreduce( &nexp, &nexpmax, 1, MPI_INT, MPI_MAX, spm->comm );
+    }
+    else {
+        MPI_Reduce( &n,    &nmax,    1, MPI_INT, MPI_MAX, root, spm->comm );
+        MPI_Reduce( &nexp, &nexpmax, 1, MPI_INT, MPI_MAX, root, spm->comm );
+    }
+
+    if ( local ) {
+        out     = malloc( spm->gNexp * nrhs * sizeof( spm_complex64_t ) );
+        tmp_out = malloc( nexpmax    * nrhs * sizeof( spm_complex64_t ) );
+        tmp_l2g = malloc( nmax              * sizeof( spm_int_t )       );
+    }
+
+    for( c=0; c<spm->clustnbr; c++ ) {
+        MPI_Status status;
+
+        if ( c == spm->clustnum ) {
+            current_n    = spm->n;
+            current_nexp = spm->nexp;
+            current_l2g  = spm->loc2glob;
+            current_out  = (spm_complex64_t*)x;
+        }
+        else {
+            current_n    = 0;
+            current_nexp = 0;
+            current_l2g  = tmp_l2g;
+            current_out  = tmp_out;
+        }
+
+        if ( root == -1 ) {
+            MPI_Bcast( &current_n,    1, SPM_MPI_INT, c, spm->comm );
+            MPI_Bcast( &current_nexp, 1, SPM_MPI_INT, c, spm->comm );
+            if ( current_n > 0 ) {
+                MPI_Bcast( current_l2g, current_n,           SPM_MPI_INT,       c, spm->comm );
+                MPI_Bcast( current_out, current_nexp * nrhs, SPM_MPI_COMPLEX64, c, spm->comm );
+            }
+        }
+        else if ( root != c ) { /* No communication if c == root */
+            if ( c == spm->clustnum ) {
+                MPI_Send( &current_n,    1, SPM_MPI_INT, root, 0, spm->comm );
+                MPI_Send( &current_nexp, 1, SPM_MPI_INT, root, 1, spm->comm );
+                if ( current_n > 0 ) {
+                    MPI_Send( current_l2g, current_n,           SPM_MPI_INT,       root, 2, spm->comm );
+                    MPI_Send( current_out, current_nexp * nrhs, SPM_MPI_COMPLEX64, root, 3, spm->comm );
+                }
+            }
+            if ( root == spm->clustnum ) {
+                MPI_Recv( &current_n,    1, SPM_MPI_INT, c, 0, spm->comm, &status );
+                MPI_Recv( &current_nexp, 1, SPM_MPI_INT, c, 1, spm->comm, &status );
+                if ( current_n > 0 ) {
+                    MPI_Recv( current_l2g, current_n,           SPM_MPI_INT,       c, 2, spm->comm, &status );
+                    MPI_Recv( current_out, current_nexp * nrhs, SPM_MPI_COMPLEX64, c, 3, spm->comm, &status );
+                }
+            }
+        }
+
+        if ( !local ) {
+            continue;
+        }
+
+        outptr = out;
+        for( i=0; i<current_n; i++, current_l2g++ ) {
+            ig   = *current_l2g - baseval;
+            dofi = ( spm->dof > 0 ) ? spm->dof : spm->dofs[ig+1] - spm->dofs[ig];
+            row  = ( spm->dof > 0 ) ? spm->dof * ig : spm->dofs[ig] - baseval;
+            for( j=0; j<nrhs; j++ ) {
+                for( k=0; k<dofi; k++ ) {
+                    outptr[ row + j * spm->gNexp + k ] = current_out[ j * current_nexp + k ];
+                }
+            }
+            current_out += dofi;
+        }
+    }
+
+    if ( local ) {
+        free( tmp_out );
+        free( tmp_l2g );
+    }
+
+#endif
+    return out;
+}
diff --git a/src/z_spm_genrhs.c b/src/z_spm_genrhs.c
index f466b8be..54f213cb 100644
--- a/src/z_spm_genrhs.c
+++ b/src/z_spm_genrhs.c
@@ -94,7 +94,7 @@ Rnd64_jump(unsigned long long int n, unsigned long long int seed ) {
  *
  ******************************************************************************/
 static inline void
-z_updateRndVal( double                  scale,
+z_updateRndVal( spm_complex64_t         scale,
                 spm_complex64_t        *val,
                 unsigned long long int *ran )
 {
@@ -265,7 +265,7 @@ z_spmRhsGenI( const spmatrix_t *spm, spm_int_t baseval,
  *         all tiles initialized with this routine.
  *
  ******************************************************************************/
-static inline void
+void
 z_spmRhsGenRndShm( const spmatrix_t *spm, spm_int_t baseval,
                    spm_complex64_t scale,
                    spm_int_t n, spm_complex64_t *A, spm_int_t lda,
@@ -329,7 +329,7 @@ z_spmRhsGenRndShm( const spmatrix_t *spm, spm_int_t baseval,
  ******************************************************************************/
 void
 z_spmRhsGenRndDist( const spmatrix_t *spm, spm_int_t baseval,
-                    double scale,
+                    spm_complex64_t scale,
                     spm_int_t n, spm_complex64_t *A, spm_int_t lda,
                     int shift, unsigned long long int seed )
 {
@@ -368,6 +368,7 @@ z_spmRhsGenRndDist( const spmatrix_t *spm, spm_int_t baseval,
             }
         }
     }
+    (void)lda;
 }
 
 /**
diff --git a/src/z_spm_matrixvector.c b/src/z_spm_matrixvector.c
index cf6078b0..d8a05c28 100644
--- a/src/z_spm_matrixvector.c
+++ b/src/z_spm_matrixvector.c
@@ -46,6 +46,10 @@ struct __spm_zmatvec_s {
     const spm_int_t       *rowptr;
     const spm_int_t       *colptr;
     const spm_complex64_t *values;
+    const spm_int_t       *loc2glob;
+
+    spm_int_t              dof;
+    const spm_int_t       *dofs;
 
     const spm_complex64_t *x;
     spm_int_t              incx;
@@ -67,26 +71,27 @@ __spm_zmatvec_sy_csr( const __spm_zmatvec_t *args )
     const spm_int_t       *rowptr     = args->rowptr;
     const spm_int_t       *colptr     = args->colptr;
     const spm_complex64_t *values     = args->values;
+    const spm_int_t       *loc2glob   = args->loc2glob;
     const spm_complex64_t *x          = args->x;
     spm_int_t              incx       = args->incx;
     spm_complex64_t       *y          = args->y;
     spm_int_t              incy       = args->incy;
     const __conj_fct_t     conjA_fct  = args->conjA_fct;
     const __conj_fct_t     conjAt_fct = args->conjAt_fct;
-    spm_int_t              col, row, i;
+    spm_int_t              col, gcol, grow, i;
 
     for( col=0; col<n; col++, colptr++ )
     {
+        gcol = (loc2glob == NULL) ? col : loc2glob[col] - baseval ;
         for( i=colptr[0]; i<colptr[1]; i++, rowptr++, values++ )
         {
-            row = *rowptr - baseval;
-
-            if ( row != col ) {
-                y[ row * incy ] += alpha * conjAt_fct( *values ) * x[ col * incx ];
-                y[ col * incy ] += alpha *  conjA_fct( *values ) * x[ row * incx ];
+            grow = *rowptr - baseval;
+            if ( grow != gcol ) {
+                y[ grow * incy ] += alpha * conjAt_fct( *values ) * x[ gcol * incx ];
+                y[ gcol * incy ] += alpha *  conjA_fct( *values ) * x[ grow * incx ];
             }
             else {
-                y[ col * incy ] += alpha *  conjA_fct( *values ) * x[ row * incx ];
+                y[ gcol * incy ] += alpha *  conjA_fct( *values ) * x[ grow * incx ];
             }
         }
     }
@@ -102,26 +107,28 @@ __spm_zmatvec_sy_csc( const __spm_zmatvec_t *args )
     const spm_int_t       *rowptr     = args->rowptr;
     const spm_int_t       *colptr     = args->colptr;
     const spm_complex64_t *values     = args->values;
+    const spm_int_t       *loc2glob   = args->loc2glob;
     const spm_complex64_t *x          = args->x;
     spm_int_t              incx       = args->incx;
     spm_complex64_t       *y          = args->y;
     spm_int_t              incy       = args->incy;
     const __conj_fct_t     conjA_fct  = args->conjA_fct;
     const __conj_fct_t     conjAt_fct = args->conjAt_fct;
-    spm_int_t              col, row, i;
+    spm_int_t              col, gcol, grow, i;
+
 
     for( col=0; col<n; col++, colptr++ )
     {
+        gcol = (loc2glob == NULL) ? col : loc2glob[col] - baseval ;
         for( i=colptr[0]; i<colptr[1]; i++, rowptr++, values++ )
         {
-            row = *rowptr - baseval;
-
-            if ( row != col ) {
-                y[ row * incy ] += alpha *  conjA_fct( *values ) * x[ col * incx ];
-                y[ col * incy ] += alpha * conjAt_fct( *values ) * x[ row * incx ];
+            grow = *rowptr - baseval;
+            if ( grow != gcol ) {
+                y[ grow * incy ] += alpha *  conjA_fct( *values ) * x[ gcol * incx ];
+                y[ gcol * incy ] += alpha * conjAt_fct( *values ) * x[ grow * incx ];
             }
             else {
-                y[ col * incy ] += alpha *  conjA_fct( *values ) * x[ row * incx ];
+                y[ gcol * incy ] += alpha *  conjA_fct( *values ) * x[ gcol * incx ];
             }
         }
     }
@@ -137,31 +144,59 @@ __spm_zmatvec_ge_csc( const __spm_zmatvec_t *args )
     const spm_int_t       *rowptr    = args->rowptr;
     const spm_int_t       *colptr    = args->colptr;
     const spm_complex64_t *values    = args->values;
+    const spm_int_t       *loc2glob  = args->loc2glob;
+    const spm_int_t       *dofs      = args->dofs;
+    spm_int_t              dof       = args->dof;
     const spm_complex64_t *x         = args->x;
     spm_int_t              incx      = args->incx;
     spm_complex64_t       *y         = args->y;
     spm_int_t              incy      = args->incy;
     const __conj_fct_t     conjA_fct = args->conjA_fct;
-    spm_int_t              col, row, i;
+    spm_int_t              row, dofj, dofi;
+    spm_int_t              i, ii, ig, j, jj, jg,;
 
     if ( args->follow_x ) {
-        for( col=0; col<n; col++, colptr++, x+=incx )
+        for( j = 0; j < n; j++, colptr++ )
         {
-            for( i=colptr[0]; i<colptr[1]; i++, rowptr++, values++ )
+            jg   = (loc2glob == NULL) ? j : loc2glob[j] - baseval;
+            dofj = ( dof > 0 ) ?      dof : dofs[jg+1] - dofs[jg];
+            for( i=colptr[0]; i<colptr[1]; i++, rowptr++ )
             {
-                row = *rowptr - baseval;
-                y[ row * incy ] += alpha * conjA_fct( *values ) * (*x);
+                ig   = *rowptr - baseval;
+                dofi = ( dof > 0 ) ? dof      : dofs[ig+1] - dofs[ig];
+                row  = ( dof > 0 ) ? dof * ig : dofs[ig] - baseval;
+
+                for(jj=0; jj<dofj; jj++)
+                   {
+                       for(ii=0; ii<dofi; ii++, values++)
+                       {
+                            y[ row + (ii * incy) ] += alpha * conjA_fct( *values ) * x[ jj ];
+                       }
+                   }
             }
+            x += (dofj * incx);
         }
     }
     else {
-        for( col=0; col<n; col++, colptr++, y+=incy )
+        for( j=0; j<n; j++, colptr++ )
         {
-            for( i=colptr[0]; i<colptr[1]; i++, rowptr++, values++ )
+            jg   = (loc2glob == NULL) ? j : loc2glob[j] - baseval;
+            dofj = ( dof > 0 ) ?      dof : dofs[jg+1] - dofs[jg];
+            for( i=colptr[0]; i<colptr[1]; i++, rowptr++ )
             {
-                row = *rowptr - baseval;
-                *y += alpha * conjA_fct( *values ) * x[ row * incx ];
+                ig   = *rowptr - baseval;
+                dofi = ( dof > 0 ) ? dof      : dofs[ig+1] - dofs[ig];
+                row  = ( dof > 0 ) ? dof * ig : dofs[ig] - baseval;
+
+                for ( jj = 0; jj < dofj; jj++)
+                {
+                    for ( ii = 0; ii < dofi; ii++, values++ )
+                    {
+                        y[jj] += alpha * conjA_fct( *values ) * x[ (row + ii) * incx ];
+                    }
+                }
             }
+            y += (dofj * incy);
         }
     }
     return SPM_SUCCESS;
@@ -209,6 +244,7 @@ __spm_zmatvec_ge_ijv( const __spm_zmatvec_t *args )
     const spm_int_t       *rowptr    = args->rowptr;
     const spm_int_t       *colptr    = args->colptr;
     const spm_complex64_t *values    = args->values;
+    const spm_int_t       *glob2loc  = args->loc2glob;
     const spm_complex64_t *x         = args->x;
     spm_int_t              incx      = args->incx;
     spm_complex64_t       *y         = args->y;
@@ -216,13 +252,25 @@ __spm_zmatvec_ge_ijv( const __spm_zmatvec_t *args )
     const __conj_fct_t     conjA_fct = args->conjA_fct;
     spm_int_t              col, row, i;
 
-    for( i=0; i<nnz; i++, colptr++, rowptr++, values++ )
-    {
-        row = *rowptr - baseval;
-        col = *colptr - baseval;
+    if( args->follow_x ) {
+        for( i=0; i<nnz; i++, colptr++, rowptr++, values++ )
+        {
+            row = *rowptr - baseval;
+            col = (glob2loc == NULL) ? *colptr - baseval : glob2loc[ *colptr - baseval ];
 
-        y[ row * incy ] += alpha * conjA_fct( *values ) * x[ col * incx ];
+            y[ row * incy ] += alpha * conjA_fct( *values ) * x[ col * incx ];
+        }
     }
+    else {
+        for( i=0; i<nnz; i++, colptr++, rowptr++, values++ )
+        {
+            row = (glob2loc == NULL) ? *rowptr - baseval : glob2loc[ *rowptr - baseval ];
+            col = *colptr - baseval;
+
+            y[ row * incy ] += alpha * conjA_fct( *values ) * x[ col * incx ];
+        }
+    }
+
     return SPM_SUCCESS;
 }
 
@@ -280,6 +328,9 @@ __spm_zmatvec_args_init( __spm_zmatvec_t       *args,
     args->rowptr     = A->rowptr;
     args->colptr     = A->colptr;
     args->values     = A->values;
+    args->loc2glob   = A->loc2glob;
+    args->dof        = A->dof;
+    args->dofs       = A->dofs;
     args->x          = B;
     args->incx       = incx;
     args->y          = C;
@@ -349,7 +400,11 @@ __spm_zmatvec_args_init( __spm_zmatvec_t       *args,
             args->conjAt_fct = tmp_fct;
             args->colptr = A->rowptr;
             args->rowptr = A->colptr;
+            args->follow_x = 0;
+        } else {
+            args->follow_x = 1;
         }
+        args->loc2glob = A->glob2loc;
         args->loop_fct = (A->mtxtype == SpmGeneral) ? __spm_zmatvec_ge_ijv : __spm_zmatvec_sy_ijv;
     }
     break;
@@ -360,6 +415,49 @@ __spm_zmatvec_args_init( __spm_zmatvec_t       *args,
     return SPM_SUCCESS;
 }
 
+static inline spm_complex64_t *
+z_spmm_build_Ctmp( const spmatrix_t      *spm,
+                   const spm_complex64_t *Cloc,
+                         spm_int_t       *ldc,
+                         int              nrhs )
+{
+    spm_complex64_t *Ctmp;
+    spm_int_t i, j, idx;
+    spm_int_t ig, dof, baseval, *loc2glob;
+
+    Ctmp = calloc(spm->gNexp * nrhs, sizeof(spm_complex64_t));
+    *ldc = spm->gNexp;
+
+    baseval = spmFindBase(spm);
+    for ( j = 0; j < nrhs; j++)
+    {
+        loc2glob = spm->loc2glob;
+        for ( i = 0; i < spm->n; i++, loc2glob++)
+        {
+            ig  = *loc2glob - baseval;
+            dof = (spm->dof > 0) ? spm->dof : spm->dofs[ig+1] - spm->dofs[ig];
+            idx = (spm->dof > 0) ? spm->dof * ig : spm->dofs[ig] - baseval;
+            memcpy( (Ctmp + j * spm->gNexp + idx),
+                    (Cloc + j * spm->nexp  +   i),
+                    dof * sizeof(spm_complex64_t) );
+        }
+    }
+    return Ctmp;
+}
+
+static inline spm_complex64_t *
+z_spmm_build_Btmp( const spmatrix_t      *spm,
+                   const spm_complex64_t *Bloc,
+                         spm_int_t       *ldb,
+                         int              nrhs )
+{
+    spm_complex64_t *Btmp;
+
+    Btmp = z_spmGatherRHS( spm, nrhs, Bloc, *ldb, -1 );
+    *ldb = spm->gNexp;
+    return Btmp;
+}
+
 /**
  *******************************************************************************
  *
@@ -459,6 +557,7 @@ spm_zspmm( spm_side_t             side,
     int rc = SPM_SUCCESS;
     spm_int_t M, N, ldx, ldy, r;
     __spm_zmatvec_t args;
+    spm_complex64_t *Ctmp, *Btmp;
 
     if ( transB != SpmNoTrans ) {
         fprintf(stderr, "transB != SpmNoTrans not supported yet in spmv computations\n");
@@ -467,18 +566,18 @@ spm_zspmm( spm_side_t             side,
     }
 
     if ( side == SpmLeft ) {
-        M = A->n;
+        M = A->nexp;
         N = K;
 
-        ldx  = ldb;
-        ldy  = ldc;
+        ldx = ldb;
+        ldy = ldc;
     }
     else {
         M = K;
-        N = A->n;
+        N = A->nexp;
 
-        ldx  = 1;
-        ldy  = 1;
+        ldx = 1;
+        ldy = 1;
     }
 
     if ( beta == 0. ) {
@@ -492,15 +591,45 @@ spm_zspmm( spm_side_t             side,
         return SPM_SUCCESS;
     }
 
+    Btmp = (spm_complex64_t*)B;
+    Ctmp = C;
+    if ( A->loc2glob != NULL ) {
+        int distByCol = spmGetDistribution(A);
+
+        if ( A->mtxtype != SpmGeneral ) {
+            Btmp = z_spmm_build_Btmp( A, B, &ldb, N );
+            Ctmp = z_spmm_build_Ctmp( A, C, &ldc, N );
+        }
+        else {
+            if( ( (transA != SpmNoTrans) && (distByCol == 1) ) ||
+                ( (transA == SpmNoTrans) && (distByCol == 0) ) ) {
+                Btmp = z_spmm_build_Btmp( A, B, &ldb, N );
+            }
+            if( ( (transA == SpmNoTrans) && (distByCol == 1) ) ||
+                ( (transA != SpmNoTrans) && (distByCol == 0) ) ) {
+                Ctmp = z_spmm_build_Ctmp( A, C, &ldc, N );
+            }
+        }
+    }
+
     __spm_zmatvec_args_init( &args, side, transA,
-                             alpha, A, B, ldb, C, ldc );
+                             alpha, A, Btmp, ldb, Ctmp, ldc );
 
     for( r=0; (r < N) && (rc == SPM_SUCCESS); r++ ) {
-        args.x = B + r * ldx;
-        args.y = C + r * ldy;
+        args.x = Btmp + r * ldx;
+        args.y = Ctmp + r * ldy;
         rc = args.loop_fct( &args );
     }
 
+    if ( Ctmp != C ) {
+        z_spmReduceRhs( A, N, Ctmp, C, ldc );
+        free( Ctmp );
+    }
+
+    if ( Btmp != B ) {
+        free( Btmp );
+    }
+
     return rc;
 }
 
@@ -559,22 +688,52 @@ spm_zspmv( spm_trans_t            trans,
 {
     int rc = SPM_SUCCESS;
     __spm_zmatvec_t args;
+    spm_complex64_t *ytmp, *xtmp;
 
     if ( beta == 0. ) {
-        memset( y, 0, A->n * sizeof(spm_complex64_t) );
+        memset( y, 0, A->nexp * sizeof(spm_complex64_t) );
     }
     else {
-        cblas_zscal( A->n, CBLAS_SADDR(beta), y, incy );
+        cblas_zscal( A->nexp, CBLAS_SADDR(beta), y, incy );
     }
 
     if ( alpha == 0. ) {
         return SPM_SUCCESS;
     }
 
-    __spm_zmatvec_args_init( &args, SpmLeft, trans,
-                             alpha, A, x, incx, y, incy );
+    xtmp = (spm_complex64_t*)x;
+    ytmp = y;
+    if ( A->loc2glob != NULL ){
+        int distByCol = spmGetDistribution(A);
 
+        if ( A->mtxtype != SpmGeneral ) {
+            xtmp = z_spmm_build_Btmp( A, x, &incx, 1 );
+            ytmp = z_spmm_build_Ctmp( A, y, &incy, 1 );
+        }
+        else {
+            if( ( (trans != SpmNoTrans) && (distByCol == 1) ) ||
+                ( (trans == SpmNoTrans) && (distByCol == 0) ) ) {
+                xtmp = z_spmm_build_Btmp( A, x, &incx, 1 );
+            }
+            if( ( (trans == SpmNoTrans) && (distByCol == 1) ) ||
+                ( (trans != SpmNoTrans) && (distByCol == 0) ) ) {
+                ytmp = z_spmm_build_Ctmp( A, y, &incy, 1 );
+            }
+        }
+    }
+
+    __spm_zmatvec_args_init( &args, SpmLeft, trans,
+                             alpha, A, xtmp, incx, ytmp, incy );
     rc = args.loop_fct( &args );
 
+    if ( ytmp != y ) {
+        z_spmReduceRhs( A, 1, ytmp, y, incy );
+        free(ytmp);
+    }
+
+    if ( xtmp != x ) {
+        free( xtmp );
+    }
+
     return rc;
 }
diff --git a/src/z_spm_print.c b/src/z_spm_print.c
index 7ba3c73e..a462f0ac 100644
--- a/src/z_spm_print.c
+++ b/src/z_spm_print.c
@@ -460,15 +460,15 @@ z_spmPrint( FILE *f, const spmatrix_t *spm )
  *******************************************************************************/
 void
 z_spmPrintRHS( FILE *f, const spmatrix_t *spm,
-               int n, const void *x, spm_int_t ldx )
+               int nrhs, const void *x, spm_int_t ldx )
 {
     const spm_complex64_t *xptr = (const spm_complex64_t *)x;
     spm_int_t i, j, ig, baseval;
 
     baseval = spmFindBase( spm );
 
-    for( j=0; j<n; j++) {
-        for( i=0; i<spm->n; i++, xptr++ ) {
+    for( j=0; j<nrhs; j++) {
+        for( i=0; i < spm->nexp; i++, xptr++ ) {
             ig = (spm->loc2glob == NULL) ? i : spm->loc2glob[i] - baseval;
 
             z_spmPrintElt( f, ig, j, *xptr );
diff --git a/src/z_spm_reduce_rhs.c b/src/z_spm_reduce_rhs.c
index 69e96fb3..f2a7f964 100644
--- a/src/z_spm_reduce_rhs.c
+++ b/src/z_spm_reduce_rhs.c
@@ -9,8 +9,7 @@
  *
  * @version 1.0.0
  * @author Delarue Tony
- * @author Faverge Mathieu
- * @date 2020-06-11
+ * @date 2020-03-08
  *
  * @precisions normal z -> c d s
  *
@@ -21,135 +20,63 @@
 /**
  *******************************************************************************
  *
- * @brief Reduce all the global C coefficients and store the good ones in local
+ * @brief Reduce all the global coefficients  of a rhs and store the local ones
  *
  *******************************************************************************
  *
  * @param[in] spm
  *          The sparse matrix spm
  *
- * @param[inout] glob
- *          The global vector to reduce
+ * @param[in] nrhs
+ *          Number of rhs vectors.
  *
- * @param[inout] loc
- *          Local vector
+ * @param[inout] bglob
+ *          The global rhs vector to reduce.
+ *
+ * @param[inout] bloc
+ *          Local rhs vector.
+ *
+ * @param[inout] ldb
+ *          Leading dimension of the global b  vector.
  *
  *******************************************************************************/
-spm_complex64_t *
-z_spmReduceRHS( const spmatrix_t      *spm,
-                int                    nrhs,
-                const spm_complex64_t *x,
-                spm_int_t              ldx,
-                int                    root )
+void
+z_spmReduceRhs( const spmatrix_t      *spm,
+                      int              nrhs,
+                      spm_complex64_t *bglob,
+                      spm_complex64_t *b,
+                      spm_int_t        ldb )
 {
-    spm_complex64_t *outptr, *out = NULL;
-    int              c;
-    int              n, nmax = 0;
-    int              nexp, nexpmax = 0;
-    spm_complex64_t *tmp_out, *current_out;
-    spm_int_t       *tmp_l2g, *current_l2g;
-    spm_int_t        current_n, current_nexp;
-    spm_int_t        i, j, k, ig, dofi, row;
-    int              local =  ( ( root == -1 ) || (root == spm->clustnum) );
-    spm_int_t        baseval = spmFindBase( spm );
-
-    /* We do not handle cases where ldx is different from spm->n */
-    assert( ldx == spm->nexp );
+#if defined(SPM_WITH_MPI)
+    spm_int_t        i, j, k;
+    spm_int_t        ig, dofi, row, baseval;
+    spm_int_t       *loc2glob;
+    spm_complex64_t *rhs = b;
 
     if ( spm->loc2glob == NULL ) {
-        if ( local ) {
-            out = malloc( spm->gNexp * nrhs * sizeof( spm_complex64_t ) );
-            memcpy( out, x, spm->gNexp * nrhs * sizeof( spm_complex64_t ) );
-        }
-        return out;
+        return;
     }
 
-#if !defined(SPM_WITH_MPI)
-    assert( 0 );
-#else
-    n    = spm->n;
-    nexp = spm->nexp;
-    if ( root == -1 ) {
-        MPI_Allreduce( &n,    &nmax,    1, MPI_INT, MPI_MAX, spm->comm );
-        MPI_Allreduce( &nexp, &nexpmax, 1, MPI_INT, MPI_MAX, spm->comm );
-    }
-    else {
-        MPI_Reduce( &n,    &nmax,    1, MPI_INT, MPI_MAX, root, spm->comm );
-        MPI_Reduce( &nexp, &nexpmax, 1, MPI_INT, MPI_MAX, root, spm->comm );
-    }
-
-    if ( local ) {
-        out     = malloc( spm->gNexp * nrhs * sizeof( spm_complex64_t ) );
-        tmp_out = malloc( nexpmax    * nrhs * sizeof( spm_complex64_t ) );
-        tmp_l2g = malloc( nmax              * sizeof( spm_int_t )       );
-    }
-
-    for( c=0; c<spm->clustnbr; c++ ) {
-        MPI_Status status;
-
-        if ( c == spm->clustnum ) {
-            current_n    = spm->n;
-            current_nexp = spm->nexp;
-            current_l2g  = spm->loc2glob;
-            current_out  = (spm_complex64_t*)x;
-        }
-        else {
-            current_n    = 0;
-            current_nexp = 0;
-            current_l2g  = tmp_l2g;
-            current_out  = tmp_out;
-        }
+    MPI_Allreduce( MPI_IN_PLACE, bglob, ldb * nrhs, SPM_MPI_COMPLEX64, MPI_SUM, spm->comm );
 
-        if ( root == -1 ) {
-            MPI_Bcast( &current_n,    1, SPM_MPI_INT, c, spm->comm );
-            MPI_Bcast( &current_nexp, 1, SPM_MPI_INT, c, spm->comm );
-            if ( current_n > 0 ) {
-                MPI_Bcast( current_l2g, current_n,           SPM_MPI_INT,       c, spm->comm );
-                MPI_Bcast( current_out, current_nexp * nrhs, SPM_MPI_COMPLEX64, c, spm->comm );
+    baseval  = spmFindBase( spm );
+    loc2glob = spm->loc2glob;
+    for( i=0; i<spm->n; i++, loc2glob++ ) {
+        ig   = *loc2glob - baseval;
+        dofi = ( spm->dof > 0 ) ? spm->dof : spm->dofs[ig+1] - spm->dofs[ig];
+        row  = ( spm->dof > 0 ) ? spm->dof * ig : spm->dofs[ig] - baseval;
+        for( j=0; j<nrhs; j++ ) {
+            for( k=0; k<dofi; k++ ) {
+                rhs[ j * spm->nexp + k ] = bglob[ row + j * ldb + k ];
             }
         }
-        else if ( root != c ) { /* No communication if c == root */
-            if ( c == spm->clustnum ) {
-                MPI_Send( &current_n,    1, SPM_MPI_INT, root, 0, spm->comm );
-                MPI_Send( &current_nexp, 1, SPM_MPI_INT, root, 1, spm->comm );
-                if ( current_n > 0 ) {
-                    MPI_Send( current_l2g, current_n,           SPM_MPI_INT,       root, 2, spm->comm );
-                    MPI_Send( current_out, current_nexp * nrhs, SPM_MPI_COMPLEX64, root, 3, spm->comm );
-                }
-            }
-            if ( root == spm->clustnum ) {
-                MPI_Recv( &current_n,    1, SPM_MPI_INT, c, 0, spm->comm, &status );
-                MPI_Recv( &current_nexp, 1, SPM_MPI_INT, c, 1, spm->comm, &status );
-                if ( current_n > 0 ) {
-                    MPI_Recv( current_l2g, current_n,           SPM_MPI_INT,       c, 2, spm->comm, &status );
-                    MPI_Recv( current_out, current_nexp * nrhs, SPM_MPI_COMPLEX64, c, 3, spm->comm, &status );
-                }
-            }
-        }
-
-        if ( !local ) {
-            continue;
-        }
-
-        outptr = out;
-        for( i=0; i<current_n; i++, current_l2g++ ) {
-            ig   = *current_l2g - baseval;
-            dofi = ( spm->dof > 0 ) ? spm->dof : spm->dofs[ig+1] - spm->dofs[ig];
-            row  = ( spm->dof > 0 ) ? spm->dof * ig : spm->dofs[ig] - baseval;
-            for( j=0; j<nrhs; j++ ) {
-                for( k=0; k<dofi; k++ ) {
-                    outptr[ row + j * spm->gNexp + k ] = current_out[ j * current_nexp + k ];
-                }
-            }
-            current_out += dofi;
-        }
-    }
-
-    if ( local ) {
-        free( tmp_out );
-        free( tmp_l2g );
+        rhs += dofi;
     }
-
+#else
+    (void)spm;
+    (void)nrhs;
+    (void)bglob;
+    (void)b;
+    (void)ldb;
 #endif
-    return out;
-}
+}
\ No newline at end of file
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 311bff7f..815e11c4 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -53,6 +53,7 @@ if ( SPM_WITH_MPI )
     spm_scatter_gather_tests.c
     spm_dist_norm_tests.c
     spm_dist_genrhs_tests.c
+    spm_dist_matvec_tests.c
     )
 endif()
 
@@ -70,7 +71,9 @@ set( SPM_DOF_TESTS
 set( SPM_MPI_TESTS
   spm_scatter_gather_tests
   spm_dist_norm_tests
-  spm_dist_genrhs_tests )
+  spm_dist_genrhs_tests
+  spm_dist_matvec_tests
+  )
 
 # List of run types
 set( RUNTYPE shm )
diff --git a/tests/spm_compare.c b/tests/spm_compare.c
index d16adad3..123c6b07 100644
--- a/tests/spm_compare.c
+++ b/tests/spm_compare.c
@@ -138,7 +138,9 @@ int
 spmCompare( spmatrix_t *spm1,
             spmatrix_t *spm2 )
 {
-    spm_int_t i, base1, base2;
+    spm_int_t i;
+    spm_int_t base1 = -1;
+    spm_int_t base2 = -2;
     int rc = 0;
 
     /* I don't have one of the matrices, I jump to the findbase computations */
diff --git a/tests/spm_dist_genrhs_tests.c b/tests/spm_dist_genrhs_tests.c
index a0fb79f6..26d0c46e 100644
--- a/tests/spm_dist_genrhs_tests.c
+++ b/tests/spm_dist_genrhs_tests.c
@@ -149,7 +149,7 @@ int main (int argc, char **argv)
                         printf( "SUCCESS\n" );
                     }
                     else {
-                        printf( "FAILURE\n" );
+                        printf( "FAILED\n" );
                     }
                 }
                 err = (rc != 0) ? err+1 : err;
diff --git a/tests/spm_dist_matvec_tests.c b/tests/spm_dist_matvec_tests.c
new file mode 100644
index 00000000..7d2e270b
--- /dev/null
+++ b/tests/spm_dist_matvec_tests.c
@@ -0,0 +1,204 @@
+/**
+ *
+ * @file spm_dist_matvec_tests.c
+ *
+ * Tests and validate the spm_matvec routines.
+ *
+ * @copyright 2015-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ * @version 6.0.0
+ * @author Mathieu Faverge
+ * @author Theophile Terraz
+ * @author Tony Delarue
+ * @date 2020-05-19
+ *
+ **/
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include <time.h>
+#include "spm_tests.h"
+
+#if !defined(SPM_WITH_MPI)
+#error "This test should not be compiled in non distributed version"
+#endif
+
+#define PRINT_RES(_ret_)                        \
+    if(_ret_) {                                 \
+        printf("FAILED(%d)\n", _ret_);          \
+        err++;                                  \
+    }                                           \
+    else {                                      \
+        printf("SUCCESS\n");                    \
+    }
+
+int main (int argc, char **argv)
+{
+    char         *filename;
+    spmatrix_t    original, *origdist, *spm;
+    spm_driver_t  driver;
+    int clustnbr = 1;
+    int clustnum = 0;
+    spm_mtxtype_t mtxtype;
+    spm_fmttype_t fmttype;
+    spm_trans_t   trans;
+    int baseval, distbycol;
+    int rc = SPM_SUCCESS;
+    int err = 0;
+    int dof, dofmax = 4;
+    int to_free = 0;
+
+    MPI_Init( &argc, &argv );
+
+    /**
+     * Get options from command line
+     */
+    spmGetOptions( argc, argv,
+                   &driver, &filename );
+
+    rc = spmReadDriver( driver, filename, &original );
+    free(filename);
+
+    if ( rc != SPM_SUCCESS ) {
+        fprintf(stderr, "ERROR: Could not read the file, stop the test !!!\n");
+        return EXIT_FAILURE;
+    }
+
+    MPI_Comm_size( MPI_COMM_WORLD, &clustnbr );
+    MPI_Comm_rank( MPI_COMM_WORLD, &clustnum );
+
+    if ( original.flttype == SpmPattern ) {
+        spmGenFakeValues( &original );
+    }
+
+    spmPrintInfo( &original, stdout );
+
+    if ( clustnum == 0 ) {
+        printf(" -- SPM Matrix-Vector Test --\n");
+    }
+
+    for( fmttype=SpmCSC; fmttype<=SpmIJV; fmttype++ ) {
+
+        distbycol = (fmttype == SpmCSR) ? 0 : 1;
+        if ( spmConvert( fmttype, &original ) != SPM_SUCCESS ) {
+            fprintf( stderr, "Issue to convert to %s format\n", fmtnames[fmttype] );
+            continue;
+        }
+
+        origdist = spmScatter( &original, -1, NULL, distbycol, -1, MPI_COMM_WORLD );
+        if ( origdist == NULL ) {
+            fprintf( stderr, "Failed to scatter the spm\n" );
+            continue;
+        }
+
+        for( dof=-1; dof<2; dof++ )
+        {
+            if ( dof >= 0 ) {
+                spm = spmDofExtend( origdist, dof, dofmax );
+                to_free = 1;
+            }
+            else {
+                spm = malloc(sizeof(spmatrix_t));
+                memcpy( spm, origdist, sizeof(spmatrix_t) );
+                to_free = 0;
+            }
+
+            if ( spm == NULL ) {
+                fprintf( stderr, "Issue to extend the matrix\n" );
+                continue;
+            }
+
+            for( baseval=0; baseval<2; baseval++ )
+            {
+                spmBase( spm, baseval );
+
+                for( mtxtype=SpmGeneral; mtxtype<=SpmHermitian; mtxtype++ )
+                {
+                    if ( (mtxtype == SpmHermitian) &&
+                         ( ((original.flttype != SpmComplex64) &&
+                            (original.flttype != SpmComplex32)) ||
+                           (original.mtxtype != SpmHermitian) ) )
+                    {
+                        continue;
+                    }
+
+                    if ( (mtxtype != SpmGeneral) &&
+                         (original.mtxtype == SpmGeneral) )
+                    {
+                        continue;
+                    }
+
+                    spm->mtxtype = mtxtype;
+
+                    for( trans=SpmNoTrans; trans<=SpmConjTrans; trans++ )
+                    {
+                        if ( (trans == SpmConjTrans) &&
+                             ((spm->flttype != SpmComplex64) && (spm->flttype != SpmComplex32)))
+                        {
+                            continue;
+                        }
+
+                        if(clustnum == 0) {
+                            printf( " Case: %s / %s / dof(%8s) / base(%d) / %10s / %9s : ",
+                                    fltnames[spm->flttype],
+                                    fmtnames[spm->fmttype],
+                                    dofname[dof+1],
+                                    baseval,
+                                    mtxnames[mtxtype - SpmGeneral],
+                                    transnames[trans - SpmNoTrans] );
+                        }
+
+                        switch( spm->flttype ){
+                        case SpmComplex64:
+                            rc = z_spm_dist_matvec_check( baseval, trans, spm );
+                            break;
+
+                        case SpmComplex32:
+                            rc = c_spm_dist_matvec_check( baseval, trans, spm );
+                            break;
+
+                        case SpmFloat:
+                            rc = s_spm_dist_matvec_check( baseval, trans, spm );
+                            break;
+
+                        case SpmDouble:
+                        default:
+                            rc = d_spm_dist_matvec_check( baseval, trans, spm );
+                        }
+                        err = (rc != 0) ? err+1 : err;
+                    }
+                }
+            }
+
+            if ( spm != origdist ) {
+                if( to_free ){
+                    spmExit( spm  );
+                }
+                free( spm );
+            }
+        }
+
+        spmExit( origdist );
+        free( origdist );
+    }
+
+    spmExit(&original);
+
+    MPI_Finalize();
+
+    if( err == 0 ) {
+        if(clustnum == 0) {
+            printf(" -- All tests PASSED --\n");
+        }
+        return EXIT_SUCCESS;
+    }
+    else
+    {
+        printf(" -- %d tests FAILED --\n", err);
+        return EXIT_FAILURE;
+    }
+}
diff --git a/tests/spm_dist_norm_tests.c b/tests/spm_dist_norm_tests.c
index 8c7c7dd3..1369f92e 100644
--- a/tests/spm_dist_norm_tests.c
+++ b/tests/spm_dist_norm_tests.c
@@ -67,7 +67,9 @@ int main (int argc, char **argv)
 
     spmPrintInfo( &original, stdout );
 
-    printf(" -- SPM Norms Test --\n");
+    if ( clustnum == 0 ) {
+        printf(" -- SPM Norms Test --\n");
+    }
 
     for( fmttype=SpmCSC; fmttype<=SpmIJV; fmttype++ ) {
 
@@ -128,7 +130,7 @@ int main (int argc, char **argv)
                         }
                         spmdist->mtxtype = mtxtype;
 
-                        if(clustnum == 0) {
+                        if ( clustnum == 0 ) {
                             printf( " Case: %s / %s / %d / %s / %d\n",
                                     fltnames[spmdist->flttype],
                                     fmtnames[spmdist->fmttype], baseval,
diff --git a/tests/spm_tests.h b/tests/spm_tests.h
index 7c6a2e49..85dd17ab 100644
--- a/tests/spm_tests.h
+++ b/tests/spm_tests.h
@@ -85,6 +85,7 @@ int  z_spm_norm_check( const spmatrix_t *spm );
 int  z_spm_dist_norm_check( const spmatrix_t *spm, const spmatrix_t *spmdist );
 int  z_spm_dist_genrhs_check( const spmatrix_t *spm, spm_int_t nrhs,
                               const spm_complex64_t *bloc, const spm_complex64_t *bdst, int root );
+int  z_spm_dist_matvec_check( spm_int_t baseval, spm_trans_t trans, const spmatrix_t *spm );
 
 void c_spm_print_check( char *filename, const spmatrix_t *spm );
 int  c_spm_matvec_check( spm_trans_t trans, const spmatrix_t *spm );
@@ -92,6 +93,7 @@ int  c_spm_norm_check( const spmatrix_t *spm );
 int  c_spm_dist_norm_check( const spmatrix_t *spm, const spmatrix_t *spmdist );
 int  c_spm_dist_genrhs_check( const spmatrix_t *spm, spm_int_t nrhs,
                               const spm_complex32_t *bloc, const spm_complex32_t *bdst, int root );
+int  c_spm_dist_matvec_check( spm_int_t baseval, spm_trans_t trans, const spmatrix_t *spm );
 
 void d_spm_print_check( char *filename, const spmatrix_t *spm );
 int  d_spm_matvec_check( spm_trans_t trans, const spmatrix_t *spm );
@@ -99,6 +101,7 @@ int  d_spm_norm_check( const spmatrix_t *spm );
 int  d_spm_dist_norm_check( const spmatrix_t *spm, const spmatrix_t *spmdist );
 int  d_spm_dist_genrhs_check( const spmatrix_t *spm, spm_int_t nrhs,
                               const double *bloc, const double *bdst, int root );
+int  d_spm_dist_matvec_check( spm_int_t baseval, spm_trans_t trans, const spmatrix_t *spm );
 
 void s_spm_print_check( char *filename, const spmatrix_t *spm );
 int  s_spm_matvec_check( spm_trans_t trans, const spmatrix_t *spm );
@@ -106,6 +109,7 @@ int  s_spm_norm_check( const spmatrix_t *spm );
 int  s_spm_dist_norm_check( const spmatrix_t *spm, const spmatrix_t *spmdist );
 int  s_spm_dist_genrhs_check( const spmatrix_t *spm, spm_int_t nrhs,
                               const float *bloc, const float *bdst, int root );
+int  s_spm_dist_matvec_check( spm_int_t baseval, spm_trans_t trans, const spmatrix_t *spm );
 
 void p_spm_print_check( char *filename, const spmatrix_t *spm );
 
diff --git a/tests/z_spm_tests.c b/tests/z_spm_tests.c
index dcbfebc3..c53a0900 100644
--- a/tests/z_spm_tests.c
+++ b/tests/z_spm_tests.c
@@ -353,7 +353,7 @@ z_spm_dist_genrhs_check( const spmatrix_t      *spm,
     norm = LAPACKE_zlange( LAPACK_COL_MAJOR, 'M', M, nrhs, bloc, M );
 
     /* Let's gather the distributed RHS */
-    tmp = z_spmReduceRHS( spm, nrhs, bdst, spm->nexp, root );
+    tmp = z_spmGatherRHS( spm, nrhs, bdst, spm->nexp, root );
 
     rc = 0;
     if ( tmp != NULL ) {
@@ -385,4 +385,116 @@ z_spm_dist_genrhs_check( const spmatrix_t      *spm,
 
     return ret;
 }
+
+/*------------------------------------------------------------------------
+ *  Check the accuracy of the solution
+ */
+int
+z_spm_dist_matvec_check( spm_int_t baseval, spm_trans_t trans, const spmatrix_t *spm )
+{
+    spmatrix_t            *spmloc;
+    unsigned long long int seed  = 35469;
+    unsigned long long int seedx = 24632;
+    unsigned long long int seedy = 73246;
+    spm_complex64_t       *x, *y, *yd;
+    /*
+     * Alpha and beta are complex for cblas, but only the real part is used for
+     * matvec/matmat subroutines
+     */
+    double dalpha = 0.;
+    double dbeta  = 0.;
+
+    double    Anorm, Xnorm, Ynorm, Ylnorm, Ydnorm, Rnorm;
+    double    eps, result;
+    int       rc, info_solution, start = 1;
+    spm_int_t ldl, ldd, nrhs = 1;
+
+    eps = LAPACKE_dlamch_work('e');
+
+    core_dplrnt( 1, 1, &dalpha, 1, 1, start, 0, seed ); start++;
+    core_dplrnt( 1, 1, &dbeta,  1, 1, start, 0, seed ); start++;
+
+    ldd = spm->nexp;
+    ldl = spm->gNexp;
+
+    /* Generate random x and y in distributed */
+    x = (spm_complex64_t*)malloc( ldd * nrhs * sizeof(spm_complex64_t) );
+    z_spmRhsGenRndDist( spm, baseval, 1., nrhs, x, spm->nexp, 1, seedx );
+
+    y = (spm_complex64_t*)malloc( ldd * nrhs * sizeof(spm_complex64_t) );
+    z_spmRhsGenRndDist( spm, baseval, 1., nrhs, y, spm->nexp, 1, seedy );
+
+    /* Compute the distributed sparse matrix-vector product */
+    rc = spmMatMat( trans, nrhs, dalpha, spm, x, ldd, dbeta, y, ldd );
+    if ( rc != SPM_SUCCESS ) {
+        info_solution = 1;
+        goto end;
+    }
+
+    /* Compute matrix norm and gather info */
+    Anorm  = spmNorm( SpmInfNorm, spm );
+    spmloc = spmGather( spm, 0 );
+    yd     = z_spmGatherRHS( spm, nrhs, y, ldd, 0 );
+
+    /* Compute the local sparse matrix-vector product */
+    if ( spm->clustnum == 0 ) {
+        spm_complex64_t *xl, *yl;
+
+       /* Generate xl and yl as x and y locally on 0 */
+        xl = (spm_complex64_t*)malloc( ldl * nrhs * sizeof(spm_complex64_t) );
+        z_spmRhsGenRndShm( spmloc, baseval, 1., nrhs, xl, spm->nexp, 1, seedx );
+
+        yl = (spm_complex64_t*)malloc( ldl * nrhs * sizeof(spm_complex64_t) );
+        z_spmRhsGenRndShm( spmloc, baseval, 1., nrhs, yl, spm->nexp, 1, seedy );
+
+        /* Compute the original norms */
+        Xnorm = LAPACKE_zlange( LAPACK_COL_MAJOR, 'I', ldl, nrhs, xl, ldl );
+        Ynorm = LAPACKE_zlange( LAPACK_COL_MAJOR, 'I', ldl, nrhs, yl, ldl );
+
+        rc = spmMatMat( trans, nrhs, dalpha, spmloc, xl, ldl, dbeta, yl, ldl );
+        if ( rc != SPM_SUCCESS ) {
+            info_solution = 1;
+            goto end;
+        }
+
+        /* Compute the final norm in shared memory */
+        Ylnorm = LAPACKE_zlange( LAPACK_COL_MAJOR, 'I', ldl, nrhs, yl, ldl );
+        Ydnorm = LAPACKE_zlange( LAPACK_COL_MAJOR, 'I', ldl, nrhs, yd, ldl );
+
+        core_zgeadd( SpmNoTrans, ldl, nrhs,
+                     -1., yl, ldl,
+                      1., yd, ldl );
+        Rnorm = LAPACKE_zlange( LAPACK_COL_MAJOR, 'M', ldl, nrhs, yd, ldl );
+
+
+        result = Rnorm / ( (Anorm + Xnorm + Ynorm) * spm->gNexp * eps );
+        if (  isinf(Ydnorm) || isinf(Ylnorm) ||
+              isnan(result) || isinf(result) || (result > 10.0) )
+        {
+            info_solution = 1;
+            printf( "FAILED !\n" );
+                    /* "  ||A||_inf = %e, ||x||_inf = %e, ||y||_inf = %e\n" */
+                    /* "  ||shm(a*A*x+b*y)||_inf = %e, ||dist(a*A*x+b*y)||_inf = %e, ||R||_m = %e\n", */
+                    /* Anorm, Xnorm, Ynorm, Ylnorm, Ydnorm, Rnorm ); */
+        }
+        else {
+            info_solution = 0;
+            printf("SUCCESS !\n");
+        }
+
+        free( xl );
+        free( yl );
+        free( yd );
+        spmExit( spmloc );
+        free( spmloc );
+    }
+
+    MPI_Bcast( &info_solution, 1, MPI_INT, 0, spm->comm );
+
+  end:
+    free( x );
+    free( y );
+
+    return info_solution;
+}
 #endif
-- 
GitLab