diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3f897fcdd6718df2df7fb1caef63200e90c20644..fd7753a578be36e5de3e01ee56c77e9759605d27 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -196,6 +196,7 @@ set(SOURCES
   src/z_spm_dof_extend.c
   src/z_spm_norm.c
   src/z_spm_scal.c
+  src/z_spm_gather_rhs.c
   src/z_spm_reduce_rhs.c
   src/z_spm_convert_to_csc.c
   src/z_spm_convert_to_csr.c
diff --git a/include/spm.h b/include/spm.h
index 0f46e18fe5ad5b21c348a7b901521eacc578eb53..ed42fa9f644dbc1beb2a09a63c03262e7d7790c9 100644
--- a/include/spm.h
+++ b/include/spm.h
@@ -207,7 +207,7 @@ int spmParseLaplacianInfo( const char *    filename,
  */
 void *      spm2Dense( const spmatrix_t *spm );
 void        spmPrint( const spmatrix_t *spm, FILE *f );
-void        spmPrintRHS( const spmatrix_t *spm, int n, const void *x, spm_int_t ldx, FILE *stream );
+void        spmPrintRHS( const spmatrix_t *spm, int nrhs, const void *x, spm_int_t ldx, FILE *stream );
 void        spmPrintInfo( const spmatrix_t *spm, FILE *f );
 void        spmExpand( const spmatrix_t *spm_in, spmatrix_t *spm_out );
 spmatrix_t *spmDofExtend( const spmatrix_t *spm, const int type, const int dof );
diff --git a/include/spm/const.h b/include/spm/const.h
index 01f9de534e2cc089ad918008cc6f264b52115e4b..16815f39fef98ac83f366cd84d19d300b6b69897 100644
--- a/include/spm/const.h
+++ b/include/spm/const.h
@@ -32,6 +32,12 @@ BEGIN_C_DECLS
 #define CBLAS_SADDR( a_ ) (&(a_))
 #endif
 
+/**
+ * @brief Distribution of the matrix storage
+ */
+#define SpmDistByColumn (0x1 << 0) /**< Storage in column distributed */
+#define SpmDistByRow    (0x1 << 1) /**< Storage in row distributed    */
+
 /**
  * @brief Verbose modes
  */
diff --git a/src/common.h b/src/common.h
index 4abadf79404831a5b937fe2bdc12b88c20345196..bce115b64f362c01230a373e47fc5caab30c30f3 100644
--- a/src/common.h
+++ b/src/common.h
@@ -54,6 +54,7 @@ spm_get_datatype( const spmatrix_t *spm )
 #endif
 
 spm_int_t *spm_get_glob2loc( spmatrix_t *spm, spm_int_t baseval );
+int        spm_get_distribution( const spmatrix_t *spm );
 
 /********************************************************************
  * Conjuguate/Id functions
diff --git a/src/spm.c b/src/spm.c
index b492e714e77512bd4855c37cc02ef549288872b8..b42538caa3fe60be2fe64bfe19f4876f18cada57 100644
--- a/src/spm.c
+++ b/src/spm.c
@@ -848,6 +848,10 @@ spmCopy( const spmatrix_t *spm )
         newspm->loc2glob = (spm_int_t*)malloc( spm->n * sizeof(spm_int_t) );
         memcpy( newspm->loc2glob, spm->loc2glob, spm->n * sizeof(spm_int_t) );
     }
+    if(spm->glob2loc != NULL) {
+        newspm->glob2loc = (spm_int_t*)malloc( spm->gN * sizeof(spm_int_t) );
+        memcpy( newspm->glob2loc, spm->glob2loc, spm->gN * sizeof(spm_int_t) );
+    }
     if(spm->dofs != NULL) {
         newspm->dofs = (spm_int_t*)malloc( dofsize * sizeof(spm_int_t) );
         memcpy( newspm->dofs, spm->dofs, dofsize * sizeof(spm_int_t) );
@@ -1018,7 +1022,7 @@ spmPrint( const spmatrix_t *spm,
  *******************************************************************************/
 void
 spmPrintRHS( const spmatrix_t *spm,
-             int               n,
+             int               nrhs,
              const void       *x,
              spm_int_t         ldx,
              FILE             *stream )
@@ -1033,16 +1037,16 @@ spmPrintRHS( const spmatrix_t *spm,
         /* Not handled for now */
         break;
     case SpmFloat:
-        s_spmPrintRHS( stream, spm, n, x, ldx );
+        s_spmPrintRHS( stream, spm, nrhs, x, ldx );
         break;
     case SpmComplex32:
-        c_spmPrintRHS( stream, spm, n, x, ldx );
+        c_spmPrintRHS( stream, spm, nrhs, x, ldx );
         break;
     case SpmComplex64:
-        z_spmPrintRHS( stream, spm, n, x, ldx );
+        z_spmPrintRHS( stream, spm, nrhs, x, ldx );
         break;
     case SpmDouble:
-        d_spmPrintRHS( stream, spm, n, x, ldx );
+        d_spmPrintRHS( stream, spm, nrhs, x, ldx );
     }
 
     return;
@@ -1240,10 +1244,6 @@ spmMatMat(       spm_trans_t trans,
     spmatrix_t *espm = (spmatrix_t*)A;
     int rc = SPM_SUCCESS;
 
-    if ( A->dof != 1 ) {
-        espm = malloc( sizeof(spmatrix_t) );
-        spmExpand( A, espm );
-    }
     switch (A->flttype) {
     case SpmFloat:
         rc = spm_sspmm( SpmLeft, trans, SpmNoTrans, n, alpha, espm, B, ldb, beta, C, ldc );
@@ -1614,3 +1614,76 @@ spm_get_glob2loc( spmatrix_t *spm,
     (void) baseval;
     return spm->glob2loc;
 }
+
+/**
+ *******************************************************************************
+ *
+ * @ingroup spm_mpi_dev
+ *
+ * @brief Search the distribution pattern used in the spm structure.
+ *
+ *******************************************************************************
+ *
+ * @param[in] spm
+ *          The sparse matrix structure.
+ *
+ ********************************************************************************
+ *
+ * @return  SpmDistByColumn if the distribution is column based.
+ *          SpmDistByRow if the distribution is row based.
+ *          (SpmDistByColumn|SpmDistByRow) if the matrix is not distributed.
+ *
+ *******************************************************************************/
+int
+spm_get_distribution( const spmatrix_t *spm )
+{
+    int distribution = 0;
+
+    if( (spm->loc2glob == NULL) || (spm->n == spm->gN) ) {
+        distribution = ( SpmDistByColumn | SpmDistByRow );
+    }
+    else {
+        if( spm->fmttype == SpmCSC ){
+            distribution = SpmDistByColumn;
+        }
+        else if ( spm->fmttype == SpmCSR ) {
+            distribution = SpmDistByRow;
+        }
+        else {
+            spm_int_t  i, baseval;
+            spm_int_t *colptr   = spm->colptr;
+            spm_int_t *glob2loc = spm->glob2loc;
+
+            baseval = spmFindBase( spm );
+            distribution = 1;
+            assert( glob2loc != NULL );
+            for ( i = 0; i < spm->nnz; i++, colptr++ )
+            {
+                /*
+                * If the global index is not in the local colptr
+                * -> row distribution
+                */
+                if( glob2loc[ *colptr - baseval  ] < 0 ) {
+                    distribution = SpmDistByRow;
+                    break;
+                }
+            }
+
+    #if defined(SPM_WITH_MPI)
+            {
+                int check = 0;
+                MPI_Allreduce( &distribution, &check, 1, MPI_INT,
+                               MPI_BOR, spm->comm );
+                /*
+                 * If a matrix is distributed
+                 * it cannot be distributed by row AND column
+                 */
+                assert( check != ( SpmDistByColumn | SpmDistByRow ) );
+                assert( distribution == check );
+            }
+    #endif
+        }
+    }
+    assert(distribution > 0);
+    return distribution;
+}
diff --git a/src/z_spm.h b/src/z_spm.h
index 4fbe1b06122721419c0bff646b3df1ccce6f075b..cb2089c27e16b63d466b673de7d26f851aa0f166 100644
--- a/src/z_spm.h
+++ b/src/z_spm.h
@@ -75,17 +75,28 @@ spm_int_t z_spmSymmetrize( spmatrix_t *spm );
 
 int              z_spmGenRHS(spm_rhstype_t type, int nrhs, const spmatrix_t *spm, void *x, int ldx, void *b, int ldb );
 int              z_spmCheckAxb( spm_fixdbl_t eps, int nrhs, const spmatrix_t *spm, void *x0, int ldx0, void *b, int ldb, const void *x, int ldx );
-spm_complex64_t *z_spmReduceRHS( const spmatrix_t *spm, int nrhs, const spm_complex64_t *x, spm_int_t ldx, int root );
+spm_complex64_t *z_spmGatherRHS( const spmatrix_t *spm, int nrhs, const spm_complex64_t *x, spm_int_t ldx, int root );
+void             z_spmReduceRHS( const spmatrix_t *spm, int nrhs, spm_complex64_t *bglob, spm_int_t ldbglob, spm_complex64_t *b, spm_int_t ldb );
 
 /**
  * Output routines
  */
 void z_spmDensePrint( FILE *f, spm_int_t m, spm_int_t n, const spm_complex64_t *A, spm_int_t lda );
 void z_spmPrint( FILE *f, const spmatrix_t *spm );
-void z_spmPrintRHS( FILE *f, const spmatrix_t *spm, int n, const void *x, spm_int_t ldx );
+void z_spmPrintRHS( FILE *f, const spmatrix_t *spm, int nrhs, const void *x, spm_int_t ldx );
 
 void z_spmExpand( const spmatrix_t *spm_in, spmatrix_t *spm_out );
 void z_spmDofExtend( spmatrix_t *spm );
 void z_spmScal( const double alpha, spmatrix_t *spm );
 
+/**
+ * Subroutines for random vector generation to be used in testings
+ */
+void z_spmRhsGenRndShm( const spmatrix_t *spm, spm_int_t baseval, spm_complex64_t scale,
+                        spm_int_t n, spm_complex64_t *A, spm_int_t lda,
+                        int shift, unsigned long long int seed );
+void z_spmRhsGenRndDist( const spmatrix_t *spm, spm_int_t baseval, spm_complex64_t scale,
+                         spm_int_t n, spm_complex64_t *A, spm_int_t lda,
+                         int shift, unsigned long long int seed );
+
 #endif /* _z_spm_h_ */
diff --git a/src/z_spm_convert_to_csc.c b/src/z_spm_convert_to_csc.c
index 9ea0d28eeb401afd4798df3d5af822c289949daf..d4efbc3f1cd0a16be77e683010234cf5201a03c4 100644
--- a/src/z_spm_convert_to_csc.c
+++ b/src/z_spm_convert_to_csc.c
@@ -65,30 +65,13 @@ z_spmConvertIJV2CSC( spmatrix_t *spm )
 
 #if defined(SPM_WITH_MPI)
     if ( spm->loc2glob != NULL ) {
-        /*
-         * Check if the distribution is by column or row by exploiting the fact
-         * that the array is sorted.
-         * This is not completely safe, but that avoids going through the full
-         * matrix.
-         */
         const spm_int_t *glob2loc;
-        spm_int_t m = spm->rowptr[spm->nnz-1] - spm->rowptr[0] + 1; /* This may be not correct */
-        spm_int_t n = spm->colptr[spm->nnz-1] - spm->colptr[0] + 1;
         spm_int_t jg;
-        int distribution = 0;
+        int distribution = spm_get_distribution( spm );
 
-        if ( m <= spm->n ) { /* By row */
-            distribution |= 1;
-        }
-        if ( n <= spm->n ) { /* By column */
-            distribution |= 2;
-        }
-        MPI_Allreduce( MPI_IN_PLACE, &distribution, 1, MPI_INT,
-                       MPI_BAND, spm->comm );
-
-        if ( !(distribution & 2) ) {
-            //fprintf( stderr, "spmConvert: Conversion of column distributed matrices to CSC is not yet implemented\n");
-            return SPM_ERR_NOTIMPLEMENTED;
+        if ( !(distribution & SpmDistByColumn) ) {
+            fprintf( stderr, "spmConvert: Conversion of non column distributed matrices to CSC is not yet implemented\n");
+            return SPM_ERR_BADPARAMETER;
         }
 
         /* Allocate and compute the glob2loc array */
diff --git a/src/z_spm_convert_to_csr.c b/src/z_spm_convert_to_csr.c
index 1f5f390c5d109bd6ce91217778589d6e89da8b14..3d7a247f104ccf4db72a769cc135c7cfdd663db0 100644
--- a/src/z_spm_convert_to_csr.c
+++ b/src/z_spm_convert_to_csr.c
@@ -70,29 +70,12 @@ z_spmConvertIJV2CSR( spmatrix_t *spm )
 
 #if defined(SPM_WITH_MPI)
     if ( spm->loc2glob != NULL ) {
-        /*
-         * Check if the distribution is by column or row by exploiting the fact
-         * that the array is sorted.
-         * This is not completely safe, but that avoids going through the full
-         * matrix.
-         */
         const spm_int_t *glob2loc;
-        spm_int_t m = spm->rowptr[spm->nnz-1] - spm->rowptr[0] + 1; /* This may be not correct */
-        spm_int_t n = spm->colptr[spm->nnz-1] - spm->colptr[0] + 1;
         spm_int_t ig;
-        int distribution = 0;
+        int distribution = spm_get_distribution( spm );
 
-        if ( m <= spm->n ) { /* By row */
-            distribution |= 1;
-        }
-        if ( n <= spm->n ) { /* By column */
-            distribution |= 2;
-        }
-        MPI_Allreduce( MPI_IN_PLACE, &distribution, 1, MPI_INT,
-                       MPI_BAND, spm->comm );
-
-        if ( !(distribution & 1) ) {
-            //fprintf( stderr, "spmConvert: Conversion of column distributed matrices to CSC is not yet implemented\n");
+        if ( !(distribution & SpmDistByRow) ) {
+            fprintf( stderr, "spmConvert: Conversion of non row distributed matrices to CSR is not yet implemented\n");
             return SPM_ERR_NOTIMPLEMENTED;
         }
 
diff --git a/src/z_spm_gather_rhs.c b/src/z_spm_gather_rhs.c
new file mode 100644
index 0000000000000000000000000000000000000000..ca4b44b1c12116e6d6645a2abf45fc75106a494e
--- /dev/null
+++ b/src/z_spm_gather_rhs.c
@@ -0,0 +1,166 @@
+/**
+ *
+ * @file z_spm_gather_rhs.c
+ *
+ * SParse Matrix package right hand side gather routine.
+ *
+ * @copyright 2020-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ * @version 1.0.0
+ * @author Delarue Tony
+ * @author Faverge Mathieu
+ * @date 2020-06-11
+ *
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "common.h"
+#include "z_spm.h"
+
+/**
+ *******************************************************************************
+ *
+ * @brief Gather all the global C coefficients and store the good ones in local
+ *
+ *******************************************************************************
+ *
+ * @param[in] spm
+ *          The sparse matrix spm
+ *
+ * @param[in] nrhs
+ *          Number of rhs vectors.
+ *
+ * @param[in] x
+ *          Local vector
+ *
+ * @param[in] ldx
+ *          Leading dimension of this vector
+ *
+ * @param[in] root
+ *          Clustnum whre the complete vector will be gathered.
+ *          -1 if you want to gather the data on all nodes.
+ *
+ ********************************************************************************
+ *
+ * @return The gathered right hand side vector.
+ *
+ *******************************************************************************/
+spm_complex64_t *
+z_spmGatherRHS( const spmatrix_t      *spm,
+                int                    nrhs,
+                const spm_complex64_t *x,
+                spm_int_t              ldx,
+                int                    root )
+{
+    int local = ( ( root == -1 ) || (root == spm->clustnum) );
+    spm_complex64_t *out = NULL;
+
+    /* We do not handle cases where ldx is different from spm->n */
+    assert( ldx == spm->nexp );
+
+    if ( spm->loc2glob == NULL ) {
+        if ( local ) {
+            out = malloc( spm->gNexp * nrhs * sizeof( spm_complex64_t ) );
+            memcpy( out, x, spm->gNexp * nrhs * sizeof( spm_complex64_t ) );
+        }
+        (void)ldx;
+        return out;
+    }
+
+#if defined(SPM_WITH_MPI)
+    int              c;
+    int              n, nmax = 0;
+    int              nexp, nexpmax = 0;
+    spm_complex64_t *tmp_out, *current_out, *outptr;
+    spm_int_t       *tmp_l2g, *current_l2g;
+    spm_int_t        current_n, current_nexp;
+    spm_int_t        i, j, k, ig, dofi, row;
+    spm_int_t        baseval = spmFindBase( spm );
+
+    n    = spm->n;
+    nexp = spm->nexp;
+    if ( root == -1 ) {
+        MPI_Allreduce( &n,    &nmax,    1, MPI_INT, MPI_MAX, spm->comm );
+        MPI_Allreduce( &nexp, &nexpmax, 1, MPI_INT, MPI_MAX, spm->comm );
+    }
+    else {
+        MPI_Reduce( &n,    &nmax,    1, MPI_INT, MPI_MAX, root, spm->comm );
+        MPI_Reduce( &nexp, &nexpmax, 1, MPI_INT, MPI_MAX, root, spm->comm );
+    }
+
+    if ( local ) {
+        out     = malloc( spm->gNexp * nrhs * sizeof( spm_complex64_t ) );
+        tmp_out = malloc( nexpmax    * nrhs * sizeof( spm_complex64_t ) );
+        tmp_l2g = malloc( nmax              * sizeof( spm_int_t )       );
+    }
+
+    for( c=0; c<spm->clustnbr; c++ ) {
+        MPI_Status status;
+
+        if ( c == spm->clustnum ) {
+            current_n    = spm->n;
+            current_nexp = spm->nexp;
+            current_l2g  = spm->loc2glob;
+            current_out  = (spm_complex64_t*)x;
+        }
+        else {
+            current_n    = 0;
+            current_nexp = 0;
+            current_l2g  = tmp_l2g;
+            current_out  = tmp_out;
+        }
+
+        if ( root == -1 ) {
+            MPI_Bcast( &current_n,    1, SPM_MPI_INT, c, spm->comm );
+            MPI_Bcast( &current_nexp, 1, SPM_MPI_INT, c, spm->comm );
+            if ( current_n > 0 ) {
+                MPI_Bcast( current_l2g, current_n,           SPM_MPI_INT,       c, spm->comm );
+                MPI_Bcast( current_out, current_nexp * nrhs, SPM_MPI_COMPLEX64, c, spm->comm );
+            }
+        }
+        else if ( root != c ) { /* No communication if c == root */
+            if ( c == spm->clustnum ) {
+                MPI_Send( &current_n,    1, SPM_MPI_INT, root, 0, spm->comm );
+                MPI_Send( &current_nexp, 1, SPM_MPI_INT, root, 1, spm->comm );
+                if ( current_n > 0 ) {
+                    MPI_Send( current_l2g, current_n,           SPM_MPI_INT,       root, 2, spm->comm );
+                    MPI_Send( current_out, current_nexp * nrhs, SPM_MPI_COMPLEX64, root, 3, spm->comm );
+                }
+            }
+            if ( root == spm->clustnum ) {
+                MPI_Recv( &current_n,    1, SPM_MPI_INT, c, 0, spm->comm, &status );
+                MPI_Recv( &current_nexp, 1, SPM_MPI_INT, c, 1, spm->comm, &status );
+                if ( current_n > 0 ) {
+                    MPI_Recv( current_l2g, current_n,           SPM_MPI_INT,       c, 2, spm->comm, &status );
+                    MPI_Recv( current_out, current_nexp * nrhs, SPM_MPI_COMPLEX64, c, 3, spm->comm, &status );
+                }
+            }
+        }
+
+        if ( !local ) {
+            continue;
+        }
+
+        outptr = out;
+        for( i=0; i<current_n; i++, current_l2g++ ) {
+            ig   = *current_l2g - baseval;
+            dofi = ( spm->dof > 0 ) ? spm->dof : spm->dofs[ig+1] - spm->dofs[ig];
+            row  = ( spm->dof > 0 ) ? spm->dof * ig : spm->dofs[ig] - baseval;
+            for( j=0; j<nrhs; j++ ) {
+                for( k=0; k<dofi; k++ ) {
+                    outptr[ row + j * spm->gNexp + k ] = current_out[ j * current_nexp + k ];
+                }
+            }
+            current_out += dofi;
+        }
+    }
+
+    if ( local ) {
+        free( tmp_out );
+        free( tmp_l2g );
+    }
+
+#endif
+    return out;
+}
diff --git a/src/z_spm_genrhs.c b/src/z_spm_genrhs.c
index f466b8bef72ff76945146d70a3d89cb370954f68..54f213cb71f700a12da9b7ba9d40347d71d01c36 100644
--- a/src/z_spm_genrhs.c
+++ b/src/z_spm_genrhs.c
@@ -94,7 +94,7 @@ Rnd64_jump(unsigned long long int n, unsigned long long int seed ) {
  *
  ******************************************************************************/
 static inline void
-z_updateRndVal( double                  scale,
+z_updateRndVal( spm_complex64_t         scale,
                 spm_complex64_t        *val,
                 unsigned long long int *ran )
 {
@@ -265,7 +265,7 @@ z_spmRhsGenI( const spmatrix_t *spm, spm_int_t baseval,
  *         all tiles initialized with this routine.
  *
  ******************************************************************************/
-static inline void
+void
 z_spmRhsGenRndShm( const spmatrix_t *spm, spm_int_t baseval,
                    spm_complex64_t scale,
                    spm_int_t n, spm_complex64_t *A, spm_int_t lda,
@@ -329,7 +329,7 @@ z_spmRhsGenRndShm( const spmatrix_t *spm, spm_int_t baseval,
  ******************************************************************************/
 void
 z_spmRhsGenRndDist( const spmatrix_t *spm, spm_int_t baseval,
-                    double scale,
+                    spm_complex64_t scale,
                     spm_int_t n, spm_complex64_t *A, spm_int_t lda,
                     int shift, unsigned long long int seed )
 {
@@ -368,6 +368,7 @@ z_spmRhsGenRndDist( const spmatrix_t *spm, spm_int_t baseval,
             }
         }
     }
+    (void)lda;
 }
 
 /**
diff --git a/src/z_spm_matrixvector.c b/src/z_spm_matrixvector.c
index cf6078b08780acbb26adaf8c770896aa8664fee2..32c766ea94f6014582099a72f8772aeabe5b7333 100644
--- a/src/z_spm_matrixvector.c
+++ b/src/z_spm_matrixvector.c
@@ -37,15 +37,23 @@ __fct_conj( spm_complex64_t val ) {
 }
 #endif
 
+/**
+ * @brief Store all the data necessary to do a matrix-matrix product
+ *        for all cases.
+ */
 struct __spm_zmatvec_s {
     int                    follow_x;
 
-    spm_int_t              baseval, n, nnz;
+    spm_int_t              baseval, n, nnz, gN;
 
     spm_complex64_t        alpha;
     const spm_int_t       *rowptr;
     const spm_int_t       *colptr;
     const spm_complex64_t *values;
+    const spm_int_t       *loc2glob;
+
+    spm_int_t              dof;
+    const spm_int_t       *dofs;
 
     const spm_complex64_t *x;
     spm_int_t              incx;
@@ -58,43 +66,77 @@ struct __spm_zmatvec_s {
     __loop_fct_t           loop_fct;
 };
 
-static inline int
-__spm_zmatvec_sy_csr( const __spm_zmatvec_t *args )
+/**
+ * @brief Compute the dof loop for a general block
+ */
+static inline void
+__spm_zmatvec_dof_loop(       spm_int_t        row, spm_int_t dofi,
+                              spm_int_t        col, spm_int_t dofj,
+                              spm_complex64_t *y,   spm_int_t incy,
+                        const spm_complex64_t *x,   spm_int_t incx,
+                        const spm_complex64_t *values,
+                        const __conj_fct_t     conjA_fct,
+                              spm_complex64_t  alpha )
 {
-    spm_int_t              baseval    = args->baseval;
-    spm_int_t              n          = args->n;
-    spm_complex64_t        alpha      = args->alpha;
-    const spm_int_t       *rowptr     = args->rowptr;
-    const spm_int_t       *colptr     = args->colptr;
-    const spm_complex64_t *values     = args->values;
-    const spm_complex64_t *x          = args->x;
-    spm_int_t              incx       = args->incx;
-    spm_complex64_t       *y          = args->y;
-    spm_int_t              incy       = args->incy;
-    const __conj_fct_t     conjA_fct  = args->conjA_fct;
-    const __conj_fct_t     conjAt_fct = args->conjAt_fct;
-    spm_int_t              col, row, i;
+    spm_int_t ii, jj;
 
-    for( col=0; col<n; col++, colptr++ )
+    for(jj=0; jj<dofj; jj++)
     {
-        for( i=colptr[0]; i<colptr[1]; i++, rowptr++, values++ )
+        for(ii=0; ii<dofi; ii++, values++)
         {
-            row = *rowptr - baseval;
+            y[ row + (ii * incy) ] += alpha * conjA_fct( *values ) * x[ col +(jj * incx) ];
+        }
+    }
+}
 
-            if ( row != col ) {
-                y[ row * incy ] += alpha * conjAt_fct( *values ) * x[ col * incx ];
-                y[ col * incy ] += alpha *  conjA_fct( *values ) * x[ row * incx ];
-            }
-            else {
-                y[ col * incy ] += alpha *  conjA_fct( *values ) * x[ row * incx ];
-            }
+/**
+ * @brief Compute the dof loop for a symmetric off diagonal block
+ */
+static inline void
+__spm_zmatvec_dof_loop_sy(       spm_int_t        row, spm_int_t dofi,
+                                 spm_int_t        col, spm_int_t dofj,
+                                 spm_complex64_t *y,   spm_int_t incy,
+                           const spm_complex64_t *x,   spm_int_t incx,
+                           const spm_complex64_t *values,
+                           const __conj_fct_t     conjA_fct,
+                           const __conj_fct_t     conjAt_fct,
+                                 spm_complex64_t  alpha )
+{
+    spm_int_t ii, jj;
+
+    for(jj=0; jj<dofj; jj++)
+    {
+        for(ii=0; ii<dofi; ii++, values++)
+        {
+            y[ row + (ii * incy) ] += alpha * conjA_fct( *values )  * x[ col +(jj * incx) ];
+            y[ col + (jj * incy) ] += alpha * conjAt_fct( *values ) * x[ row +(ii * incx) ];
         }
     }
-    return SPM_SUCCESS;
 }
 
+/**
+ * @brief Compute the dof loop for a symmetric CSR matrix
+ *        Allow code factorization.
+ */
+static inline void
+__spm_zmatvec_dof_loop_sy_csr(       spm_int_t        row, spm_int_t dofi,
+                                     spm_int_t        col, spm_int_t dofj,
+                                     spm_complex64_t *y,   spm_int_t incy,
+                               const spm_complex64_t *x,   spm_int_t incx,
+                               const spm_complex64_t *values,
+                               const __conj_fct_t     conjA_fct,
+                               const __conj_fct_t     conjAt_fct,
+                                     spm_complex64_t  alpha )
+{
+    return __spm_zmatvec_dof_loop_sy( row, dofi, col, dofj, y, incy, x, incx, values, conjAt_fct, conjA_fct, alpha );
+}
+
+/**
+ * @brief Compute A*x[i:, j] = y[i:, j]
+ *        for a CSX symmetric matrix
+ */
 static inline int
-__spm_zmatvec_sy_csc( const __spm_zmatvec_t *args )
+__spm_zmatvec_sy_csx( const __spm_zmatvec_t *args )
 {
     spm_int_t              baseval    = args->baseval;
     spm_int_t              n          = args->n;
@@ -102,34 +144,53 @@ __spm_zmatvec_sy_csc( const __spm_zmatvec_t *args )
     const spm_int_t       *rowptr     = args->rowptr;
     const spm_int_t       *colptr     = args->colptr;
     const spm_complex64_t *values     = args->values;
+    const spm_int_t       *loc2glob   = args->loc2glob;
+    const spm_int_t       *dofs       = args->dofs;
+    spm_int_t              dof        = args->dof;
     const spm_complex64_t *x          = args->x;
     spm_int_t              incx       = args->incx;
     spm_complex64_t       *y          = args->y;
     spm_int_t              incy       = args->incy;
     const __conj_fct_t     conjA_fct  = args->conjA_fct;
     const __conj_fct_t     conjAt_fct = args->conjAt_fct;
-    spm_int_t              col, row, i;
+    spm_int_t              row, col, dofj, dofi;
+    spm_int_t              i, ig, j, jg;
 
-    for( col=0; col<n; col++, colptr++ )
+    /* If(args->follow_x) -> CSR. We need to change exchange the conj functions in the symmetric dof loop */
+    void (*dof_loop_sy)( spm_int_t, spm_int_t, spm_int_t, spm_int_t,
+                         spm_complex64_t *, spm_int_t,
+                         const spm_complex64_t *, spm_int_t, const spm_complex64_t *,
+                         const __conj_fct_t, const __conj_fct_t, spm_complex64_t )
+                        = ( args->follow_x ) ? __spm_zmatvec_dof_loop_sy_csr : __spm_zmatvec_dof_loop_sy;
+
+    for( j=0; j<n; j++, colptr++ )
     {
-        for( i=colptr[0]; i<colptr[1]; i++, rowptr++, values++ )
+        jg   = (loc2glob == NULL) ? j : loc2glob[j] - baseval ;
+        dofj = ( dof > 0 ) ? dof      : dofs[jg+1] - dofs[jg];
+        col  = ( dof > 0 ) ? dof * jg : dofs[jg] - baseval;
+        for( i=colptr[0]; i<colptr[1]; i++, rowptr++ )
         {
-            row = *rowptr - baseval;
-
+            ig   = *rowptr - baseval;
+            dofi = ( dof > 0 ) ? dof      : dofs[ig+1] - dofs[ig];
+            row  = ( dof > 0 ) ? dof * ig : dofs[ig] - baseval;
             if ( row != col ) {
-                y[ row * incy ] += alpha *  conjA_fct( *values ) * x[ col * incx ];
-                y[ col * incy ] += alpha * conjAt_fct( *values ) * x[ row * incx ];
+                dof_loop_sy( row, dofi, col, dofj, y, incy, x, incx, values, conjA_fct, conjAt_fct, alpha );
             }
             else {
-                y[ col * incy ] += alpha *  conjA_fct( *values ) * x[ row * incx ];
+                __spm_zmatvec_dof_loop( col, dofj, row, dofi, y, incy, x, incx, values, conjA_fct, alpha );
             }
+            values += dofi*dofj;
         }
     }
     return SPM_SUCCESS;
 }
 
+/**
+ * @brief Compute A*x[i:, j] = y[i:, j]
+ *        for a CSC/CSR general matrix
+ */
 static inline int
-__spm_zmatvec_ge_csc( const __spm_zmatvec_t *args )
+__spm_zmatvec_ge_csx( const __spm_zmatvec_t *args )
 {
     spm_int_t              baseval   = args->baseval;
     spm_int_t              n         = args->n;
@@ -137,36 +198,56 @@ __spm_zmatvec_ge_csc( const __spm_zmatvec_t *args )
     const spm_int_t       *rowptr    = args->rowptr;
     const spm_int_t       *colptr    = args->colptr;
     const spm_complex64_t *values    = args->values;
+    const spm_int_t       *loc2glob  = args->loc2glob;
+    const spm_int_t       *dofs      = args->dofs;
+    spm_int_t              dof       = args->dof;
     const spm_complex64_t *x         = args->x;
     spm_int_t              incx      = args->incx;
     spm_complex64_t       *y         = args->y;
     spm_int_t              incy      = args->incy;
     const __conj_fct_t     conjA_fct = args->conjA_fct;
-    spm_int_t              col, row, i;
+    spm_int_t              row, dofj, dofi;
+    spm_int_t              i, ig, j, jg;
 
     if ( args->follow_x ) {
-        for( col=0; col<n; col++, colptr++, x+=incx )
+        for( j = 0; j < n; j++, colptr++ )
         {
-            for( i=colptr[0]; i<colptr[1]; i++, rowptr++, values++ )
+            jg   = (loc2glob == NULL) ? j : loc2glob[j] - baseval;
+            dofj = ( dof > 0 ) ?      dof : dofs[jg+1] - dofs[jg];
+            for( i=colptr[0]; i<colptr[1]; i++, rowptr++ )
             {
-                row = *rowptr - baseval;
-                y[ row * incy ] += alpha * conjA_fct( *values ) * (*x);
+                ig   = *rowptr - baseval;
+                dofi = ( dof > 0 ) ? dof      : dofs[ig+1] - dofs[ig];
+                row  = ( dof > 0 ) ? dof * ig : dofs[ig] - baseval;
+                __spm_zmatvec_dof_loop( row, dofi, 0, dofj, y, incy, x, 1, values, conjA_fct, alpha );
+                values += dofi * dofj;
             }
+            x += dofj * incx;
         }
     }
     else {
-        for( col=0; col<n; col++, colptr++, y+=incy )
+        for( j=0; j<n; j++, colptr++ )
         {
-            for( i=colptr[0]; i<colptr[1]; i++, rowptr++, values++ )
+            jg   = (loc2glob == NULL) ? j : loc2glob[j] - baseval;
+            dofj = ( dof > 0 ) ?      dof : dofs[jg+1] - dofs[jg];
+            for( i=colptr[0]; i<colptr[1]; i++, rowptr++ )
             {
-                row = *rowptr - baseval;
-                *y += alpha * conjA_fct( *values ) * x[ row * incx ];
+                ig   = *rowptr - baseval;
+                dofi = ( dof > 0 ) ? dof      : dofs[ig+1] - dofs[ig];
+                row  = ( dof > 0 ) ? dof * ig : dofs[ig] - baseval;
+                __spm_zmatvec_dof_loop( 0, dofj, row, dofi, y, 1, x, incx, values, conjA_fct, alpha );
+                values += dofi * dofj;
             }
+            y += dofj * incy;
         }
     }
     return SPM_SUCCESS;
 }
 
+/**
+ * @brief Compute A*x[i:, j] = y[i:, j]
+ *        for a IJV symmetric matrix
+ */
 static inline int
 __spm_zmatvec_sy_ijv( const __spm_zmatvec_t *args )
 {
@@ -176,30 +257,68 @@ __spm_zmatvec_sy_ijv( const __spm_zmatvec_t *args )
     const spm_int_t       *rowptr     = args->rowptr;
     const spm_int_t       *colptr     = args->colptr;
     const spm_complex64_t *values     = args->values;
+    const spm_int_t       *dofs       = args->dofs;
+    spm_int_t              dof        = args->dof;
     const spm_complex64_t *x          = args->x;
     spm_int_t              incx       = args->incx;
     spm_complex64_t       *y          = args->y;
     spm_int_t              incy       = args->incy;
     const __conj_fct_t     conjA_fct  = args->conjA_fct;
     const __conj_fct_t     conjAt_fct = args->conjAt_fct;
-    spm_int_t              col, row, i;
+    spm_int_t              row, col, dofj, dofi;
+    spm_int_t              i, ig, jg;
 
-    for( i=0; i<nnz; i++, colptr++, rowptr++, values++ )
+    for( i=0; i<nnz; i++, colptr++, rowptr++ )
     {
-        row = *rowptr - baseval;
-        col = *colptr - baseval;
+        ig = *rowptr - baseval;
+        jg = *colptr - baseval;
+
+        dofj = ( dof > 0 ) ? dof : dofs[jg+1] - dofs[jg];
+        dofi = ( dof > 0 ) ? dof : dofs[ig+1] - dofs[ig];
+
+        row = ( dof > 0 ) ? dof * ig : dofs[ig] - baseval;
+        col = ( dof > 0 ) ? dof * jg : dofs[jg] - baseval;
 
         if ( row != col ) {
-            y[ row * incy ] += alpha *  conjA_fct( *values ) * x[ col * incx ];
-            y[ col * incy ] += alpha * conjAt_fct( *values ) * x[ row * incx ];
+            __spm_zmatvec_dof_loop_sy( row, dofi, col, dofj, y, incy, x, incx, values, conjA_fct, conjAt_fct, alpha );
         }
         else {
-            y[ row * incy ] += alpha *  conjA_fct( *values ) * x[ col * incx ];
+            __spm_zmatvec_dof_loop( row, dofi, col, dofj, y, incy, x, incx, values, conjA_fct, alpha );
         }
+        values += dofi*dofj;
     }
     return SPM_SUCCESS;
 }
 
+/**
+ * @brief Build a local dofs array which corresponds
+ *        to the local numerotation of a global index for
+ *        variadic dofs.
+ */
+static inline spm_int_t *
+__spm_zmatvec_dofs_local( const spm_int_t *dofs,
+                          const spm_int_t *glob2loc,
+                          spm_int_t gN)
+{
+    spm_int_t  i, acc = 0;
+    spm_int_t *result, *resptr;
+
+    result = calloc( gN , sizeof(spm_int_t) );
+    resptr = result;
+    for ( i = 0; i < gN; i++, glob2loc++, resptr++ )
+    {
+        if( *glob2loc >= 0 ) {
+            *resptr = acc;
+            acc += dofs[i+1] - dofs[i];
+        }
+    }
+    return result;
+}
+
+/**
+ * @brief Compute A*x[i:, j] = y[i:, j]
+ *        for a IJV general matrix
+ */
 static inline int
 __spm_zmatvec_ge_ijv( const __spm_zmatvec_t *args )
 {
@@ -209,20 +328,68 @@ __spm_zmatvec_ge_ijv( const __spm_zmatvec_t *args )
     const spm_int_t       *rowptr    = args->rowptr;
     const spm_int_t       *colptr    = args->colptr;
     const spm_complex64_t *values    = args->values;
+    const spm_int_t       *glob2loc  = args->loc2glob;
+    const spm_int_t       *dofs      = args->dofs;
+    spm_int_t              dof       = args->dof;
     const spm_complex64_t *x         = args->x;
     spm_int_t              incx      = args->incx;
     spm_complex64_t       *y         = args->y;
     spm_int_t              incy      = args->incy;
     const __conj_fct_t     conjA_fct = args->conjA_fct;
-    spm_int_t              col, row, i;
+    spm_int_t              row, col, dofj, dofi;
+    spm_int_t              i, ig, jg;
 
-    for( i=0; i<nnz; i++, colptr++, rowptr++, values++ )
-    {
-        row = *rowptr - baseval;
-        col = *colptr - baseval;
+    spm_int_t *dof_local = NULL;
 
-        y[ row * incy ] += alpha * conjA_fct( *values ) * x[ col * incx ];
+    if( (dofs != NULL) && (glob2loc != NULL) ) {
+        dof_local = __spm_zmatvec_dofs_local( dofs, glob2loc, args->gN );
     }
+
+    if( args->follow_x ) {
+        for( i=0; i<nnz; i++, colptr++, rowptr++ )
+        {
+            ig = *rowptr - baseval;
+            jg = *colptr - baseval;
+
+            dofj = ( dof > 0 ) ? dof : dofs[jg+1] - dofs[jg];
+            dofi = ( dof > 0 ) ? dof : dofs[ig+1] - dofs[ig];
+
+            row  = ( dof > 0 ) ? dof * ig : dofs[ig] - baseval;
+            if (glob2loc == NULL) {
+                col = ( dof > 0 ) ? dof * jg : dofs[jg] - baseval;
+            }
+            else {
+                col = ( dof > 0 ) ? dof * glob2loc[jg] : dof_local[jg];
+            }
+            __spm_zmatvec_dof_loop( row, dofi, col, dofj, y, incy, x, incx, values, conjA_fct, alpha );
+            values += dofi*dofj;
+        }
+    }
+    else {
+        for( i=0; i<nnz; i++, colptr++, rowptr++ )
+        {
+            ig = *rowptr - baseval;
+            jg = *colptr - baseval;
+
+            dofj = ( dof > 0 ) ? dof : dofs[jg+1] - dofs[jg];
+            dofi = ( dof > 0 ) ? dof : dofs[ig+1] - dofs[ig];
+
+            col = ( dof > 0 ) ? dof * jg : dofs[jg] - baseval;
+            if ( glob2loc == NULL ) {
+                row  = ( dof > 0 ) ? dof * ig : dofs[ig] - baseval;
+            }
+            else {
+                row = ( dof > 0 ) ? dof * glob2loc[ig] : dof_local[ig];
+            }
+            __spm_zmatvec_dof_loop( row, dofi, col, dofj, y, incy, x, incx, values, conjA_fct, alpha );
+            values += dofi*dofj;
+        }
+    }
+
+    if(dof_local != NULL) {
+        free(dof_local);
+    }
+
     return SPM_SUCCESS;
 }
 
@@ -276,10 +443,14 @@ __spm_zmatvec_args_init( __spm_zmatvec_t       *args,
     args->baseval    = spmFindBase( A );
     args->n          = A->n;
     args->nnz        = A->nnz;
+    args->gN         = A->gN;
     args->alpha      = alpha;
     args->rowptr     = A->rowptr;
     args->colptr     = A->colptr;
     args->values     = A->values;
+    args->loc2glob   = A->loc2glob;
+    args->dof        = A->dof;
+    args->dofs       = A->dofs;
     args->x          = B;
     args->incx       = incx;
     args->y          = C;
@@ -312,31 +483,39 @@ __spm_zmatvec_args_init( __spm_zmatvec_t       *args,
     case SpmCSC:
     {
         /* Switch pointers and side to get the correct behaviour */
-        if ( ((side == SpmLeft)  && (transA == SpmNoTrans)) ||
-             ((side == SpmRight) && (transA != SpmNoTrans)) )
-        {
-            args->follow_x = 1;
-        }
-        else {
-            args->follow_x = 0;
+        if( A->mtxtype == SpmGeneral ) {
+            if ( ((side == SpmLeft)  && (transA == SpmNoTrans)) ||
+                 ((side == SpmRight) && (transA != SpmNoTrans)) )
+            {
+                args->follow_x = 1;
+            }
+            else {
+                args->follow_x = 0;
+            }
         }
-        args->loop_fct = (A->mtxtype == SpmGeneral) ? __spm_zmatvec_ge_csc : __spm_zmatvec_sy_csc;
+        args->loop_fct = (A->mtxtype == SpmGeneral) ? __spm_zmatvec_ge_csx : __spm_zmatvec_sy_csx;
     }
     break;
     case SpmCSR:
     {
         /* Switch pointers and side to get the correct behaviour */
-        if ( ((side == SpmLeft)  && (transA != SpmNoTrans)) ||
-             ((side == SpmRight) && (transA == SpmNoTrans)) )
-        {
-            args->follow_x = 1;
+        if( A->mtxtype == SpmGeneral ) {
+            if ( ((side == SpmLeft)  && (transA != SpmNoTrans)) ||
+                 ((side == SpmRight) && (transA == SpmNoTrans)) )
+            {
+                args->follow_x = 1;
+            }
+            else {
+                args->follow_x = 0;
+            }
         }
         else {
-            args->follow_x = 0;
+            args->follow_x = 1;
         }
+
         args->colptr = A->rowptr;
         args->rowptr = A->colptr;
-        args->loop_fct = (A->mtxtype == SpmGeneral) ? __spm_zmatvec_ge_csc : __spm_zmatvec_sy_csr;
+        args->loop_fct = (A->mtxtype == SpmGeneral) ? __spm_zmatvec_ge_csx : __spm_zmatvec_sy_csx;
     }
     break;
     case SpmIJV:
@@ -349,7 +528,11 @@ __spm_zmatvec_args_init( __spm_zmatvec_t       *args,
             args->conjAt_fct = tmp_fct;
             args->colptr = A->rowptr;
             args->rowptr = A->colptr;
+            args->follow_x = 0;
+        } else {
+            args->follow_x = 1;
         }
+        args->loc2glob = A->glob2loc;
         args->loop_fct = (A->mtxtype == SpmGeneral) ? __spm_zmatvec_ge_ijv : __spm_zmatvec_sy_ijv;
     }
     break;
@@ -360,6 +543,105 @@ __spm_zmatvec_args_init( __spm_zmatvec_t       *args,
     return SPM_SUCCESS;
 }
 
+/**
+ *******************************************************************************
+ *
+ * @ingroup spm_dev_matvec
+ *
+ * @brief Build a global C RHS, set to 0 for remote datas.
+ *
+ *******************************************************************************
+ *
+ * @param[in] spm
+ *          The pointer to the sparse matrix structure.
+ *
+ * @param[in] Cloc
+ *          The local C vector.
+ *
+ * @param[inout] ldc
+ *          The leading dimension of the local C vector.
+ *          Will be updated to corresponds to the global one.
+ *
+ * @param[in] nrhs
+ *          The number of RHS.
+ *
+ *******************************************************************************
+ *
+ * @return A global C vector which stores local datas and set remote datas to 0.
+ *
+ *******************************************************************************/
+static inline spm_complex64_t *
+z_spmm_build_Ctmp( const spmatrix_t      *spm,
+                   const spm_complex64_t *Cloc,
+                         spm_int_t       *ldc,
+                         int              nrhs )
+{
+    spm_complex64_t *Ctmp;
+    spm_complex64_t *Cptr = (spm_complex64_t *)Cloc;
+    spm_int_t i, j, idx;
+    spm_int_t ig, dof, baseval, *loc2glob;
+
+    Ctmp = calloc(spm->gNexp * nrhs, sizeof(spm_complex64_t));
+    *ldc = spm->gNexp;
+
+    baseval = spmFindBase(spm);
+    for ( j = 0; j < nrhs; j++ )
+    {
+        loc2glob = spm->loc2glob;
+        for ( i = 0; i < spm->n; i++, loc2glob++ )
+        {
+            ig  = *loc2glob - baseval;
+            dof = (spm->dof > 0) ? spm->dof : spm->dofs[ig+1] - spm->dofs[ig];
+            idx = (spm->dof > 0) ? spm->dof * ig : spm->dofs[ig] - baseval;
+            memcpy( (Ctmp + j * spm->gNexp + idx),
+                     Cptr,
+                     dof * sizeof(spm_complex64_t) );
+            Cptr += dof;
+        }
+    }
+    return Ctmp;
+}
+
+/**
+ *******************************************************************************
+ *
+ * @ingroup spm_dev_matvec
+ *
+ * @brief Build a global B vector by gathering datas from all nodes.
+ *
+ *******************************************************************************
+ *
+ * @param[in] spm
+ *          The pointer to the sparse matrix structure.
+ *
+ * @param[in] Bloc
+ *          The local B vector.
+ *
+ * @param[inout] ldb
+ *          The leading dimension of the local B vector.
+ *          Will be updated to corresponds to the global one.
+ *
+ * @param[in] nrhs
+ *          The number of RHS.
+ *
+ *******************************************************************************
+ *
+ * @return The gathered Btmp vector.
+ *
+ *******************************************************************************/
+static inline spm_complex64_t *
+z_spmm_build_Btmp( const spmatrix_t      *spm,
+                   const spm_complex64_t *Bloc,
+                         spm_int_t       *ldb,
+                         int              nrhs )
+{
+    spm_complex64_t *Btmp;
+
+    Btmp = z_spmGatherRHS( spm, nrhs, Bloc, *ldb, -1 );
+    *ldb = spm->gNexp;
+    return Btmp;
+}
+
 /**
  *******************************************************************************
  *
@@ -457,8 +739,10 @@ spm_zspmm( spm_side_t             side,
            spm_int_t              ldc )
 {
     int rc = SPM_SUCCESS;
+    int distribution;
     spm_int_t M, N, ldx, ldy, r;
     __spm_zmatvec_t args;
+    spm_complex64_t *Ctmp, *Btmp;
 
     if ( transB != SpmNoTrans ) {
         fprintf(stderr, "transB != SpmNoTrans not supported yet in spmv computations\n");
@@ -467,18 +751,18 @@ spm_zspmm( spm_side_t             side,
     }
 
     if ( side == SpmLeft ) {
-        M = A->n;
+        M = A->nexp;
         N = K;
 
-        ldx  = ldb;
-        ldy  = ldc;
+        ldx = ldb;
+        ldy = ldc;
     }
     else {
         M = K;
-        N = A->n;
+        N = A->nexp;
 
-        ldx  = 1;
-        ldy  = 1;
+        ldx = 1;
+        ldy = 1;
     }
 
     if ( beta == 0. ) {
@@ -492,15 +776,45 @@ spm_zspmm( spm_side_t             side,
         return SPM_SUCCESS;
     }
 
+    Btmp = (spm_complex64_t*)B;
+    Ctmp = C;
+    distribution = spm_get_distribution(A);
+    if ( distribution != ( SpmDistByColumn | SpmDistByRow ) ) {
+
+        if ( A->mtxtype != SpmGeneral ) {
+            Btmp = z_spmm_build_Btmp( A, B, &ldb, N );
+            Ctmp = z_spmm_build_Ctmp( A, C, &ldc, N );
+        }
+        else {
+            if( ( (transA != SpmNoTrans) && (distribution == 1) ) ||
+                ( (transA == SpmNoTrans) && (distribution == 2) ) ) {
+                Btmp = z_spmm_build_Btmp( A, B, &ldb, N );
+            }
+            if( ( (transA == SpmNoTrans) && (distribution == 1) ) ||
+                ( (transA != SpmNoTrans) && (distribution == 2) ) ) {
+                Ctmp = z_spmm_build_Ctmp( A, C, &ldc, N );
+            }
+        }
+    }
+
     __spm_zmatvec_args_init( &args, side, transA,
-                             alpha, A, B, ldb, C, ldc );
+                             alpha, A, Btmp, ldb, Ctmp, ldc );
 
     for( r=0; (r < N) && (rc == SPM_SUCCESS); r++ ) {
-        args.x = B + r * ldx;
-        args.y = C + r * ldy;
+        args.x = Btmp + r * ldx;
+        args.y = Ctmp + r * ldy;
         rc = args.loop_fct( &args );
     }
 
+    if ( Ctmp != C ) {
+        z_spmReduceRHS( A, N, Ctmp, A->gNexp, C, A->nexp );
+        free( Ctmp );
+    }
+
+    if ( Btmp != B ) {
+        free( Btmp );
+    }
+
     return rc;
 }
 
@@ -558,23 +872,54 @@ spm_zspmv( spm_trans_t            trans,
            spm_int_t              incy )
 {
     int rc = SPM_SUCCESS;
+    int distribution;
     __spm_zmatvec_t args;
+    spm_complex64_t *ytmp, *xtmp;
 
     if ( beta == 0. ) {
-        memset( y, 0, A->n * sizeof(spm_complex64_t) );
+        memset( y, 0, A->nexp * sizeof(spm_complex64_t) );
     }
     else {
-        cblas_zscal( A->n, CBLAS_SADDR(beta), y, incy );
+        cblas_zscal( A->nexp, CBLAS_SADDR(beta), y, incy );
     }
 
     if ( alpha == 0. ) {
         return SPM_SUCCESS;
     }
 
-    __spm_zmatvec_args_init( &args, SpmLeft, trans,
-                             alpha, A, x, incx, y, incy );
+    xtmp = (spm_complex64_t*)x;
+    ytmp = y;
+    distribution = spm_get_distribution(A);
+    if ( distribution != ( SpmDistByColumn | SpmDistByRow ) ){
+
+        if ( A->mtxtype != SpmGeneral ) {
+            xtmp = z_spmm_build_Btmp( A, x, &incx, 1 );
+            ytmp = z_spmm_build_Ctmp( A, y, &incy, 1 );
+        }
+        else {
+            if( ( (trans != SpmNoTrans) && (distribution == 1) ) ||
+                ( (trans == SpmNoTrans) && (distribution == 2) ) ) {
+                xtmp = z_spmm_build_Btmp( A, x, &incx, 1 );
+            }
+            if( ( (trans == SpmNoTrans) && (distribution == 1) ) ||
+                ( (trans != SpmNoTrans) && (distribution == 2) ) ) {
+                ytmp = z_spmm_build_Ctmp( A, y, &incy, 1 );
+            }
+        }
+    }
 
+    __spm_zmatvec_args_init( &args, SpmLeft, trans,
+                             alpha, A, xtmp, incx, ytmp, incy );
     rc = args.loop_fct( &args );
 
+    if ( ytmp != y ) {
+        z_spmReduceRHS( A, 1, ytmp, A->gNexp, y, A->nexp );
+        free( ytmp );
+    }
+
+    if ( xtmp != x ) {
+        free( xtmp );
+    }
+
     return rc;
 }
diff --git a/src/z_spm_print.c b/src/z_spm_print.c
index 7ba3c73e5059470899c4840bfc6df03207278ffb..a462f0ac257bd5b580ad933a46ad36bda32ce00e 100644
--- a/src/z_spm_print.c
+++ b/src/z_spm_print.c
@@ -460,15 +460,15 @@ z_spmPrint( FILE *f, const spmatrix_t *spm )
  *******************************************************************************/
 void
 z_spmPrintRHS( FILE *f, const spmatrix_t *spm,
-               int n, const void *x, spm_int_t ldx )
+               int nrhs, const void *x, spm_int_t ldx )
 {
     const spm_complex64_t *xptr = (const spm_complex64_t *)x;
     spm_int_t i, j, ig, baseval;
 
     baseval = spmFindBase( spm );
 
-    for( j=0; j<n; j++) {
-        for( i=0; i<spm->n; i++, xptr++ ) {
+    for( j=0; j<nrhs; j++) {
+        for( i=0; i < spm->nexp; i++, xptr++ ) {
             ig = (spm->loc2glob == NULL) ? i : spm->loc2glob[i] - baseval;
 
             z_spmPrintElt( f, ig, j, *xptr );
diff --git a/src/z_spm_reduce_rhs.c b/src/z_spm_reduce_rhs.c
index 69e96fb3dfd0df5e1cec86b81c2a51500659c993..2a331d97c8a08d9f7dea4e986d56973776b3d28b 100644
--- a/src/z_spm_reduce_rhs.c
+++ b/src/z_spm_reduce_rhs.c
@@ -9,8 +9,7 @@
  *
  * @version 1.0.0
  * @author Delarue Tony
- * @author Faverge Mathieu
- * @date 2020-06-11
+ * @date 2020-03-08
  *
  * @precisions normal z -> c d s
  *
@@ -21,135 +20,65 @@
 /**
  *******************************************************************************
  *
- * @brief Reduce all the global C coefficients and store the good ones in local
+ * @brief Reduce all the global coefficients  of a rhs and store the local ones
  *
  *******************************************************************************
  *
  * @param[in] spm
  *          The sparse matrix spm
  *
- * @param[inout] glob
- *          The global vector to reduce
+ * @param[in] nrhs
+ *          Number of rhs vectors.
  *
- * @param[inout] loc
- *          Local vector
+ * @param[inout] bglob
+ *          The global rhs vector to reduce.
+ *
+ * @param[inout] bloc
+ *          Local rhs vector.
+ *
+ * @param[inout] ldb
+ *          Leading dimension of the global b  vector.
  *
  *******************************************************************************/
-spm_complex64_t *
+void
 z_spmReduceRHS( const spmatrix_t      *spm,
-                int                    nrhs,
-                const spm_complex64_t *x,
-                spm_int_t              ldx,
-                int                    root )
+                      int              nrhs,
+                      spm_complex64_t *bglob,
+                      spm_int_t        ldbglob,
+                      spm_complex64_t *b,
+                      spm_int_t        ldb )
 {
-    spm_complex64_t *outptr, *out = NULL;
-    int              c;
-    int              n, nmax = 0;
-    int              nexp, nexpmax = 0;
-    spm_complex64_t *tmp_out, *current_out;
-    spm_int_t       *tmp_l2g, *current_l2g;
-    spm_int_t        current_n, current_nexp;
-    spm_int_t        i, j, k, ig, dofi, row;
-    int              local =  ( ( root == -1 ) || (root == spm->clustnum) );
-    spm_int_t        baseval = spmFindBase( spm );
-
-    /* We do not handle cases where ldx is different from spm->n */
-    assert( ldx == spm->nexp );
+#if defined(SPM_WITH_MPI)
+    spm_int_t        i, j, k;
+    spm_int_t        ig, dofi, row, baseval;
+    spm_int_t       *loc2glob;
+    spm_complex64_t *rhs = b;
 
     if ( spm->loc2glob == NULL ) {
-        if ( local ) {
-            out = malloc( spm->gNexp * nrhs * sizeof( spm_complex64_t ) );
-            memcpy( out, x, spm->gNexp * nrhs * sizeof( spm_complex64_t ) );
-        }
-        return out;
+        return;
     }
 
-#if !defined(SPM_WITH_MPI)
-    assert( 0 );
-#else
-    n    = spm->n;
-    nexp = spm->nexp;
-    if ( root == -1 ) {
-        MPI_Allreduce( &n,    &nmax,    1, MPI_INT, MPI_MAX, spm->comm );
-        MPI_Allreduce( &nexp, &nexpmax, 1, MPI_INT, MPI_MAX, spm->comm );
-    }
-    else {
-        MPI_Reduce( &n,    &nmax,    1, MPI_INT, MPI_MAX, root, spm->comm );
-        MPI_Reduce( &nexp, &nexpmax, 1, MPI_INT, MPI_MAX, root, spm->comm );
-    }
-
-    if ( local ) {
-        out     = malloc( spm->gNexp * nrhs * sizeof( spm_complex64_t ) );
-        tmp_out = malloc( nexpmax    * nrhs * sizeof( spm_complex64_t ) );
-        tmp_l2g = malloc( nmax              * sizeof( spm_int_t )       );
-    }
-
-    for( c=0; c<spm->clustnbr; c++ ) {
-        MPI_Status status;
-
-        if ( c == spm->clustnum ) {
-            current_n    = spm->n;
-            current_nexp = spm->nexp;
-            current_l2g  = spm->loc2glob;
-            current_out  = (spm_complex64_t*)x;
-        }
-        else {
-            current_n    = 0;
-            current_nexp = 0;
-            current_l2g  = tmp_l2g;
-            current_out  = tmp_out;
-        }
+    MPI_Allreduce( MPI_IN_PLACE, bglob, ldbglob * nrhs, SPM_MPI_COMPLEX64, MPI_SUM, spm->comm );
 
-        if ( root == -1 ) {
-            MPI_Bcast( &current_n,    1, SPM_MPI_INT, c, spm->comm );
-            MPI_Bcast( &current_nexp, 1, SPM_MPI_INT, c, spm->comm );
-            if ( current_n > 0 ) {
-                MPI_Bcast( current_l2g, current_n,           SPM_MPI_INT,       c, spm->comm );
-                MPI_Bcast( current_out, current_nexp * nrhs, SPM_MPI_COMPLEX64, c, spm->comm );
+    baseval  = spmFindBase( spm );
+    loc2glob = spm->loc2glob;
+    for( i=0; i<spm->n; i++, loc2glob++ ) {
+        ig   = *loc2glob - baseval;
+        dofi = ( spm->dof > 0 ) ? spm->dof : spm->dofs[ig+1] - spm->dofs[ig];
+        row  = ( spm->dof > 0 ) ? spm->dof * ig : spm->dofs[ig] - baseval;
+        for( j=0; j<nrhs; j++ ) {
+            for( k=0; k<dofi; k++ ) {
+                rhs[ j * ldb + k ] = bglob[ row + j * ldbglob + k ];
             }
         }
-        else if ( root != c ) { /* No communication if c == root */
-            if ( c == spm->clustnum ) {
-                MPI_Send( &current_n,    1, SPM_MPI_INT, root, 0, spm->comm );
-                MPI_Send( &current_nexp, 1, SPM_MPI_INT, root, 1, spm->comm );
-                if ( current_n > 0 ) {
-                    MPI_Send( current_l2g, current_n,           SPM_MPI_INT,       root, 2, spm->comm );
-                    MPI_Send( current_out, current_nexp * nrhs, SPM_MPI_COMPLEX64, root, 3, spm->comm );
-                }
-            }
-            if ( root == spm->clustnum ) {
-                MPI_Recv( &current_n,    1, SPM_MPI_INT, c, 0, spm->comm, &status );
-                MPI_Recv( &current_nexp, 1, SPM_MPI_INT, c, 1, spm->comm, &status );
-                if ( current_n > 0 ) {
-                    MPI_Recv( current_l2g, current_n,           SPM_MPI_INT,       c, 2, spm->comm, &status );
-                    MPI_Recv( current_out, current_nexp * nrhs, SPM_MPI_COMPLEX64, c, 3, spm->comm, &status );
-                }
-            }
-        }
-
-        if ( !local ) {
-            continue;
-        }
-
-        outptr = out;
-        for( i=0; i<current_n; i++, current_l2g++ ) {
-            ig   = *current_l2g - baseval;
-            dofi = ( spm->dof > 0 ) ? spm->dof : spm->dofs[ig+1] - spm->dofs[ig];
-            row  = ( spm->dof > 0 ) ? spm->dof * ig : spm->dofs[ig] - baseval;
-            for( j=0; j<nrhs; j++ ) {
-                for( k=0; k<dofi; k++ ) {
-                    outptr[ row + j * spm->gNexp + k ] = current_out[ j * current_nexp + k ];
-                }
-            }
-            current_out += dofi;
-        }
-    }
-
-    if ( local ) {
-        free( tmp_out );
-        free( tmp_l2g );
+        rhs += dofi;
     }
-
+#else
+    (void)spm;
+    (void)nrhs;
+    (void)bglob;
+    (void)ldbglob;
+    (void)b;
+    (void)ldb;
 #endif
-    return out;
-}
+}
\ No newline at end of file
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 311bff7fd1adae3dd94ea4dede2e4db8d473189d..815e11c4c9d80872f9f773bf36822a3314b6a07b 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -53,6 +53,7 @@ if ( SPM_WITH_MPI )
     spm_scatter_gather_tests.c
     spm_dist_norm_tests.c
     spm_dist_genrhs_tests.c
+    spm_dist_matvec_tests.c
     )
 endif()
 
@@ -70,7 +71,9 @@ set( SPM_DOF_TESTS
 set( SPM_MPI_TESTS
   spm_scatter_gather_tests
   spm_dist_norm_tests
-  spm_dist_genrhs_tests )
+  spm_dist_genrhs_tests
+  spm_dist_matvec_tests
+  )
 
 # List of run types
 set( RUNTYPE shm )
diff --git a/tests/spm_compare.c b/tests/spm_compare.c
index d16adad322d211742633ae52c8f427b121da4db7..123c6b071cf9685118cb593233083bf49b5a0243 100644
--- a/tests/spm_compare.c
+++ b/tests/spm_compare.c
@@ -138,7 +138,9 @@ int
 spmCompare( spmatrix_t *spm1,
             spmatrix_t *spm2 )
 {
-    spm_int_t i, base1, base2;
+    spm_int_t i;
+    spm_int_t base1 = -1;
+    spm_int_t base2 = -2;
     int rc = 0;
 
     /* I don't have one of the matrices, I jump to the findbase computations */
diff --git a/tests/spm_dist_genrhs_tests.c b/tests/spm_dist_genrhs_tests.c
index a0fb79f6b2319868748f97640024983d74a94058..26d0c46e15bd70a66b94d585e88ff56b653e5740 100644
--- a/tests/spm_dist_genrhs_tests.c
+++ b/tests/spm_dist_genrhs_tests.c
@@ -149,7 +149,7 @@ int main (int argc, char **argv)
                         printf( "SUCCESS\n" );
                     }
                     else {
-                        printf( "FAILURE\n" );
+                        printf( "FAILED\n" );
                     }
                 }
                 err = (rc != 0) ? err+1 : err;
diff --git a/tests/spm_dist_matvec_tests.c b/tests/spm_dist_matvec_tests.c
new file mode 100644
index 0000000000000000000000000000000000000000..7d2e270bdbe19e0a1f85bed61850c39642ef5b32
--- /dev/null
+++ b/tests/spm_dist_matvec_tests.c
@@ -0,0 +1,204 @@
+/**
+ *
+ * @file spm_dist_matvec_tests.c
+ *
+ * Tests and validate the spm_matvec routines.
+ *
+ * @copyright 2015-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ * @version 6.0.0
+ * @author Mathieu Faverge
+ * @author Theophile Terraz
+ * @author Tony Delarue
+ * @date 2020-05-19
+ *
+ **/
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include <time.h>
+#include "spm_tests.h"
+
+#if !defined(SPM_WITH_MPI)
+#error "This test should not be compiled in non distributed version"
+#endif
+
+#define PRINT_RES(_ret_)                        \
+    if(_ret_) {                                 \
+        printf("FAILED(%d)\n", _ret_);          \
+        err++;                                  \
+    }                                           \
+    else {                                      \
+        printf("SUCCESS\n");                    \
+    }
+
+int main (int argc, char **argv)
+{
+    char         *filename;
+    spmatrix_t    original, *origdist, *spm;
+    spm_driver_t  driver;
+    int clustnbr = 1;
+    int clustnum = 0;
+    spm_mtxtype_t mtxtype;
+    spm_fmttype_t fmttype;
+    spm_trans_t   trans;
+    int baseval, distbycol;
+    int rc = SPM_SUCCESS;
+    int err = 0;
+    int dof, dofmax = 4;
+    int to_free = 0;
+
+    MPI_Init( &argc, &argv );
+
+    /**
+     * Get options from command line
+     */
+    spmGetOptions( argc, argv,
+                   &driver, &filename );
+
+    rc = spmReadDriver( driver, filename, &original );
+    free(filename);
+
+    if ( rc != SPM_SUCCESS ) {
+        fprintf(stderr, "ERROR: Could not read the file, stop the test !!!\n");
+        return EXIT_FAILURE;
+    }
+
+    MPI_Comm_size( MPI_COMM_WORLD, &clustnbr );
+    MPI_Comm_rank( MPI_COMM_WORLD, &clustnum );
+
+    if ( original.flttype == SpmPattern ) {
+        spmGenFakeValues( &original );
+    }
+
+    spmPrintInfo( &original, stdout );
+
+    if ( clustnum == 0 ) {
+        printf(" -- SPM Matrix-Vector Test --\n");
+    }
+
+    for( fmttype=SpmCSC; fmttype<=SpmIJV; fmttype++ ) {
+
+        distbycol = (fmttype == SpmCSR) ? 0 : 1;
+        if ( spmConvert( fmttype, &original ) != SPM_SUCCESS ) {
+            fprintf( stderr, "Issue to convert to %s format\n", fmtnames[fmttype] );
+            continue;
+        }
+
+        origdist = spmScatter( &original, -1, NULL, distbycol, -1, MPI_COMM_WORLD );
+        if ( origdist == NULL ) {
+            fprintf( stderr, "Failed to scatter the spm\n" );
+            continue;
+        }
+
+        for( dof=-1; dof<2; dof++ )
+        {
+            if ( dof >= 0 ) {
+                spm = spmDofExtend( origdist, dof, dofmax );
+                to_free = 1;
+            }
+            else {
+                spm = malloc(sizeof(spmatrix_t));
+                memcpy( spm, origdist, sizeof(spmatrix_t) );
+                to_free = 0;
+            }
+
+            if ( spm == NULL ) {
+                fprintf( stderr, "Issue to extend the matrix\n" );
+                continue;
+            }
+
+            for( baseval=0; baseval<2; baseval++ )
+            {
+                spmBase( spm, baseval );
+
+                for( mtxtype=SpmGeneral; mtxtype<=SpmHermitian; mtxtype++ )
+                {
+                    if ( (mtxtype == SpmHermitian) &&
+                         ( ((original.flttype != SpmComplex64) &&
+                            (original.flttype != SpmComplex32)) ||
+                           (original.mtxtype != SpmHermitian) ) )
+                    {
+                        continue;
+                    }
+
+                    if ( (mtxtype != SpmGeneral) &&
+                         (original.mtxtype == SpmGeneral) )
+                    {
+                        continue;
+                    }
+
+                    spm->mtxtype = mtxtype;
+
+                    for( trans=SpmNoTrans; trans<=SpmConjTrans; trans++ )
+                    {
+                        if ( (trans == SpmConjTrans) &&
+                             ((spm->flttype != SpmComplex64) && (spm->flttype != SpmComplex32)))
+                        {
+                            continue;
+                        }
+
+                        if(clustnum == 0) {
+                            printf( " Case: %s / %s / dof(%8s) / base(%d) / %10s / %9s : ",
+                                    fltnames[spm->flttype],
+                                    fmtnames[spm->fmttype],
+                                    dofname[dof+1],
+                                    baseval,
+                                    mtxnames[mtxtype - SpmGeneral],
+                                    transnames[trans - SpmNoTrans] );
+                        }
+
+                        switch( spm->flttype ){
+                        case SpmComplex64:
+                            rc = z_spm_dist_matvec_check( baseval, trans, spm );
+                            break;
+
+                        case SpmComplex32:
+                            rc = c_spm_dist_matvec_check( baseval, trans, spm );
+                            break;
+
+                        case SpmFloat:
+                            rc = s_spm_dist_matvec_check( baseval, trans, spm );
+                            break;
+
+                        case SpmDouble:
+                        default:
+                            rc = d_spm_dist_matvec_check( baseval, trans, spm );
+                        }
+                        err = (rc != 0) ? err+1 : err;
+                    }
+                }
+            }
+
+            if ( spm != origdist ) {
+                if( to_free ){
+                    spmExit( spm  );
+                }
+                free( spm );
+            }
+        }
+
+        spmExit( origdist );
+        free( origdist );
+    }
+
+    spmExit(&original);
+
+    MPI_Finalize();
+
+    if( err == 0 ) {
+        if(clustnum == 0) {
+            printf(" -- All tests PASSED --\n");
+        }
+        return EXIT_SUCCESS;
+    }
+    else
+    {
+        printf(" -- %d tests FAILED --\n", err);
+        return EXIT_FAILURE;
+    }
+}
diff --git a/tests/spm_dist_norm_tests.c b/tests/spm_dist_norm_tests.c
index 8c7c7dd36f2fbefdf07874a06784e2eed71439ba..1369f92e1e4ed3d26b56b61aefe1a7860b4b5bcc 100644
--- a/tests/spm_dist_norm_tests.c
+++ b/tests/spm_dist_norm_tests.c
@@ -67,7 +67,9 @@ int main (int argc, char **argv)
 
     spmPrintInfo( &original, stdout );
 
-    printf(" -- SPM Norms Test --\n");
+    if ( clustnum == 0 ) {
+        printf(" -- SPM Norms Test --\n");
+    }
 
     for( fmttype=SpmCSC; fmttype<=SpmIJV; fmttype++ ) {
 
@@ -128,7 +130,7 @@ int main (int argc, char **argv)
                         }
                         spmdist->mtxtype = mtxtype;
 
-                        if(clustnum == 0) {
+                        if ( clustnum == 0 ) {
                             printf( " Case: %s / %s / %d / %s / %d\n",
                                     fltnames[spmdist->flttype],
                                     fmtnames[spmdist->fmttype], baseval,
diff --git a/tests/spm_tests.h b/tests/spm_tests.h
index 7c6a2e496c4585bfb04eb89b51c7c8c5f7fbe006..85dd17ab26f7f2bd503d492d906a5476f3e62ce6 100644
--- a/tests/spm_tests.h
+++ b/tests/spm_tests.h
@@ -85,6 +85,7 @@ int  z_spm_norm_check( const spmatrix_t *spm );
 int  z_spm_dist_norm_check( const spmatrix_t *spm, const spmatrix_t *spmdist );
 int  z_spm_dist_genrhs_check( const spmatrix_t *spm, spm_int_t nrhs,
                               const spm_complex64_t *bloc, const spm_complex64_t *bdst, int root );
+int  z_spm_dist_matvec_check( spm_int_t baseval, spm_trans_t trans, const spmatrix_t *spm );
 
 void c_spm_print_check( char *filename, const spmatrix_t *spm );
 int  c_spm_matvec_check( spm_trans_t trans, const spmatrix_t *spm );
@@ -92,6 +93,7 @@ int  c_spm_norm_check( const spmatrix_t *spm );
 int  c_spm_dist_norm_check( const spmatrix_t *spm, const spmatrix_t *spmdist );
 int  c_spm_dist_genrhs_check( const spmatrix_t *spm, spm_int_t nrhs,
                               const spm_complex32_t *bloc, const spm_complex32_t *bdst, int root );
+int  c_spm_dist_matvec_check( spm_int_t baseval, spm_trans_t trans, const spmatrix_t *spm );
 
 void d_spm_print_check( char *filename, const spmatrix_t *spm );
 int  d_spm_matvec_check( spm_trans_t trans, const spmatrix_t *spm );
@@ -99,6 +101,7 @@ int  d_spm_norm_check( const spmatrix_t *spm );
 int  d_spm_dist_norm_check( const spmatrix_t *spm, const spmatrix_t *spmdist );
 int  d_spm_dist_genrhs_check( const spmatrix_t *spm, spm_int_t nrhs,
                               const double *bloc, const double *bdst, int root );
+int  d_spm_dist_matvec_check( spm_int_t baseval, spm_trans_t trans, const spmatrix_t *spm );
 
 void s_spm_print_check( char *filename, const spmatrix_t *spm );
 int  s_spm_matvec_check( spm_trans_t trans, const spmatrix_t *spm );
@@ -106,6 +109,7 @@ int  s_spm_norm_check( const spmatrix_t *spm );
 int  s_spm_dist_norm_check( const spmatrix_t *spm, const spmatrix_t *spmdist );
 int  s_spm_dist_genrhs_check( const spmatrix_t *spm, spm_int_t nrhs,
                               const float *bloc, const float *bdst, int root );
+int  s_spm_dist_matvec_check( spm_int_t baseval, spm_trans_t trans, const spmatrix_t *spm );
 
 void p_spm_print_check( char *filename, const spmatrix_t *spm );
 
diff --git a/tests/z_spm_tests.c b/tests/z_spm_tests.c
index dcbfebc3f47ebacb3497a118d3ebe0871eb0c9d1..c53a0900c9c845312b8f437c1456d24b152c6b0f 100644
--- a/tests/z_spm_tests.c
+++ b/tests/z_spm_tests.c
@@ -353,7 +353,7 @@ z_spm_dist_genrhs_check( const spmatrix_t      *spm,
     norm = LAPACKE_zlange( LAPACK_COL_MAJOR, 'M', M, nrhs, bloc, M );
 
     /* Let's gather the distributed RHS */
-    tmp = z_spmReduceRHS( spm, nrhs, bdst, spm->nexp, root );
+    tmp = z_spmGatherRHS( spm, nrhs, bdst, spm->nexp, root );
 
     rc = 0;
     if ( tmp != NULL ) {
@@ -385,4 +385,116 @@ z_spm_dist_genrhs_check( const spmatrix_t      *spm,
 
     return ret;
 }
+
+/*------------------------------------------------------------------------
+ *  Check the accuracy of the solution
+ */
+int
+z_spm_dist_matvec_check( spm_int_t baseval, spm_trans_t trans, const spmatrix_t *spm )
+{
+    spmatrix_t            *spmloc;
+    unsigned long long int seed  = 35469;
+    unsigned long long int seedx = 24632;
+    unsigned long long int seedy = 73246;
+    spm_complex64_t       *x, *y, *yd;
+    /*
+     * Alpha and beta are complex for cblas, but only the real part is used for
+     * matvec/matmat subroutines
+     */
+    double dalpha = 0.;
+    double dbeta  = 0.;
+
+    double    Anorm, Xnorm, Ynorm, Ylnorm, Ydnorm, Rnorm;
+    double    eps, result;
+    int       rc, info_solution, start = 1;
+    spm_int_t ldl, ldd, nrhs = 1;
+
+    eps = LAPACKE_dlamch_work('e');
+
+    core_dplrnt( 1, 1, &dalpha, 1, 1, start, 0, seed ); start++;
+    core_dplrnt( 1, 1, &dbeta,  1, 1, start, 0, seed ); start++;
+
+    ldd = spm->nexp;
+    ldl = spm->gNexp;
+
+    /* Generate random x and y in distributed */
+    x = (spm_complex64_t*)malloc( ldd * nrhs * sizeof(spm_complex64_t) );
+    z_spmRhsGenRndDist( spm, baseval, 1., nrhs, x, spm->nexp, 1, seedx );
+
+    y = (spm_complex64_t*)malloc( ldd * nrhs * sizeof(spm_complex64_t) );
+    z_spmRhsGenRndDist( spm, baseval, 1., nrhs, y, spm->nexp, 1, seedy );
+
+    /* Compute the distributed sparse matrix-vector product */
+    rc = spmMatMat( trans, nrhs, dalpha, spm, x, ldd, dbeta, y, ldd );
+    if ( rc != SPM_SUCCESS ) {
+        info_solution = 1;
+        goto end;
+    }
+
+    /* Compute matrix norm and gather info */
+    Anorm  = spmNorm( SpmInfNorm, spm );
+    spmloc = spmGather( spm, 0 );
+    yd     = z_spmGatherRHS( spm, nrhs, y, ldd, 0 );
+
+    /* Compute the local sparse matrix-vector product */
+    if ( spm->clustnum == 0 ) {
+        spm_complex64_t *xl, *yl;
+
+       /* Generate xl and yl as x and y locally on 0 */
+        xl = (spm_complex64_t*)malloc( ldl * nrhs * sizeof(spm_complex64_t) );
+        z_spmRhsGenRndShm( spmloc, baseval, 1., nrhs, xl, spm->nexp, 1, seedx );
+
+        yl = (spm_complex64_t*)malloc( ldl * nrhs * sizeof(spm_complex64_t) );
+        z_spmRhsGenRndShm( spmloc, baseval, 1., nrhs, yl, spm->nexp, 1, seedy );
+
+        /* Compute the original norms */
+        Xnorm = LAPACKE_zlange( LAPACK_COL_MAJOR, 'I', ldl, nrhs, xl, ldl );
+        Ynorm = LAPACKE_zlange( LAPACK_COL_MAJOR, 'I', ldl, nrhs, yl, ldl );
+
+        rc = spmMatMat( trans, nrhs, dalpha, spmloc, xl, ldl, dbeta, yl, ldl );
+        if ( rc != SPM_SUCCESS ) {
+            info_solution = 1;
+            goto end;
+        }
+
+        /* Compute the final norm in shared memory */
+        Ylnorm = LAPACKE_zlange( LAPACK_COL_MAJOR, 'I', ldl, nrhs, yl, ldl );
+        Ydnorm = LAPACKE_zlange( LAPACK_COL_MAJOR, 'I', ldl, nrhs, yd, ldl );
+
+        core_zgeadd( SpmNoTrans, ldl, nrhs,
+                     -1., yl, ldl,
+                      1., yd, ldl );
+        Rnorm = LAPACKE_zlange( LAPACK_COL_MAJOR, 'M', ldl, nrhs, yd, ldl );
+
+
+        result = Rnorm / ( (Anorm + Xnorm + Ynorm) * spm->gNexp * eps );
+        if (  isinf(Ydnorm) || isinf(Ylnorm) ||
+              isnan(result) || isinf(result) || (result > 10.0) )
+        {
+            info_solution = 1;
+            printf( "FAILED !\n" );
+                    /* "  ||A||_inf = %e, ||x||_inf = %e, ||y||_inf = %e\n" */
+                    /* "  ||shm(a*A*x+b*y)||_inf = %e, ||dist(a*A*x+b*y)||_inf = %e, ||R||_m = %e\n", */
+                    /* Anorm, Xnorm, Ynorm, Ylnorm, Ydnorm, Rnorm ); */
+        }
+        else {
+            info_solution = 0;
+            printf("SUCCESS !\n");
+        }
+
+        free( xl );
+        free( yl );
+        free( yd );
+        spmExit( spmloc );
+        free( spmloc );
+    }
+
+    MPI_Bcast( &info_solution, 1, MPI_INT, 0, spm->comm );
+
+  end:
+    free( x );
+    free( y );
+
+    return info_solution;
+}
 #endif