diff --git a/include/main.h b/include/main.h
index dfc04f7991a7c2f9241c10dca576c3648f53d43b..9669b8023c2958f103d10557d2955571a2641bcb 100644
--- a/include/main.h
+++ b/include/main.h
@@ -1,6 +1,4 @@
-#ifndef __MAIN_H_INCLUDE
-#define __MAIN_H_INCLUDE 1
-
+#pragma once
 #include "config.h"
 
 /* ================================================================================= */
@@ -40,62 +38,34 @@ enum numModel {
 
   Default is _bem. Can be set with --bem, --fem, --fembem.
 */
-_EXTERN_TEST_FEMBEM_ enum numModel testedModel
-#ifdef ___TEST_FEMBEM___
-=_bem
-#endif
-  ;
+extern enum numModel testedModel;
 
 /*! \brief Number of right hand sides for the test */
-_EXTERN_TEST_FEMBEM_ int nbRHS
-#ifdef ___TEST_FEMBEM___
-= 1
-#endif
-  ;
-  
+extern int nbRHS;
+
 /*! \brief Sparsity of the right hand sides (1 non-zero value every 'n' values). Default is nbpts/80.log10(nbpts).
 
 The objective of using 'sparseRHS' is to speed-up the computation in produitClassique() in classic.c.
 */
-_EXTERN_TEST_FEMBEM_ int sparseRHS
-#ifdef ___TEST_FEMBEM___
-= -1
-#endif
-  ;
-  
+extern int sparseRHS;
+
 /*! \brief Use new way to compute RHS. Default is True. */
-_EXTERN_TEST_FEMBEM_ int newRHS
-#ifdef ___TEST_FEMBEM___
-= 1
-#endif
-  ;
+extern int newRHS;
 
 /*! \brief Number of points in the points cloud (BEM+FEM) */
-_EXTERN_TEST_FEMBEM_ int nbPts
-#ifdef ___TEST_FEMBEM___
-= 16000
-#endif
-  ;
+extern int nbPts;
 
 /*! \brief Tile size for the CHAMELEON solver (change it with '--chameleon-tile') */
-_EXTERN_TEST_FEMBEM_ int tileSize
-#ifdef ___TEST_FEMBEM___
-= 1024
-#endif
-    ;
+extern int tileSize;
 
 /*! \brief Number of points in the points cloud (BEM main cylinder only) */
-_EXTERN_TEST_FEMBEM_ int nbPtsMain ;
+extern int nbPtsMain ;
 
 /*! \brief Number of points in the points cloud (BEM part only) */
-_EXTERN_TEST_FEMBEM_ int nbPtsBEM ;
+extern int nbPtsBEM ;
 
 /*! \brief RHS for my tests */
-_EXTERN_TEST_FEMBEM_ void *rhs
-#ifdef ___TEST_FEMBEM___
-= NULL
-#endif
-  ;
+extern void *rhs;
 
 /*! \brief Using partial interactions between BEM and FEM unknowns
 
@@ -104,123 +74,58 @@ _EXTERN_TEST_FEMBEM_ void *rhs
  It is used to mimic the case where a part of the BEM mesh is NOT the outer surface of the FEM mesh.
  It changes the content of the matrix and hence the results, timings, accuracy, etc.
 */
-_EXTERN_TEST_FEMBEM_ int partial_fembem
-#ifdef ___TEST_FEMBEM___
-= 0
-#endif
-  ;
+extern int partial_fembem;
 
 /*! \brief Verbosity flag */
-_EXTERN_TEST_FEMBEM_ int verboseTest
-#ifdef ___TEST_FEMBEM___
-= 0
-#endif
-  ;
+extern int verboseTest;
 
 /*! \brief NTiles recursive clustering enabled */
-_EXTERN_TEST_FEMBEM_ int use_ntiles_rec
-#ifdef ___TEST_FEMBEM___
-= 0
-#endif
-  ;
+extern int use_ntiles_rec;
 
 /*! \brief flag to activate result check */
-_EXTERN_TEST_FEMBEM_ int check_result
-#ifdef ___TEST_FEMBEM___
-= 0
-#endif
-  ;
+extern int check_result;
 
 /*! \brief flag to activate traces generation */
-_EXTERN_TEST_FEMBEM_ int generate_traces
-#ifdef ___TEST_FEMBEM___
-= 0
-#endif
-  ;
+extern int generate_traces;
 
 /*! \brief flag to active an overmeshed detail */
-_EXTERN_TEST_FEMBEM_ int useDetail
-#ifdef __TEST_FEMBEM__
-= 0
-#endif
-  ;
+extern int useDetail;
 
 /*! \brief Flag to compute and display matrix norms during Hmat tests */
-_EXTERN_TEST_FEMBEM_ int computeNorm
-#ifdef __TEST_FEMBEM__
-= 0
-#endif
-  ;
+extern int computeNorm;
 
 /*! \brief Symmetry flag for the content of the matrices (0=nosym, 1=sym, 2=sym pos def, 1 by default) */
-_EXTERN_TEST_FEMBEM_ int symMatContent
-#ifdef ___TEST_FEMBEM___
-= 1
-#endif
-  ;
+extern int symMatContent;
 
 /*! \brief Symmetry flag for the solver (true by default) */
-_EXTERN_TEST_FEMBEM_ int symMatSolver
-#ifdef ___TEST_FEMBEM___
-= 1
-#endif
-  ;
+extern int symMatSolver;
 
 /*! \brief Simple precision accuracy flag */
-_EXTERN_TEST_FEMBEM_ int simplePrec
-#ifdef ___TEST_FEMBEM___
-= 0
-#endif
-  ;
+extern int simplePrec;
 
 /*! \brief Complex scalar type flag */
-_EXTERN_TEST_FEMBEM_ int complexALGO
-#ifdef ___TEST_FEMBEM___
-= 1
-#endif
-  ;
+extern int complexALGO;
 
 /*! \brief Scalar type used by the tests */
-_EXTERN_TEST_FEMBEM_ ScalarType stype
-#ifdef ___TEST_FEMBEM___
-= DOUBLE_COMPLEX
-#endif
-  ;
+extern ScalarType stype;
 
 /*! \brief  Wavelength (for oscillatory kernels). */
-_EXTERN_TEST_FEMBEM_ double lambda
-#ifdef ___TEST_FEMBEM___
-=-1.
-#endif
-  ;
+extern double lambda;
 
 /*! \brief  Number of points per wavelength (for oscillatory kernels) on the main cylinder. */
-_EXTERN_TEST_FEMBEM_ double ptsPerLambda
-#ifdef ___TEST_FEMBEM___
-=10.
-#endif
-  ;
+extern double ptsPerLambda;
 
 /*! \brief  Number of points per wavelength (for oscillatory kernels) on the detail. */
-_EXTERN_TEST_FEMBEM_ double ptsPerLambdaDetail
-#ifdef ___TEST_FEMBEM___
-=-1.
-#endif
-  ;
+extern double ptsPerLambdaDetail;
 
 /*! \brief  Radius of the leaves in the trees. */
-_EXTERN_TEST_FEMBEM_ double radiusLeaf
-#ifdef ___TEST_FEMBEM___
-=0.
-#endif
-  ;
+extern double radiusLeaf;
 
 /*! \brief  Write a VTK file of the mesh. Enabled with option --write-mesh. */
-_EXTERN_TEST_FEMBEM_ int writeMesh
-#ifdef ___TEST_FEMBEM___
-=0
-#endif
-  ;
+extern int writeMesh;
+
+/*! \brief  Write a UNV file of the mesh. Enabled with option --write-mesh-unv. */
+extern int writeMeshUNV;
 
 /*! \brief  List of  algorithms that we can test. */
 enum algo {
@@ -238,114 +143,68 @@ enum algo {
 } ;
 
 /*! \brief  Algorithms that we want to test. */
-_EXTERN_TEST_FEMBEM_ enum algo testedAlgo
-#ifdef ___TEST_FEMBEM___
-=_undefined
-#endif
-  ;
+extern enum algo testedAlgo;
 
 /*! \brief  Flag to use simple assembly routine for HMAT (default is 0).
 
   It can be changed with --use_simple_assembly (read in readFlagsTestHMAT() in hmat.c)
 */
-_EXTERN_TEST_FEMBEM_ int use_simple_assembly
-#ifdef ___TEST_FEMBEM___
-=0
-#endif
-  ;
+extern int use_simple_assembly;
 
 /*! \brief Value of the Hmat clustering divider (number of children per box, default is 2)
 
   It can be changed with "--hmat_divider" (read in readFlagsTestHMAT() in hmat.c)
 */
-_EXTERN_TEST_FEMBEM_ int divider
-#ifdef ___TEST_FEMBEM___
-=2
-#endif
-  ;
+extern int divider;
 
 /*! \brief  Flag to compute the residual norm after the Hmat solve (default is 0).
 
   It can be changed with --hmat_residual (read in readFlagsTestHMAT() in hmat.c)
 */
-_EXTERN_TEST_FEMBEM_ int hmat_residual
-#ifdef ___TEST_FEMBEM___
-=0
-#endif
-  ;
+extern int hmat_residual;
 
 /*! \brief  Flag to use Hmat iterative refinement (default is 0).
 
   It can be changed with --hmat_refine (read in readFlagsTestHMAT() in hmat.c)
 */
-_EXTERN_TEST_FEMBEM_ int hmat_refine
-#ifdef ___TEST_FEMBEM___
-=0
-#endif
-  ;
+extern int hmat_refine;
 /*! \brief  Flag to use Hmat shuffle clustering (default is 0).
 
   It can be changed with --use_hmat_shuffle (read in readFlagsTestHMAT() in hmat.c)
 */
-_EXTERN_TEST_FEMBEM_ int use_hmat_shuffle
-#ifdef ___TEST_FEMBEM___
-=0
-#endif
-  ;
+extern int use_hmat_shuffle;
 
 /*! \brief Value of the Hmat shuffle clustering minimum divider (minimum number of children per box, default is 2)
 
   It can be changed with "--hmat_divider_min" (read in readFlagsTestHMAT() in hmat.c)
 */
-_EXTERN_TEST_FEMBEM_ int divider_min
-#ifdef ___TEST_FEMBEM___
-=2
-#endif
-  ;
+extern int divider_min;
 
 /*! \brief Value of the Hmat shuffle clustering maximum divider (maximum number of children per box, default is 4)
 
   It can be changed with "--hmat_divider_max" (read in readFlagsTestHMAT() in hmat.c)
 */
-_EXTERN_TEST_FEMBEM_ int divider_max
-#ifdef ___TEST_FEMBEM___
-=4
-#endif
-  ;
+extern int divider_max;
 
-/*! \brief number of elements in z_matComp_FEM[]
-   */
-_EXTERN_TEST_FEMBEM_ size_t nnz_FEM
-#ifdef ___TEST_FEMBEM___
-=0
-#endif
-;
+/*! \brief Flag to use HODLR storage and algorithms (default is 0).
 
-/*! \brief  allocated size of z_matComp_FEM[]
+  It can be changed with "--hodlr" (read in readFlagsTestHMAT() in hmat.c)
+*/
+extern int use_hodlr;
+
+/*! \brief number of elements in z_matComp_FEM[]
    */
-_EXTERN_TEST_FEMBEM_ size_t nbEl_FEM
-#ifdef ___TEST_FEMBEM___
-=0
-#endif
-;
+extern size_t nnz_FEM;
 
 /*! \brief  Array of non-zero values of the FEM matrix
 */
-_EXTERN_TEST_FEMBEM_ ZnonZero*z_matComp_FEM
-#ifdef ___TEST_FEMBEM___
-=NULL
-#endif
-;
+extern ZnonZero*z_matComp_FEM;
 
 /*! \brief  beginning of each line in z_matComp_FEM[]
    */
-_EXTERN_TEST_FEMBEM_ size_t *ptr_ordered_FEM
-#ifdef ___TEST_FEMBEM___
-=NULL
-#endif
-;
-/* ============================================================================================================ */
-  /*! \brief User context for building MPF matrices and vectors in testMPF */
+extern size_t *ptr_ordered_FEM;
+
+/*! \brief User context for building MPF matrices and vectors in testMPF */
 typedef struct {
   /*! \brief Pointer on the data used to assembly Mat/vec with computeDenseBlockRHS() */
   void *dataVec;
@@ -357,7 +216,7 @@ typedef struct {
   */
   int colDim;
 } contextTestFEMBEM;
-/* ============================================================================================================ */
+
 #ifndef HAVE_HMAT
 // If we compile without hmat, we need these phony declarations to use prepare_block()/advanced_compute_hmat() in computeDenseBlockFEMBEM()
 typedef struct hmat_block_info_struct {
@@ -367,26 +226,25 @@ typedef struct hmat_block_info_struct {
   char (*is_guaranteed_null_col)(const struct hmat_block_info_struct * block_info, int block_col_offset, int stratum);
 } hmat_block_info_t;
 struct hmat_block_compute_context_t {
-    void* user_data;
-    int row_start, row_count, col_start, col_count;
-    int stratum;
-    void* block;
+  void* user_data;
+  int row_start, row_count, col_start, col_count;
+  int stratum;
+  void* block;
 };
 typedef enum {
-    hmat_factorization_none = -1,
-    hmat_factorization_lu,
-    hmat_factorization_ldlt,
-    hmat_factorization_llt
+  hmat_factorization_none = -1,
+  hmat_factorization_lu,
+  hmat_factorization_ldlt,
+  hmat_factorization_llt
 } hmat_factorization_t;
 #endif
-/* ============================================================================================================ */
-int computeKernelBEM(double *coord1, double *coord2, int self, Z_type *kernel) ;
+
+int computeKernelBEM(double *coord1, double *coord2, int self, double _Complex *kernel) ;
 int produitClassiqueBEM(int ffar, void *sol, MPI_Comm myComm) ;
 int produitClassique(void **sol, MPI_Comm myComm) ;
 int computeRelativeError(void *sol, void *ref, double *eps) ;
 int computeVecNorm(void *ref, double *norm);
 int sommePara(double *buf) ;
-int computeCoordCylinderMain(int i, double *coord) ;
 int computeCoordCylinder(int i, double *coord) ;
 int computeCoordCylinderDetail(int i, double *coord) ;
 int initCylinder(int *argc, char ***argv) ;
@@ -396,6 +254,7 @@ int getMeshStepDetail(double *m) ;
 int initPipe(int *argc, char ***argv) ;
 int computeCoordPipe(int i, double *coord) ;
 int createFemMatrix() ;
+int freeFemMatrix() ;
 int produitClassiqueFEM(void *sol, MPI_Comm myComm) ;
 double* createPipe(void) ;
 int computeKernelFEM(int i, int j, Z_type *kernel) ;
@@ -405,7 +264,7 @@ int computeRhs(void) ;
 void computeDenseBlockData (int *LigInf, int *LigSup, int *ColInf, int *ColSup, int *___Inf, int *___Sup, int *iloc, int *jloc, int *tailleS, int *tailleL, void *bloc, void *context);
 double getTime() ;
 int displayArray(char *s, void *f, int m, int n) ;
-int testHMAT(void) ;
+int testHMAT(double * relative_error) ;
 int prepareTEST(void);
 double* createCylinder(void) ;
 void interactionKernel(void* data, int i, int j, void* result) ;
@@ -416,10 +275,10 @@ void prepare_hmat(int, int, int, int, int*, int*, int*, int*, void*, hmat_block_
 void advanced_compute_hmat(struct hmat_block_compute_context_t*);
 int init_hmat_interface() ;
 int readFlagsTestHMAT(int *argc, char ***argv) ;
-int testCHAMELEON(void);
-int testHLIBPRO(void);
+int testCHAMELEON(double * relative_error);
+int testHLIBPRO(double * relative_error);
 int initHLIBPRO(int* argc, char*** argv) ;
-int testHCHAMELEON(void);
+int testHCHAMELEON(double * relative_error);
 #ifdef HAVE_CHAMELEON
 extern cham_flttype_t chameleonType;
 extern int (*CHAMELEON_sytrf_Tile)( cham_uplo_t, CHAM_desc_t * );
@@ -434,7 +293,7 @@ int testFEMBEM_initChameleon( cham_flttype_t stype );
 int CHAMELEON_gemm_Tile( CHAM_desc_t *descA, CHAM_desc_t *descX, CHAM_desc_t *descY );
 
 int CHAMELEON_generate_matrix( cham_flttype_t flttype, int NB, int PQ[2],
-			       CHAM_desc_t     **descA );
+CHAM_desc_t     **descA );
 int CHAMELEON_destroy_matrix( CHAM_desc_t *descA );
 #endif
 
@@ -452,14 +311,14 @@ struct HCHAM_desc_s {
 };
 
 int         HCHAMELEON_generate_matrix( cham_flttype_t flttype, int NB, int PQ[2],
-					HCHAM_desc_t     *hdescA,
-					hmat_interface_t *hi );
+HCHAM_desc_t     *hdescA,
+hmat_interface_t *hi );
 int          HCHAMELEON_destroy_matrix( HCHAM_desc_t *hdescA );
 hmat_info_t  HCHAMELEON_getinfo( HCHAM_desc_t *hdesc );
 CHAM_desc_t *HCHAMELEON_uncompress_matrix( HCHAM_desc_t *hdesc );
 void         HCHAMELEON_lapmr( char forward, char trans, int nrhs,
-			       HCHAM_desc_t *hdesc,
-			       char *Bptr );
+                               HCHAM_desc_t *hdesc,
+                               char *Bptr );
 #endif
 
 #ifdef HAVE_HMAT
@@ -467,16 +326,15 @@ struct HMAT_desc_s;
 typedef struct HMAT_desc_s HMAT_desc_t;
 
 struct HMAT_desc_s {
-    hmat_matrix_t               *hmatrix;
-    hmat_clustering_algorithm_t *clustering;
-    hmat_cluster_tree_t         *cluster_tree;
-    hmat_admissibility_t        *admissibilityCondition;
-    contextTestFEMBEM           *myCtx;
+  hmat_matrix_t               *hmatrix;
+  hmat_clustering_algorithm_t *clustering;
+  hmat_cluster_tree_t         *cluster_tree;
+  hmat_admissibility_t        *admissibilityCondition;
+  contextTestFEMBEM           *myCtx;
 };
 
 HMAT_desc_t *HMAT_generate_matrix( hmat_interface_t *hi );
 void         HMAT_destroy_matrix ( hmat_interface_t *hi,
-				   HMAT_desc_t      *hdesc );
-#endif
-/* ============================================================================================================ */
+                                   HMAT_desc_t      *hdesc );
 #endif
+
diff --git a/include/util.h b/include/util.h
index 88e14b228183217c269493789b548b25c5d91484..6753c5c7335b31d77a60013b057f381fef35a62b 100644
--- a/include/util.h
+++ b/include/util.h
@@ -315,6 +315,7 @@ struct mpf_hmat_settings_t {
   hmat_compress_t compressionMethod;
   double epsilon;
   double acaEpsilon;
+  int max_leaf_size;
 };
 _EXTERN_TEST_FEMBEM_ struct mpf_hmat_settings_t mpf_hmat_settings;
 struct mpf_hmat_create_compression_args_t {
@@ -391,6 +392,11 @@ inline static void mpf_backtrace(){}
 /*! \brief Evaluates the expression x, if it is false then it displays an error message and aborts the code. */
 #define ASSERTA(x) { if (!(x)) { SETERRA(1, "Assert failure %s", #x); }}
 
+#if !defined (MIN)
+  /*! \brief Minimum value of two elements. */
+#define MIN(a,b)           ( ((a)<(b)) ? (a) : (b) )
+#endif
+
 #define MpfCalloc(a,b) calloc(a,b)
 #define MpfMalloc(a,b) malloc(a)
 #define MpfRealloc(a,b) realloc(a,b)
@@ -415,6 +421,7 @@ int Mpf_printf(MPI_Comm comm, const char *fmt, ...);
 int MpfArgGetDouble( int *Argc, char **argv, int rflag, const char *name, double *val ) ;
 int MpfArgHasName( int *Argc, char **argv, int rflag, const char *name );
 int MpfArgGetInt( int  *Argc, char **argv, int rflag, const char *name, int  *val );
+int MPI_AllreduceL(const void *sendbuf, void *recvbuf, ssize_t count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) ;
 int S_sortAndCompact(SnonZero *mat, int *size);
 int sortAndCompact(DnonZero *mat, int *size);
 int C_sortAndCompact(CnonZero *mat, int *size);
diff --git a/src/chameleon.c b/src/chameleon.c
index d1af80f5d396d60f8a211ac2290e05099980dc18..98c91fe9d8a5ee451358c0b66f8b57510862793b 100644
--- a/src/chameleon.c
+++ b/src/chameleon.c
@@ -91,7 +91,6 @@ int CHAMELEON_gemm_Tile( CHAM_desc_t *descA,
   return ierr;
 }
 
-/* ============================================================================================================ */
 // indices are 0 based, bounds included
 static int
 CHAMELEON_build_callback_FEMBEM( const CHAM_desc_t *desc,
@@ -120,7 +119,7 @@ CHAM_desc_t     **descA )
   int ierr;
   int N = nbPts;
   cham_uplo_t uplo = symMatSolver ? ChamLower : ChamUpperLower;
-  contextTestFEMBEM *myCtx = (contextTestFEMBEM*)MpfCalloc(1, sizeof(contextTestFEMBEM)) ; CHKPTRQ(myCtx);
+  contextTestFEMBEM *myCtx = MpfCalloc(1, sizeof(contextTestFEMBEM)) ; CHKPTRQ(myCtx);
   myCtx->colDim = N;
 
   /**
diff --git a/src/classic.c b/src/classic.c
index 262007eb4aefff6df1fa45fa68cbdbf8404cd6fa..86217fe922a0f07b2f34e266ba33f70cc6e0655a 100644
--- a/src/classic.c
+++ b/src/classic.c
@@ -31,34 +31,38 @@ int produitClassiqueBEM(int ffar, void *sol, MPI_Comm myComm) {
     Z_type *z_sol=sol, *z_rhs=rhs ;
     D_type *d_sol=sol, *d_rhs=rhs ;
 
-#ifdef _OPENMP
-    if (omp_get_thread_num()==0)
-#endif
-    {
-      ierr=Mpf_progressBar(myComm, jg+1, nbPtsBEM+1) ;
-    }
-
     ierr=computeCoordCylinder(j, &(coord_j[0])) ; CHKERRA(ierr);
     /* Boucle sur les lignes */
     for (i=myProc ; i<nbPtsBEM ; i+=nbProcs) {
       ierr=computeCoordCylinder(i, &(coord_i[0])) ; CHKERRA(ierr);
-      ierr=computeKernelBEM(&(coord_i[0]), &(coord_j[0]), i==j, &Aij) ;
-      if (complexALGO) {
-        for (k=0 ; k<nbRHS ; k++) {
+      {
+        ierr=computeKernelBEM(&(coord_i[0]), &(coord_j[0]), i==j, (double _Complex *)&Aij) ;
+        if (complexALGO) {
+          for (k=0 ; k<nbRHS ; k++) {
 #pragma omp atomic
-          z_sol[i+k*nbPts].r += Aij.r * z_rhs[j+k*nbPts].r - Aij.i * z_rhs[j+k*nbPts].i ;
+            z_sol[(size_t)k*nbPts+i].r += Aij.r * z_rhs[(size_t)k*nbPts+j].r - Aij.i * z_rhs[(size_t)k*nbPts+j].i ;
 #pragma omp atomic
-          z_sol[i+k*nbPts].i += Aij.r * z_rhs[j+k*nbPts].i + Aij.i * z_rhs[j+k*nbPts].r ;
-        }
-      } else {
-        for (k=0 ; k<nbRHS ; k++) {
+            z_sol[(size_t)k*nbPts+i].i += Aij.r * z_rhs[(size_t)k*nbPts+j].i + Aij.i * z_rhs[(size_t)k*nbPts+j].r ;
+          }
+        } else {
+          for (k=0 ; k<nbRHS ; k++) {
 #pragma omp atomic
-          d_sol[i+k*nbPts] += Aij.r * d_rhs[j+k*nbPts] ;
+            d_sol[(size_t)k*nbPts+i] += Aij.r * d_rhs[(size_t)k*nbPts+j] ;
+          }
         }
-      }
+      } /* if testDofNeighbour(.. */
     } /* for (i=0 */
-#pragma omp atomic
-    jg+=sparseRHS ;
+
+    int jg_bak;
+#pragma omp atomic capture
+    jg_bak = jg+=sparseRHS ;
+
+#ifdef _OPENMP
+    if (omp_get_thread_num()==0)
+#endif
+    {
+      ierr=Mpf_progressBar(myComm, jg_bak+1, nbPtsBEM+1) ;
+    }
   } /* for (j=0 */
   ierr=Mpf_progressBar(myComm, nbPtsBEM+1, nbPtsBEM+1) ;
 
@@ -73,7 +77,7 @@ int produitClassique(void **sol, MPI_Comm myComm) {
   int ierr;
   double temps_initial, temps_final, temps_cpu;
 
-  void *solCLA=(void*)MpfCalloc((size_t)nbPts*nbRHS, complexALGO ? sizeof(Z_type) : sizeof(D_type)) ; CHKPTRQ(solCLA) ;
+  void *solCLA = MpfCalloc((size_t)nbPts*nbRHS, complexALGO ? sizeof(Z_type) : sizeof(D_type)) ; CHKPTRQ(solCLA) ;
 
   if (0) // piece of code to test the relative norms of FEM and BEM parts in the output of the product
     // because if one of the 2 is negligible, then our test_FEMBEM doesn't realy test it...
@@ -82,11 +86,11 @@ int produitClassique(void **sol, MPI_Comm myComm) {
       // Compare the norms of the BEM and FEM parts
       ierr = produitClassiqueBEM(2, solCLA, myComm) ; CHKERRQ(ierr) ;
       ierr = computeVecNorm(solCLA, &normBEM); CHKERRQ(ierr);
-      memset(solCLA, 0, nbPts*nbRHS* (complexALGO ? sizeof(Z_type) : sizeof(D_type)));
+      memset(solCLA, 0, (size_t)nbPts*nbRHS* (complexALGO ? sizeof(Z_type) : sizeof(D_type)));
 
       ierr = produitClassiqueFEM(solCLA, myComm) ; CHKERRQ(ierr) ;
       ierr = computeVecNorm(solCLA, &normFEM); CHKERRQ(ierr);
-      memset(solCLA, 0, nbPts*nbRHS* (complexALGO ? sizeof(Z_type) : sizeof(D_type)));
+      memset(solCLA, 0, (size_t)nbPts*nbRHS* (complexALGO ? sizeof(Z_type) : sizeof(D_type)));
       Mpf_printf(myComm, "normBEM=%g normFEM=%g ratio FEM/BEM=%g\n", normBEM, normFEM, normFEM/normBEM);
     }
   temps_initial = getTime ();
diff --git a/src/compare.c b/src/compare.c
index 3fb66af03fc2881cef65287039d408ea0296f3f7..8e2a5f69bb0f8e2dc61a02b8f867a2d18d2127f3 100644
--- a/src/compare.c
+++ b/src/compare.c
@@ -18,7 +18,7 @@ frobenius_update( double *scale, double *sumsq, const double value )
 }
 
 int computeRelativeError(void *sol, void *ref, double *eps) {
-  int i, ierr, rank ;
+  int ierr, rank ;
   double normRef[2], normDelta[2];
   Z_type *z_sol=sol, *z_ref=ref ;
   D_type *d_sol=sol, *d_ref=ref ;
@@ -30,7 +30,7 @@ int computeRelativeError(void *sol, void *ref, double *eps) {
     normDelta[1] = 0.;
     normRef[0]   = 1.;
     normRef[1]   = 0.;
-    for (i=0 ; i<nbPts*nbRHS ; i++) {
+    for (size_t i=0 ; i<(size_t)nbPts*nbRHS ; i++) {
       if (complexALGO) {
         frobenius_update( normDelta, normDelta+1, (z_sol[i].r - z_ref[i].r) );
         frobenius_update( normDelta, normDelta+1, (z_sol[i].i - z_ref[i].i) );
@@ -55,7 +55,7 @@ int computeRelativeError(void *sol, void *ref, double *eps) {
 }
 
 int computeVecNorm(void *ref, double *norm) {
-  int i, ierr, rank ;
+  int ierr, rank ;
   double normRef ;
   Z_type *z_ref=ref ;
   D_type *d_ref=ref ;
@@ -64,7 +64,7 @@ int computeVecNorm(void *ref, double *norm) {
   *norm=0.;
   if (rank==0) {
     normRef=0. ;
-    for (i=0 ; i<nbPts*nbRHS ; i++) {
+    for (size_t i=0 ; i<(size_t)nbPts*nbRHS ; i++) {
       if (complexALGO) {
         normRef += z_ref[i].r*z_ref[i].r+z_ref[i].i*z_ref[i].i ;
       } else {
@@ -78,12 +78,10 @@ int computeVecNorm(void *ref, double *norm) {
 }
 
 int sommePara(double *buf) {
-  int ierr, n ;
-
-  n = (complexALGO?2:1)*nbPts*nbRHS ;
+  size_t n = (size_t)(complexALGO?2:1)*nbPts*nbRHS ;
   /* En parallele, somme les resultats des differents processeurs vers tous les proc */
   /* MPI_Allreduce(sendbuf, recvbuf, count, datatype, op, comm) */
-  ierr=MPI_Allreduce(MPI_IN_PLACE, buf, n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD) ; CHKERRQ(ierr) ;
+  int ierr=MPI_AllreduceL(MPI_IN_PLACE, buf, n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD) ; CHKERRQ(ierr) ;
 
   return 0 ;
 }
diff --git a/src/cylinder.c b/src/cylinder.c
index 1cd0bc3cfe0de457d7a96c422551387d973d1261..d50087c208dbac988bdc620c6e0ca097ab2bb741 100644
--- a/src/cylinder.c
+++ b/src/cylinder.c
@@ -26,7 +26,6 @@ static int nbPtsLoop=0;
 /*! \brief Number of points per loops in the BEM mesh (detail cylinder) */
 static int nbPtsLoopDetail=0;
 
-/* ============================================================================================================ */
 /*! \brief Computes coordinates of point number 'i'
 
   The points are computed on a cylinder defined by 'radius' and 'height'. The number of points is given by nbPtsMain.
@@ -36,7 +35,8 @@ static int nbPtsLoopDetail=0;
   \param coord the coordinates x y z of this point (output)
   \return 0 for success
 */
-int computeCoordCylinderMain(int i, double *coord) {
+inline static int computeCoordCylinderMain(int i, double *coord) {
+  ASSERTQ(testedModel == _bem);
   static double angleStep=0. ;
   static double zStep=0. ;
   double theta ;
@@ -73,6 +73,8 @@ int computeCoordCylinderMain(int i, double *coord) {
 */
 int computeCoordCylinder(int i, double *coord) {
   ASSERTQ(i<nbPtsBEM && i>=0);
+  if (testedModel == _fembem)
+    return computeCoordPipe(i, coord);
   if (useDetail && i >= nbPtsMain) {
     i = i - nbPtsMain;
     return computeCoordCylinderDetail(i, coord);
@@ -92,6 +94,7 @@ int computeCoordCylinder(int i, double *coord) {
   \return 0 for success
 */
 int computeCoordCylinderDetail(int i, double *coord) {
+  ASSERTQ(testedModel == _bem);
   static double angleStep = 0. ;
   static double zStep = 0. ;
   double theta ;
@@ -117,18 +120,18 @@ int computeCoordCylinderDetail(int i, double *coord) {
   /* Le detail est oriente selon y, perpendiculaire au cylindre principal, a mi-hauteur */
   theta = (double)i * angleStep ;
   coord[0] = radiusD * sin(theta) ;
-  coord[1] = zStep * (i+1) + radius ;
+  coord[1] = zStep * (double)(i+1) + radius ;
   coord[2] = radiusD * cos(theta) + 0.5 * height ;
 
   return 0 ;
 }
-/* ============================================================================================================ */
+
 double* createCylinder(void) {
-  double* result = (double*) MpfCalloc((size_t)3 * nbPts, sizeof(double));
+  double* result = MpfCalloc((size_t)3 * nbPts, sizeof(double));
   int i, ierr;
 #pragma omp parallel for private(ierr) schedule(dynamic,10000)
   for (i = 0; i < nbPts; i++) {
-    ierr=computeCoordCylinder(i, &(result[3*i]) ) ; CHKERRA(ierr) ;
+    ierr=computeCoordCylinder(i, &(result[(size_t)3*i]) ) ; CHKERRA(ierr) ;
   }
 
   int rank;
@@ -141,7 +144,12 @@ double* createCylinder(void) {
                   "byte_order=\"LittleEndian\" compressor=\"vtkZLibDataCompressor\">\n");
     fprintf(fvtk, "\t<UnstructuredGrid>\n");
 
-    int nbTria = 2*(nbPts-nbPtsLoop-nbPtsLoopDetail-1);
+    ASSERTA(nbPtsMain-nbPtsLoop-1 < INT_MAX/2);
+    int nbTria = 2*(nbPtsMain-nbPtsLoop-1);
+    if (nbPtsLoopDetail>0) {
+      ASSERTA(nbTria < INT_MAX-2*(nbPts-nbPtsLoopDetail-nbPtsMain-1));
+      nbTria += 2*(nbPts-nbPtsLoopDetail-nbPtsMain-1);
+    }
     fprintf(fvtk, "\t\t<Piece NumberOfPoints=\"%d\" NumberOfCells=\"%d\">\n", nbPts, nbTria);
     // Write the vertices
     fprintf(fvtk, "\t\t\t<Points>\n");
@@ -164,7 +172,7 @@ double* createCylinder(void) {
       // 2nd Triangle
       fprintf(fvtk, "%d %d %d\n", iVtk+1, iVtk+nbPtsLoop, iVtk+nbPtsLoop+1);
     }
-    /* Loop on the triangles (main cylinder) */
+    /* Loop on the triangles (detail) */
     for (int iVtk = nbPtsMain; iVtk+nbPtsLoopDetail+1 < nbPts; iVtk++) {
       // 1st triangle
       fprintf(fvtk, "%d %d %d\n", iVtk, iVtk+1, iVtk+nbPtsLoopDetail);
@@ -195,13 +203,83 @@ double* createCylinder(void) {
     fprintf(fvtk, "\t\t\t</Cells>\n");
     fprintf(fvtk, "\t\t</Piece>\n\t</UnstructuredGrid>\n</VTKFile>\n");
     fclose(fvtk);
+    Mpf_printf(MPI_COMM_WORLD, "Done writing 'mesh.vtu'\n") ;
+  }
+
+  if (rank==0 && writeMeshUNV) {
+#define sNODE_UNV_ID "  2411"
+#define sELT_UNV_ID  "  2412"
+#define sGRP_UNV_ID  "  2435"
+#define sUNV_SEPARATOR    "    -1"
+#define sNODE_UNV_DESCR   "         1         1        11"
+#define sELT_TRIA3_DESC   "        91         1         1         5         3"
+#define sELT_TETRA4_DESC  "       111         1         2         9         4"
+    Mpf_printf(MPI_COMM_WORLD, "Writing 'mesh.unv'... ") ;
+
+    FILE *funv=fopen("mesh.unv", "w");
+    /* -------- */
+    /* Vertices */
+    /* -------- */
+    fprintf(funv, "%s\n",sUNV_SEPARATOR ) ;
+    fprintf(funv, "%s\n", sNODE_UNV_ID );
+    double* coord=result;
+    for( i = 1; i<=nbPts; i++, coord+=3 ) {
+      fprintf(funv, "%10d%s\n", i, sNODE_UNV_DESCR );
+      fprintf(funv, "%25.16E%25.16E%25.16E\n", coord[0], coord[1], coord[2]) ;
+    }
+    fprintf(funv, "%s\n",sUNV_SEPARATOR ) ;
+    /* -------- */
+    /* Elements */
+    /* -------- */
+    fprintf(funv, "%s\n",sUNV_SEPARATOR ) ;
+    fprintf(funv, "%s\n", sELT_UNV_ID );
+    int nbTria=0;
+    /* Loop on the triangles (main cylinder) */
+    for (int iVtk = 1; iVtk+nbPtsLoop+1 <= nbPtsMain; iVtk++) {
+      // 1st triangle
+      fprintf(funv, "%10d%s\n", ++nbTria, sELT_TRIA3_DESC ) ;
+      fprintf(funv, "%10d%10d%10d\n", iVtk, iVtk+1, iVtk+nbPtsLoop) ;
+      // 2nd Triangle
+      fprintf(funv, "%10d%s\n", ++nbTria, sELT_TRIA3_DESC ) ;
+      fprintf(funv, "%10d%10d%10d\n", iVtk+1, iVtk+nbPtsLoop, iVtk+nbPtsLoop+1);
+    }
+    /* Loop on the triangles (detail) */
+    for (int iVtk = nbPtsMain+1; iVtk+nbPtsLoopDetail+1 <= nbPts; iVtk++) {
+      // 1st triangle
+      fprintf(funv, "%10d%s\n", ++nbTria, sELT_TRIA3_DESC ) ;
+      fprintf(funv, "%10d%10d%10d\n", iVtk, iVtk+1, iVtk+nbPtsLoopDetail) ;
+      // 2nd Triangle
+      fprintf(funv, "%10d%s\n", ++nbTria, sELT_TRIA3_DESC ) ;
+      fprintf(funv, "%10d%10d%10d\n", iVtk+1, iVtk+nbPtsLoopDetail, iVtk+nbPtsLoopDetail+1);
+    }
+    fprintf(funv, "%s\n",sUNV_SEPARATOR ) ;
+    /* ------ */
+    /* Groups */
+    /* ------ */
+    fprintf(funv, "%s\n",sUNV_SEPARATOR ) ;
+    fprintf(funv, "%s\n", sGRP_UNV_ID );
+    /* Group C_EXT for all the triangles */
+    fprintf(funv, "%10d%10d%10d%10d%10d%10d%10d%10d\n", 1, 0, 0, 0, 0, 0, 0, nbTria) ;
+    fprintf(funv, "C_EXT\n") ;
+    for (i=1 ; i<=nbTria ; i++) {
+      fprintf(funv, "%10d%10d%10d%10d", 8, i, 0, 0) ;
+      if ( (i%2==0) || (i==nbTria) )
+        fprintf(funv, "\n") ;
+    }
+    fprintf(funv, "%s\n",sUNV_SEPARATOR ) ;
+    fclose(funv);
+    Mpf_printf(MPI_COMM_WORLD, "Done.\n") ;
   }
 
+  // In case we pass a second time in this routine
+  writeMesh = 0;
+  writeMeshUNV = 0;
+
   return result;
 }
-/* ============================================================================================================ */
-/*! \brief Reads command line arguments defining the shape of the cylinder 
-  
+
+/*! \brief Reads command line arguments defining the shape of the cylinder
+
   \return 0 for success
 */
 int initCylinder(int *argc, char ***argv) {
@@ -226,7 +304,6 @@ int initCylinderDetail(int *argc, char ***argv) {
   double coord[3] ;
   double step=0;
 
-
   // ATTENTION : LES MESURES DU CYLINDRE DU DETAIL SONT EN NOMBRE DE LAMBDA
 
   if (MpfArgGetDouble(argc, *argv, 1, "-radius_detail", &radiusDetail)) {
@@ -255,7 +332,6 @@ int initCylinderDetail(int *argc, char ***argv) {
   return 0 ;
 }
 
-/* ============================================================================================================ */
 int getMeshStep(double *m) {
   if (meshStep==0.)
     SETERRQ(1, "meshStep not yet initialized. Come back later.") ;
@@ -271,4 +347,3 @@ int getMeshStepDetail(double *m) {
   *m = meshStepDetail ;
   return 0 ;
 }
-/* ============================================================================================================ */
diff --git a/src/hchameleon.c b/src/hchameleon.c
index 7d701637f71798a4cad5766a65bae51cd0427089..fad9c0f5367f427462def1ec0099f82dae7601ea 100644
--- a/src/hchameleon.c
+++ b/src/hchameleon.c
@@ -13,6 +13,7 @@
 /**
  * Detect if the tile is local or not
  */
+#if 0
 inline static int chameleon_desc_islocal( const CHAM_desc_t *A, int m, int n )
 {
 #if defined(CHAMELEON_USE_MPI)
@@ -22,6 +23,7 @@ inline static int chameleon_desc_islocal( const CHAM_desc_t *A, int m, int n )
   return 1;
 #endif /* defined(CHAMELEON_USE_MPI) */
 }
+#endif
 
 // indices are 0 based, bounds included
 /* Initialize a given tile with an h-matrix, given the row and cluster tree */
@@ -146,7 +148,7 @@ hmat_interface_t *hi )
      *   We exploit the fact that row and column distributions are
      *   identical to initialize only on rows.
      */
-  hdescA->perm = (int *)MpfCalloc( descA->lm, sizeof(int) );
+  hdescA->perm = MpfCalloc( descA->lm, sizeof(int) );
 
   hmat_cluster_tree_t *main_cluster = (hmat_cluster_tree_t *) hmat_create_cluster_tree_from_builder( points, 3, nbPts,
                                                                                 ct_builder );
@@ -155,7 +157,7 @@ hmat_interface_t *hi )
   memcpy( hdescA->perm,hmat_cluster_get_indices( main_cluster ),
           descA->lm * sizeof(int) );
 
-  hdescA->clusters = (hmat_cluster_tree_t **)MpfCalloc( descA->lmt + 1, sizeof(hmat_cluster_tree_t *) );
+  hdescA->clusters = MpfCalloc( descA->lmt + 1, sizeof(hmat_cluster_tree_t *) );
   clusters = hdescA->clusters;
   if ( descA->lmt == 1 ) {
     *clusters = main_cluster;
@@ -215,7 +217,6 @@ HCHAMELEON_destroy_matrix( HCHAM_desc_t *hdescA )
 
   hmat_delete_admissibility( hdescA->admissibilityCondition );
 
-
   hdescA->hi->dump_info( hdescA->hmatrix, "testHCHAMELEON_matrix");
 
   /* Destroys the HMAT structure */
@@ -249,7 +250,6 @@ HCHAMELEON_destroy_matrix( HCHAM_desc_t *hdescA )
   return 0;
 }
 
-/* ============================================================================================================ */
 /* Provides a flat matrix (array of values) given an hmatrix structure */
 typedef	int (*core_lacpy_fct_t)( cham_uplo_t, int, int, const void *, int, void *, int );
 
diff --git a/src/help.c b/src/help.c
index 73c55a93d032e9dbda0b37ebdc65516f86d51c1b..9ed0ba4ca2c7079b1dab03a1a55ca5ffff8dca97 100644
--- a/src/help.c
+++ b/src/help.c
@@ -7,7 +7,7 @@ int printHelp() {
 
   printf("\ntest_FEMBEM usage:\n-----------------\n\n"
          "     --bem/--fem/--fembem: Numerical model (bem is dense, fem is sparse). Default: bem.\n"
-         "     -nbpts: Number of unknowns of the global linear system. Default: 16000.\n"
+         "     -nbpts: Number of unknowns of the global linear system. Default: 16000 (can be in floating point notation, like 1.2e4).\n"
          "     -nbrhs: Number of right-hand sides of the global linear system. Default: 1.\n"
          "     -s/-d/-c/-z: Scalar type of the computation (real computation uses 1/R kernel in BEM, complex computation uses oscillatory exp(ikr)/r kernel). Default: z.\n"
          "     -v: Verbose mode, to display values of data (extremely verbose). Default is OFF.\n"
@@ -16,6 +16,7 @@ int printHelp() {
          "     -lambda: Wavelength (for oscillatory kernels). Mutually exclusive with -ptsperlambda. Default lambda is set with 10 points per wavelength.\n"
          "     -ptsperlambda: Points per wavelength on the main cylinder (for oscillatory kernels). Default: 10.\n"
          "     --write-mesh: Write a VTK file 'mesh.vtu' of the mesh. Default is OFF.\n"
+         "     --write-mesh-unv: Write a UNV file 'mesh.unv' of the mesh. Default is OFF.\n"
          "     --compute_norm: Compute and display matrix norms during Hmat tests. Default: OFF.\n"
          "\n"
          "     BEM tests use a surfacic cylinder with an optionnal cylinder detail, defined with:\n"
@@ -71,8 +72,10 @@ int printHelp() {
          "     --use_hmat_shuffle: Using Hmat shuffle clustering. Default: NO.\n"
          "     --hmat_divider_min: Value of the Hmat shuffle clustering minimum divider (minimum number of children per box). Default: 2.\n"
          "     --hmat_divider_max: Value of the Hmat shuffle clustering maximum divider (maximum number of children per box). Default: 4.\n"
+         "     --hodlr: Using HODLR. Default: NO.\n"
          "     --hmat-eps-assemb: Compression epsilon. Default: 1e-4.\n"
          "     --hmat-eps-recompr: Recompression epsilon. Default: 1e-4.\n"
+	 "     --hmat-leaf-size: Maximum leaf size. Default: -1 (set by hmat itself).\n"
          "     Compression algorithm: default is ACA+, it can be changed with:\n"
          "     --hmat-svd: Using SVD compression.\n"
          "     --hmat-aca-partial: Using ACA with partial pivoting.\n"
@@ -124,4 +127,3 @@ int printHelp() {
 
   return 0;
 }
-
diff --git a/src/hmat.c b/src/hmat.c
index 14be0ce11aae56e4553ff68e4eb51a9e7ab37eb2..b32335884d153de89af58ff2d6c82451442e0274 100644
--- a/src/hmat.c
+++ b/src/hmat.c
@@ -5,7 +5,7 @@ int init_hmat_interface() {
   if(mpf_hmat_settings.interface_initialized)
     return 0;
   for(int i = HMAT_SIMPLE_PRECISION; i <= HMAT_DOUBLE_COMPLEX; i++) {
-    mpf_hmat_settings.interfaces[i] = (hmat_interface_t *) MpfCalloc(1, sizeof(hmat_interface_t));
+    mpf_hmat_settings.interfaces[i] = MpfCalloc(1, sizeof(hmat_interface_t));
     switch(mpf_hmat_settings.engine) {
       case mpf_hmat_seq: hmat_init_default_interface(mpf_hmat_settings.interfaces[i], i); break;
 #ifdef HMAT_HAVE_TOYRT
@@ -70,6 +70,12 @@ int readFlagsTestHMAT(int *argc, char ***argv) {
     ASSERTQ(divider_max>=divider_min);
   }
 
+  /* Flag to use HODLR storage and algorithms */
+  if (MpfArgHasName(argc, *argv, 1, "--hodlr")) {
+    use_hodlr=1;
+    Mpf_printf(MPI_COMM_WORLD, "Using HODLR\n") ;
+  }
+
   return 0;
 }
 
@@ -77,7 +83,7 @@ int readFlagsTestHMAT(int *argc, char ***argv) {
 void interactionKernel(void* user_context, int i, int j, void* result) {
   int ierr;
   double coord_i[3], coord_j[3];
-  Z_type Aij=Mpf_z_zero, Bij=Mpf_z_zero ;
+  double _Complex Aij = 0, Bij = 0;
   contextTestFEMBEM *myCtx = (contextTestFEMBEM *)user_context;
 
   // Update indices (if we compute only a sub-part of the whole matrix)
@@ -93,19 +99,18 @@ void interactionKernel(void* user_context, int i, int j, void* result) {
 
   // FEM part
   if (testedModel == _fembem || testedModel == _fem) {
-    ierr=computeKernelFEM(i, j, &Bij); CHKERRA(ierr);
-    Aij.r += Bij.r;
-    Aij.i += Bij.i;
+    ierr=computeKernelFEM(i, j, (Z_type*)&Bij); CHKERRA(ierr);
+    Aij += Bij;
   }
 
   switch(stype) {
     case SIMPLE_PRECISION:
     case DOUBLE_PRECISION:
-      *((double*)result) = Aij.r;
+      *((double*)result) = Aij;
       break;
     case SIMPLE_COMPLEX:
     case DOUBLE_COMPLEX:
-      *((Z_type*)result) = Aij;
+      *((double _Complex*)result) = Aij;
       break;
     default:
       SETERRA(1, "Unknown stype");
@@ -134,11 +139,11 @@ typedef struct {
 
 static void free_block_data(void *v) {
   block_data_t* p = (block_data_t*) v;
-  if (p->col_points) MpfFree(p->col_points); p->col_points=NULL;
-  if (p->row_points) MpfFree(p->row_points); p->row_points=NULL;
-  if (p->dof_col_used) MpfFree(p->dof_col_used); p->dof_col_used=NULL;
-  if (p->dof_row_used) MpfFree(p->dof_row_used); p->dof_row_used=NULL;
-  if (p) MpfFree(p); p=NULL;
+  if (p->col_points) MpfFree(p->col_points);
+  if (p->row_points) MpfFree(p->row_points);
+  if (p->dof_col_used) MpfFree(p->dof_col_used);
+  if (p->dof_row_used) MpfFree(p->dof_row_used);
+  if (p) MpfFree(p);
 }
 
 /** Callback for hmat block info */
@@ -198,8 +203,8 @@ prepare_hmat(int row_start,
   info->release_user_data = free_block_data;
 
   /* Arrays to store non-null rows and columns */
-  bdata->dof_row_used=(int*)MpfCalloc(row_count, sizeof(int)); CHKPTRA(bdata->dof_row_used);
-  bdata->dof_col_used=(int*)MpfCalloc(col_count, sizeof(int)); CHKPTRA(bdata->dof_col_used);
+  bdata->dof_row_used = MpfCalloc(row_count, sizeof(int)); CHKPTRA(bdata->dof_row_used);
+  bdata->dof_col_used = MpfCalloc(col_count, sizeof(int)); CHKPTRA(bdata->dof_col_used);
 
   /* precompute and store the coordinates of the points (rows and columns)
      and tag BEM rows and columns as non-null
@@ -285,7 +290,7 @@ advanced_compute_hmat(struct hmat_block_compute_context_t *b) {
     for (i = 0; i < b->row_count; ++i, dValues+=step, row_point++) { // Idem with dof_row_used[]
       int row = bdata->row_hmat2client ? bdata->row_hmat2client[i + b->row_start + bdata->row_start] : i + b->row_start + bdata->row_start;
       int row_g = row + myCtx->rowOffset;
-      Z_type Aij=Mpf_z_zero, Bij=Mpf_z_zero ;
+      double _Complex Aij = 0, Bij = 0;
 
       // BEM part
       if (row_g < nbPtsBEM && col_g < nbPtsBEM) {
@@ -294,19 +299,18 @@ advanced_compute_hmat(struct hmat_block_compute_context_t *b) {
 
       // FEM part
       if (testedModel == _fembem || testedModel == _fem) {
-        ierr=computeKernelFEM(row_g, col_g, &Bij); CHKERRA(ierr);
-        Aij.r += Bij.r;
-        Aij.i += Bij.i;
+        ierr=computeKernelFEM(row_g, col_g, (Z_type*)&Bij); CHKERRA(ierr);
+        Aij += Bij;
       }
 
       switch(stype) {
         case SIMPLE_PRECISION:
         case DOUBLE_PRECISION:
-          *((double*)dValues) = Aij.r;
+          *((double*)dValues) = Aij;
           break;
         case SIMPLE_COMPLEX:
         case DOUBLE_COMPLEX:
-          *((Z_type*)dValues) = Aij;
+          *(double _Complex*)dValues = Aij;
           break;
         default:
           SETERRA(1, "Unknown stype");
@@ -379,13 +383,12 @@ void computeDenseBlockFEMBEM_Cprec(int *LigInf, int *LigSup, int *ColInf, int *C
                                    int *___Inf, int *___Sup, int *iloc, int *jloc,
                                    int *tailleS, int *tailleL, void *bloc, void *context) {
   int id, ic, il, inxBloc ;
-  char *zbloc ;
   ASSERTA(context);
 
-  zbloc=(char*)MpfCalloc((size_t)(*tailleS)*(*___Sup - *___Inf), sizeof(double)*(complexALGO?2:1)); CHKPTRA(zbloc) ;
+  void * zbloc = MpfCalloc((size_t)(*tailleS)*(*___Sup - *___Inf), sizeof(double)*(complexALGO?2:1)); CHKPTRA(zbloc) ;
 
   /* Appelle la routine en double precision */
-  computeDenseBlockFEMBEM( LigInf, LigSup, ColInf, ColSup ,
+  computeDenseBlockFEMBEM( LigInf, LigSup, ColInf, ColSup,
                            ___Inf, ___Sup, iloc, jloc,
                            tailleS, tailleL, zbloc, context) ;
 
@@ -404,9 +407,9 @@ void computeDenseBlockFEMBEM_Cprec(int *LigInf, int *LigSup, int *ColInf, int *C
     }
   }
 
-  if (zbloc) MpfFree(zbloc) ; zbloc=NULL ;
+  MpfFree(zbloc);
 }
-/* ================================================================================== */
+
 #ifdef HAVE_HMAT
 HMAT_desc_t *HMAT_generate_matrix( hmat_interface_t *hi ) {
 
@@ -432,6 +435,10 @@ HMAT_desc_t *HMAT_generate_matrix( hmat_interface_t *hi ) {
   }
   else {
     clustering = hmat_create_clustering_median();
+    if(mpf_hmat_settings.max_leaf_size > 0) {
+      Mpf_printf(MPI_COMM_WORLD, "HMat maximum leaf size: %d\n", mpf_hmat_settings.max_leaf_size);
+      clustering = hmat_create_clustering_max_dof(clustering, mpf_hmat_settings.max_leaf_size);
+    }
     hmat_set_clustering_divider(clustering, divider);
     ct_builder = hmat_create_cluster_tree_builder( clustering );
   }
@@ -445,10 +452,15 @@ HMAT_desc_t *HMAT_generate_matrix( hmat_interface_t *hi ) {
   Mpf_printf(MPI_COMM_WORLD, "ClusterTree node count = %d\n", hmat_tree_nodes_count(cluster_tree));
 
   /* Create the H-matrix with this cluster tree and an admissibility criteria */
-  hmat_admissibility_param_t admissibility_param;
-  hmat_init_admissibility_param(&admissibility_param);
-  admissibility_param.eta = 3.0;
-  hmat_admissibility_t *admissibilityCondition = hmat_create_admissibility(&admissibility_param);
+  hmat_admissibility_t *admissibilityCondition;
+  if (use_hodlr)
+    admissibilityCondition = hmat_create_admissibility_hodlr();
+  else {
+    hmat_admissibility_param_t admissibility_param;
+    hmat_init_admissibility_param(&admissibility_param);
+    admissibility_param.eta = 3.0;
+    admissibilityCondition = hmat_create_admissibility(&admissibility_param);
+  }
 
   //  hmat_admissibility_t *admissibilityCondition = hmat_create_admissibility_standard(3.0);
   hmat_matrix_t* hmatrix = hi->create_empty_hmatrix_admissibility(cluster_tree, cluster_tree, symMatSolver, admissibilityCondition);
@@ -481,7 +493,7 @@ HMAT_desc_t *HMAT_generate_matrix( hmat_interface_t *hi ) {
     ctx.prepare = prepare_hmat;
     ctx.advanced_compute = advanced_compute_hmat;
   }
-  contextTestFEMBEM *myCtx = (contextTestFEMBEM*)MpfCalloc(1, sizeof(contextTestFEMBEM)) ; CHKPTRA(myCtx);
+  contextTestFEMBEM *myCtx = MpfCalloc(1, sizeof(contextTestFEMBEM)) ; CHKPTRA(myCtx);
   myCtx->colDim = nbPts;
   ctx.user_context = myCtx;
   hi->assemble_generic(hmatrix, &ctx);
@@ -506,9 +518,8 @@ void HMAT_destroy_matrix( hmat_interface_t *hi,
     hmat_delete_admissibility(hdesc->admissibilityCondition);
     hmat_delete_cluster_tree(hdesc->cluster_tree);
     hmat_delete_clustering(hdesc->clustering);
-    if (hdesc->myCtx) MpfFree(hdesc->myCtx) ; hdesc->myCtx=NULL ;
+    if (hdesc->myCtx) MpfFree(hdesc->myCtx);
     MpfFree(hdesc);
   }
 }
 #endif
-/* ================================================================================== */
diff --git a/src/kernel.c b/src/kernel.c
index 4ad5e8e74bde55bee462ac1673a814d19be4adfb..5415bd0574e3beb369f1eb6a672b2dc7b1b690f6 100644
--- a/src/kernel.c
+++ b/src/kernel.c
@@ -1,6 +1,6 @@
 #include "main.h"
 extern double meshStep;
-int computeKernelBEM(double *coord1, double *coord2, int self, Z_type *kernel) {
+int computeKernelBEM(double *coord1, double *coord2, int self, double complex *kernel) {
   double r, v[3], k=0. ;
 
   v[0]=coord1[0]-coord2[0] ;
@@ -11,22 +11,18 @@ int computeKernelBEM(double *coord1, double *coord2, int self, Z_type *kernel) {
 
   if (complexALGO) {
     k=2*M_PI/lambda;
-
-    kernel->r=cos(k*r)/(M_PI*4.*r) ;
-    kernel->i=sin(k*r)/(M_PI*4.*r) ;
+    *kernel = cexp(I * k * r) / (4 * M_PI * r);
   } else
-    kernel->r=1./(M_PI*4.*r) ;
+    *kernel = 1 / (4 * M_PI * r);
 
   if (symMatContent==0) { // non symmetric case
     // s = prod scal nu.r avec nu=(1,1,1)/sqrt(3)
     double s = (v[0]+v[1]+v[2])/1.7320508075688772;
     double fs=1+atan(s)/M_PI;
-    kernel->r *= fs;
-    kernel->i *= fs;
+    *kernel *= fs;
   }
   if (symMatContent==2 && self) { // symmetric positive definite case : add 'n.max(A)' times Identity
-    kernel->r *= (double)nbPts;
+    *kernel *= nbPts;
   }
   return 0;
 }
-
diff --git a/src/main.c b/src/main.c
index a9a1de37d2912eb452f8bf2cd340d4b4dcdaa179..0d6efdafd3ee245aa59e8fc3808d09191588d615 100644
--- a/src/main.c
+++ b/src/main.c
@@ -2,18 +2,67 @@
 
 #include "main.h"
 
+// instantiation of globales defined in main.h
+enum numModel testedModel = _bem;
+int nbRHS = 1;
+int sparseRHS = -1;
+int newRHS = 1;
+int nbPts = 16000;
+int tileSize = 1024;
+int nbPtsMain;
+int nbPtsBEM;
+void *rhs = NULL;
+void *solFMMfar = NULL;
+void *solCLAfar = NULL;
+void *solFMMnear = NULL;
+void *solCLAnear = NULL;
+int nbPtsPerLeaf = 50;
+int nbAlgoRuns = 1;
+char *sTYPE_ELEMENT_ARRAY_test_fembem[] = {NULL};
+int coupled = 0;
+int partial_fembem = 0;
+int verboseTest = 0;
+int use_ntiles_rec = 0;
+int check_result = 0;
+int generate_traces = 0;
+int useDetail;
+int computeNorm;
+int symMatContent = 1;
+int symMatSolver = 1;
+int simplePrec = 0;
+int complexALGO = 1;
+ScalarType stype = DOUBLE_COMPLEX;
+double lambda = -1.;
+double ptsPerLambda = 10.;
+double ptsPerLambdaDetail = -1.;
+double radiusLeaf = 0.;
+int writeMesh = 0;
+int writeMeshUNV = 0;
+enum algo testedAlgo = _undefined;
+int use_simple_assembly = 0;
+int divider = 2;
+int hmat_residual = 0;
+int hmat_refine = 0;
+int use_hmat_shuffle = 0;
+int divider_min = 2;
+int divider_max = 4;
+int use_hodlr = 0;
+size_t nnz_FEM = 0;
+ZnonZero *z_matComp_FEM = NULL;
+size_t *ptr_ordered_FEM = NULL;
+
 /*! \brief Main routine
   \return 0 for success
 */
 int main(int argc, char **argv) {
   int ierr ;
-  double step ;
+  double step, max_error = DBL_MAX;
 
+  MpfArgGetDouble(&argc, argv, 1, "--max-error", &max_error);
   if (MpfArgHasName(&argc, argv, 1, "-h") || MpfArgHasName(&argc, argv, 1, "--help") ) {
     ierr=printHelp() ;
     return ierr;
   }
-
   /* --- Choix de l'algo a tester (_undefined par defaut) --- */
   if (MpfArgHasName(&argc, argv, 1, "-gemvhmat") > 0) {
     testedAlgo = _gemvHMAT ;
@@ -136,7 +185,11 @@ int main(int argc, char **argv) {
     verboseTest=1 ;
     Mpf_printf(MPI_COMM_WORLD, "Mode verbose\n") ;
   }
-  if (MpfArgGetInt(&argc, argv, 1, "-nbpts", &nbPts)) {
+  double nbf;
+  if (MpfArgGetDouble(&argc, argv, 1, "-nbpts", &nbf)) {
+    if (nbf<0 || nbf>INT_MAX)
+      SETERRQ(1, "Incorrect value '-nbpts %.0f' (must be between 0 and %d)", nbf, INT_MAX);
+    nbPts = (int)nbf;
     Mpf_printf(MPI_COMM_WORLD, "Reading nbPts = %d\n", nbPts) ;
   }
   if (MpfArgHasName(&argc, argv, 1, "-ntilesrec") > 0) {
@@ -161,7 +214,12 @@ int main(int argc, char **argv) {
   /* --- Write a VTK file 'mesh.vtu' of the mesh --- */
   if (MpfArgHasName(&argc, argv, 1, "--write-mesh")) {
     writeMesh=1;
-    Mpf_printf(MPI_COMM_WORLD, "Write a VTK file 'mesh.vtu' of the mesh\n") ;
+    Mpf_printf(MPI_COMM_WORLD, "Activate writing of VTK file 'mesh.vtu'\n") ;
+  }
+  /* --- Write a UNV file 'mesh.unv' of the mesh --- */
+  if (MpfArgHasName(&argc, argv, 1, "--write-mesh-unv")) {
+    writeMeshUNV=1;
+    Mpf_printf(MPI_COMM_WORLD, "Activate writing of UNV file 'mesh.unv'\n") ;
   }
 
   switch(testedModel){
@@ -192,7 +250,6 @@ int main(int argc, char **argv) {
     Mpf_printf(MPI_COMM_WORLD, "Using OLD right-hand sides.\n");
   }
 
-
   /* Added detail on the main cylinder */
   if (MpfArgHasName(&argc, argv, 1, "-with_detail")) {
     useDetail = 1;
@@ -292,36 +349,44 @@ int main(int argc, char **argv) {
 
   /* Prepare the mesh and RHS used for the FMM/HMAT tests */
   ierr = prepareTEST() ; CHKERRQ(ierr) ;
-
+  double relative_error;
   /* Run the test */
   switch(testedAlgo) {
     case _gemvHMAT:
     case _solveHMAT:
     case _inverseHMAT:
-      ierr = testHMAT() ; CHKERRQ(ierr) ;
+      ierr = testHMAT(&relative_error) ; CHKERRQ(ierr) ;
       break;
     case _gemvCHAMELEON:
     case _solveCHAMELEON:
-      ierr = testCHAMELEON(); CHKERRQ(ierr);
+      ierr = testCHAMELEON(&relative_error); CHKERRQ(ierr);
       break;
     case _gemvHLIBPRO:
     case _solveHLIBPRO:
-      ierr = testHLIBPRO(); CHKERRQ(ierr);
+      ierr = testHLIBPRO(&relative_error); CHKERRQ(ierr);
       break;
     case _gemvHCHAMELEON:
     case _gemmHCHAMELEON:
     case _solveHCHAMELEON:
-      ierr = testHCHAMELEON(); CHKERRQ(ierr);
+      ierr = testHCHAMELEON(&relative_error); CHKERRQ(ierr);
       break;
     default:
       SETERRQ(1, "Unknown algorithm=%d", testedAlgo);
       break;
   }
-  
+  int error_exit = 0;
+  if(relative_error > max_error) {
+    error_exit = 1;
+    Mpf_printf(MPI_COMM_WORLD, "Error is too high (%g > %g), exiting with error.", relative_error, max_error);
+  }
   Mpf_printf(MPI_COMM_WORLD, "\ntest_FEMBEM : end of computation\n");
-  
-  ierr=SCAB_Exit(&argc, argv) ;
-  if (rhs) MpfFree(rhs) ; rhs=NULL;
 
-  return ierr ;
+  if (rhs) {
+    MpfFree(rhs);
+    rhs = NULL;
+  }
+
+  ierr=SCAB_Exit(&argc, argv); CHKERRQ(ierr);
+
+  return error_exit;
 }
diff --git a/src/pipe.c b/src/pipe.c
index 8e39cdc5a213d6565cfdd048bbbd505fc3f66397..b96e6f7fded6a816ab724a2278f467fa6c752c0f 100644
--- a/src/pipe.c
+++ b/src/pipe.c
@@ -1,7 +1,5 @@
 #include "main.h"
 
-/* ============================================================================================================ */
-
 extern double radius, height, meshStep;
 
 /*! \brief Number of point on one loop around the pipe */
@@ -14,7 +12,6 @@ static int n_z=0;
 /* routines de tri dans MAT/IMPLS/SPARSE/matutils_sparse.c */
 extern int compare_ZnonZero_Line(const void *pt1, const void *pt2) ;
 
-/* ============================================================================================================ */
 /*! \brief Reads command line arguments defining the shape of the cylinder
 
   \return 0 for success
@@ -37,7 +34,6 @@ int initPipe(int *argc, char ***argv) {
   return 0;
 }
 
-/* ============================================================================================================ */
 /*! \brief Computes coordinates of point number 'i'
 
   The points are computed on a pipe defined by 'radius' and 'height'. The number of points is given by nbPts.
@@ -61,7 +57,7 @@ int computeCoordPipe(int i, double *coord) {
        angleStep*n_l = 2.pi
        zStep=heigth/(n_z.n_l)
     */
-    double dn_z=pow(height*height*nbPts/M_PI/radius/radius, 1./3.);
+    double dn_z=pow(height*height*(double)nbPts/M_PI/radius/radius, 1./3.);
     double dn_r=radius*dn_z/2./height;
     double dn_l=4.*M_PI*dn_r;
     n_l=(int)ceil(dn_l);
@@ -98,17 +94,20 @@ int computeCoordPipe(int i, double *coord) {
 
   return 0;
 }
-/* ============================================================================================================ */
+
 double* createPipe(void) {
-  double* result = (double*) MpfCalloc(3 * nbPts, sizeof(double)); CHKPTRA(result);
+  double* result = MpfCalloc((size_t)3 * nbPts, sizeof(double)); CHKPTRA(result);
   int i, ierr;
 #pragma omp parallel for private(ierr) schedule(dynamic,10000)
   for (i = 0; i < nbPts; i++) {
-    ierr=computeCoordPipe(i, &(result[3*i]) ) ; CHKERRA(ierr) ;
+    ierr=computeCoordPipe(i, &(result[(size_t)3*i]) ) ; CHKERRA(ierr) ;
   }
   return result;
 }
-/* ============================================================================================================ */
+
+/*! \brief  allocated size of z_matComp_FEM[] */
+static size_t nbEl_FEM = 0;
+
 static int addValue(int lig, int col, double val) {
 
   // With option partial_fembem, we nullify two thirds of the interaction between BEM and FEM unknowns:
@@ -131,132 +130,229 @@ static int addValue(int lig, int col, double val) {
   nnz_FEM++;
   return 0;
 }
-/* ============================================================================================================ */
+
+int freeFemMatrix() {
+  if (z_matComp_FEM) MpfFree(z_matComp_FEM);
+  z_matComp_FEM=NULL;
+  if (ptr_ordered_FEM) MpfFree(ptr_ordered_FEM);
+  ptr_ordered_FEM=NULL;
+  nnz_FEM=0;
+  nbEl_FEM=0;
+  return 0;
+}
+
 int createFemMatrix() {
   int nbHexa=0, ierr, rank, iVtk;
   int defTetra[5][4]= { {0,1,2,4}, {1,4,5,7}, {1,2,3,7}, {2,4,6,7}, {1,2,4,7}};
-  FILE *fvtk=NULL;
+  FILE *fvtk=NULL, *funv=NULL;
+  int nbTetra=0, nbTria=0;
 
   ierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  if (rank==0 && writeMesh) {
-    fvtk=fopen("mesh.vtu", "w");
-    // VTK Header
-    fprintf(fvtk, "<?xml version=\"1.0\"?>\n");
-    fprintf(fvtk, "<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" "
-                  "byte_order=\"LittleEndian\" compressor=\"vtkZLibDataCompressor\">\n");
-    fprintf(fvtk, "\t<UnstructuredGrid>\n");
-    fprintf(fvtk, "\t\t<Piece NumberOfPoints=\"%d\" NumberOfCells=\"%d\">\n",
-            n_r*n_z*n_l, 5*(n_r*n_l*n_z-n_l*n_z-(n_r-1)*(n_l+1)));
-    // Write the vertices
-    fprintf(fvtk, "\t\t\t<Points>\n");
-    fprintf(fvtk, "\t\t\t\t<DataArray type=\"Float32\""
-                  " NumberOfComponents=\"3\" format=\"ascii\">\n");
-    double coord[3];
-    for (iVtk = 0; iVtk < n_r*n_z*n_l; iVtk++) {
-      ierr = computeCoordPipe(iVtk, coord); CHKERRQ(ierr);
-      fprintf(fvtk, "%g %g %g\n", (float) (coord[0]), (float)(coord[1]), (float)(coord[2]));
+  if (rank==0) {
+    if (writeMesh) {
+      fvtk=fopen("mesh.vtu", "w");
+      // VTK Header
+      fprintf(fvtk, "<?xml version=\"1.0\"?>\n");
+      fprintf(fvtk, "<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" "
+                    "byte_order=\"LittleEndian\" compressor=\"vtkZLibDataCompressor\">\n");
+      fprintf(fvtk, "\t<UnstructuredGrid>\n");
+      fprintf(fvtk, "\t\t<Piece NumberOfPoints=\"%d\" NumberOfCells=\"%d\">\n",
+              n_r*n_z*n_l, 5*(n_r*n_l*n_z-n_l*n_z-(n_r-1)*(n_l+1)));
+      // Write the vertices
+      fprintf(fvtk, "\t\t\t<Points>\n");
+      fprintf(fvtk, "\t\t\t\t<DataArray type=\"Float32\""
+                    " NumberOfComponents=\"3\" format=\"ascii\">\n");
+      double coord[3];
+      for (iVtk = 0; iVtk < n_r*n_z*n_l; iVtk++) {
+        ierr = computeCoordPipe(iVtk, coord); CHKERRQ(ierr);
+        fprintf(fvtk, "%g %g %g\n", (float) (coord[0]), (float)(coord[1]), (float)(coord[2]));
+      }
+      fprintf(fvtk, "\t\t\t\t</DataArray>\n");
+      fprintf(fvtk, "\t\t\t</Points>\n");
+      // VTK Elements
+      fprintf(fvtk, "\t\t\t<Cells>\n");
+      fprintf(fvtk, "\t\t\t\t<DataArray "
+                    "type=\"Int32\" Name=\"connectivity\" format=\"ascii\">\n");
+    }
+    if (writeMeshUNV) {
+#define sNODE_UNV_ID "  2411"
+#define sELT_UNV_ID  "  2412"
+#define sGRP_UNV_ID  "  2435"
+#define sUNV_SEPARATOR    "    -1"
+#define sNODE_UNV_DESCR   "         1         1        11"
+#define sELT_TRIA3_DESC   "        91         1         1         5         3"
+#define sELT_TETRA4_DESC  "       111         1         2         9         4"
+
+      Mpf_printf(MPI_COMM_WORLD, "Writing 'mesh.unv'... ") ;
+      funv=fopen("mesh.unv", "w");
+      /* -------- */
+      /* Vertices */
+      /* -------- */
+      fprintf(funv, "%s\n",sUNV_SEPARATOR ) ;
+      fprintf(funv, "%s\n", sNODE_UNV_ID );
+      double coord[3];
+      for(int i = 1; i<=n_r*n_z*n_l; i++ ) {
+        ierr = computeCoordPipe(i-1, coord); CHKERRQ(ierr);
+        fprintf(funv, "%10d%s\n", i, sNODE_UNV_DESCR );
+        fprintf(funv, "%25.16E%25.16E%25.16E\n", coord[0], coord[1], coord[2]) ;
+      }
+      fprintf(funv, "%s\n",sUNV_SEPARATOR ) ;
+      /* -------- */
+      /* Elements */
+      /* -------- */
+      fprintf(funv, "%s\n",sUNV_SEPARATOR ) ;
+      fprintf(funv, "%s\n", sELT_UNV_ID );
     }
-    fprintf(fvtk, "\t\t\t\t</DataArray>\n");
-    fprintf(fvtk, "\t\t\t</Points>\n");
-    // VTK Elements
-    fprintf(fvtk, "\t\t\t<Cells>\n");
-    fprintf(fvtk, "\t\t\t\t<DataArray "
-                  "type=\"Int32\" Name=\"connectivity\" format=\"ascii\">\n");
-  }
 
-  /* Loop on the hexaedre: We loop on the bottom-left-exterior corner of the hexaedre and check
+    /* Loop on the hexaedre: We loop on the bottom-left-exterior corner of the hexaedre and check
      that the 7 other vertices are valid.
      */
-  int i_hexa;
-  for (i_hexa=0 ; i_hexa<nbPts ; i_hexa++) {
-    div_t res=div(i_hexa, n_z*n_l);
+    int i_hexa;
+    for (i_hexa=0 ; i_hexa<nbPts ; i_hexa++) {
+      div_t res=div(i_hexa, n_z*n_l);
 
-    int i_helix = res.quot;
-    // the hexaedre is based on helices 'i_helix' and 'i_helix+1', so both must be in [0, n_r-1]
-    if (i_helix+1 > n_r-1) continue;
+      int i_helix = res.quot;
+      // the hexaedre is based on helices 'i_helix' and 'i_helix+1', so both must be in [0, n_r-1]
+      if (i_helix+1 > n_r-1) continue;
 
-    int i_face = res.rem;
-    // the outer face is based on points i_face, i_face+1, i_face+n_l, i_face+n_l+1, they must be in [0, n_z.n_l-1]
-    if (i_face+n_l+1 > n_z*n_l-1) continue;
+      int i_face = res.rem;
+      // the outer face is based on points i_face, i_face+1, i_face+n_l, i_face+n_l+1, they must be in [0, n_z.n_l-1]
+      if (i_face+n_l+1 > n_z*n_l-1) continue;
 
-    nbHexa++;
+      nbHexa++;
 
-    /* Vertices of the current hexaedre, in [0..nbPts[
-       Some of these value might exceed nbPts, they correspond to points that wont carry any unknown
-       and will be skipped during the loop on the unknowns below.
+      /* Vertices of the current hexaedre, in [0..nbPts[
     */
-    int i_vert[8];
-    i_vert[0] = i_helix * n_z*n_l + i_face; // actually it is i_hexa
-    i_vert[1] = i_helix * n_z*n_l + i_face+1;
-    i_vert[2] = i_helix * n_z*n_l + i_face+n_l;
-    i_vert[3] = i_helix * n_z*n_l + i_face+n_l+1;
-    i_vert[4] = (i_helix+1) * n_z*n_l + i_face;
-    i_vert[5] = (i_helix+1) * n_z*n_l + i_face+1;
-    i_vert[6] = (i_helix+1) * n_z*n_l + i_face+n_l;
-    i_vert[7] = (i_helix+1) * n_z*n_l + i_face+n_l+1;
-
-    /* Loop on the tetraedron inside this hexaedra (5 tetra per hexa, defTetra[i] gives the vertices
+      int i_vert[8];
+      i_vert[0] = i_helix * n_z*n_l + i_face; // actually it is i_hexa
+      i_vert[1] = i_helix * n_z*n_l + i_face+1;
+      i_vert[2] = i_helix * n_z*n_l + i_face+n_l;
+      i_vert[3] = i_helix * n_z*n_l + i_face+n_l+1;
+      i_vert[4] = (i_helix+1) * n_z*n_l + i_face;
+      i_vert[5] = (i_helix+1) * n_z*n_l + i_face+1;
+      i_vert[6] = (i_helix+1) * n_z*n_l + i_face+n_l;
+      i_vert[7] = (i_helix+1) * n_z*n_l + i_face+n_l+1;
+
+      /* Loop on the tetraedron inside this hexaedra (5 tetra per hexa, defTetra[i] gives the vertices
        composing each of them) */
-    int i_tetra;
-    for (i_tetra=0 ; i_tetra<5 ; i_tetra++) {
-      int i_vert_t[4]; // vertices of the current tetraedron, in [0..nbPts[
-      int i, j;
-      for (i=0 ; i<4 ; i++)
-        i_vert_t[i] = i_vert[defTetra[i_tetra][i]];
-
-      // Loop on the unkowns (=vertices) in this tetrahedron
-      // Each unknown interacts with the other unknown in the same tetrahedron
-      // Value of the interaction is 0.1 for self, 0.05 otherwise
-      for (i=0 ; i<4 ; i++)
-        if (i_vert_t[i] >= 0 && i_vert_t[i] < nbPts)
-          for (j=0 ; j<4 ; j++)
-            if (i_vert_t[j] >= 0 && i_vert_t[j] < nbPts)
-              addValue(i_vert_t[i], i_vert_t[j], i==j ? 0.1 : 0.05);
-
-      if (rank==0 && writeMesh)
-        fprintf(fvtk, "%d %d %d %d\n", i_vert_t[0], i_vert_t[1], i_vert_t[2], i_vert_t[3]);
-
+      int i_tetra;
+      for (i_tetra=0 ; i_tetra<5 ; i_tetra++) {
+        int i_vert_t[4]; // vertices of the current tetraedron, in [0..nbPts[
+        int i, j;
+        for (i=0 ; i<4 ; i++)
+          i_vert_t[i] = i_vert[defTetra[i_tetra][i]];
+
+        // Loop on the unkowns (=vertices) in this tetrahedron
+        // Each unknown interacts with the other unknown in the same tetrahedron
+        // Value of the interaction is 0.1 for self, 0.05 otherwise
+        for (i=0 ; i<4 ; i++)
+          if (i_vert_t[i] >= 0 && i_vert_t[i] < nbPts)
+            for (j=0 ; j<4 ; j++)
+              if (i_vert_t[j] >= 0 && i_vert_t[j] < nbPts)
+                addValue(i_vert_t[i], i_vert_t[j], i==j ? 0.1 : 0.05);
+
+        if (writeMesh)
+          fprintf(fvtk, "%d %d %d %d\n", i_vert_t[0], i_vert_t[1], i_vert_t[2], i_vert_t[3]);
+        if (writeMeshUNV) {
+          fprintf(funv, "%10d%s\n", ++nbTetra, sELT_TETRA4_DESC ) ;
+          fprintf(funv, "%10d%10d%10d%10d\n", i_vert_t[0]+1, i_vert_t[1]+1, i_vert_t[2]+1, i_vert_t[3]+1);
+        }
+      }
     }
-  }
 
-  if (rank==0 && writeMesh) {
-    // Check that the number of hexas created matches the one computed at the beginning and written in the vtk header
-    ASSERTQ(nbHexa == n_r*n_l*n_z-n_l*n_z-(n_r-1)*(n_l+1));
-    fprintf(fvtk, "\t\t\t\t</DataArray>\n");
-    /* Ecriture des offsets */
-    fprintf(fvtk, "\t\t\t\t<DataArray "
-                  "type=\"Int32\" Name=\"offsets\" format=\"ascii\">\n");
-    for (iVtk = 0; iVtk < nbHexa*5; iVtk++) {
-      fprintf(fvtk, "%d ", iVtk*4+4);
-      if (iVtk % 10 == 9)
-        fprintf(fvtk, "\n");
+    if (writeMesh) {
+      // Check that the number of hexas created matches the one computed at the beginning and written in the vtk header
+      ASSERTQ(nbHexa == n_r*n_l*n_z-n_l*n_z-(n_r-1)*(n_l+1));
+      fprintf(fvtk, "\t\t\t\t</DataArray>\n");
+      /* Ecriture des offsets */
+      fprintf(fvtk, "\t\t\t\t<DataArray "
+                    "type=\"Int32\" Name=\"offsets\" format=\"ascii\">\n");
+      for (iVtk = 0; iVtk < nbHexa*5; iVtk++) {
+        fprintf(fvtk, "%d ", iVtk*4+4);
+        if (iVtk % 10 == 9)
+          fprintf(fvtk, "\n");
+      }
+      fprintf(fvtk, "\t\t\t\t</DataArray>\n");
+
+      /* Ecriture des types d'elements */
+      fprintf(fvtk, "\t\t\t\t<DataArray "
+                    "type=\"UInt8\" Name=\"types\" format=\"ascii\">\n");
+      for (iVtk = 0; iVtk < nbHexa*5; iVtk++) {
+        fprintf(fvtk, "%d ", 10); // For VTK_TETRA
+        if (iVtk % 10 == 9)
+          fprintf(fvtk, "\n");
+      }
+      fprintf(fvtk, "\t\t\t\t</DataArray>\n");
+      fprintf(fvtk, "\t\t\t</Cells>\n");
+      fprintf(fvtk, "\t\t</Piece>\n\t</UnstructuredGrid>\n</VTKFile>\n");
+      fclose(fvtk);
+      Mpf_printf(MPI_COMM_WORLD, "Done writing 'mesh.vtu'\n") ;
     }
-    fprintf(fvtk, "\t\t\t\t</DataArray>\n");
-
-    /* Ecriture des types d'elements */
-    fprintf(fvtk, "\t\t\t\t<DataArray "
-                  "type=\"UInt8\" Name=\"types\" format=\"ascii\">\n");
-    for (iVtk = 0; iVtk < nbHexa*5; iVtk++) {
-      fprintf(fvtk, "%d ", 10); // For VTK_TETRA
-      if (iVtk % 10 == 9)
-        fprintf(fvtk, "\n");
+    if (writeMeshUNV) {
+      // Add the triangles on the outer helix
+      for (int i_face=0 ; i_face<n_z*n_l ; i_face++) {
+        // the outer face is based on points i_face, i_face+1, i_face+n_l, i_face+n_l+1, they must be in [0, n_z.n_l-1]
+        if (i_face+n_l+1 > n_z*n_l-1) continue;
+
+        /* Vertices of the current outer triangles in [0..n_z*n_l[
+    */
+        int i_vert[4];
+        i_vert[0] = i_face;
+        i_vert[1] = i_face+1;
+        i_vert[2] = i_face+n_l;
+        i_vert[3] = i_face+n_l+1;
+        // Add the 2 triangles using vertices 0,1,2 and 2,1,3 (inward normal)
+        fprintf(funv, "%10d%s\n", ++nbTria + nbTetra, sELT_TRIA3_DESC ) ;
+        fprintf(funv, "%10d%10d%10d\n", i_vert[0]+1, i_vert[1]+1, i_vert[2]+1);
+        fprintf(funv, "%10d%s\n", ++nbTria + nbTetra, sELT_TRIA3_DESC ) ;
+        fprintf(funv, "%10d%10d%10d\n", i_vert[2]+1, i_vert[1]+1, i_vert[3]+1);
+      }
+      fprintf(funv, "%s\n",sUNV_SEPARATOR ) ;
+      /* ------ */
+      /* Groups */
+      /* ------ */
+      fprintf(funv, "%s\n",sUNV_SEPARATOR ) ;
+      fprintf(funv, "%s\n", sGRP_UNV_ID );
+      /* Group V_D1 for all the tetrahedra */
+      fprintf(funv, "%10d%10d%10d%10d%10d%10d%10d%10d\n", 1, 0, 0, 0, 0, 0, 0, nbTetra) ;
+      fprintf(funv, "V_D1\n") ;
+      for (int i=1 ; i<=nbTetra ; i++) {
+        fprintf(funv, "%10d%10d%10d%10d", 8, i, 0, 0) ;
+        if ( (i%2==0) || (i==nbTetra) )
+          fprintf(funv, "\n") ;
+      }
+      /* Group I_D1_EXT for all the triangles */
+      fprintf(funv, "%10d%10d%10d%10d%10d%10d%10d%10d\n", 1, 0, 0, 0, 0, 0, 0, nbTria) ;
+      fprintf(funv, "I_D1_EXT\n") ;
+      for (int i=1 ; i<=nbTria ; i++) {
+        fprintf(funv, "%10d%10d%10d%10d", 8, i+nbTetra, 0, 0) ;
+        if ( (i%2==0) || (i==nbTria) )
+          fprintf(funv, "\n") ;
+      }
+      fprintf(funv, "%s\n",sUNV_SEPARATOR ) ;
+      fclose(funv);
+      Mpf_printf(MPI_COMM_WORLD, "Done.\n") ;
     }
-    fprintf(fvtk, "\t\t\t\t</DataArray>\n");
-    fprintf(fvtk, "\t\t\t</Cells>\n");
-    fprintf(fvtk, "\t\t</Piece>\n\t</UnstructuredGrid>\n</VTKFile>\n");
-    fclose(fvtk);
+
+    Mpf_printf(MPI_COMM_WORLD, "Number of hexaedres = %d\n", nbHexa);
+    Mpf_printf(MPI_COMM_WORLD, "Number of NNZ before sort = %zu\n", nnz_FEM);
+    // Passer en multithread ?
+    ierr=Z_sortAndCompactL(z_matComp_FEM, &nnz_FEM) ; CHKERRQ(ierr) ;
+    z_matComp_FEM=(ZnonZero*)MpfRealloc(z_matComp_FEM,nnz_FEM*sizeof(ZnonZero)); CHKPTRQ(z_matComp_FEM) ;
+    Mpf_printf(MPI_COMM_WORLD, "Number of NNZ after sort = %zu\n", nnz_FEM);
   }
 
-  Mpf_printf(MPI_COMM_WORLD, "Number of hexaedres = %d\n", nbHexa);
-  Mpf_printf(MPI_COMM_WORLD, "Number of NNZ before sort = %zu\n", nnz_FEM);
-  // Passer en multithread ?
-  ierr=Z_sortAndCompactL(z_matComp_FEM, &nnz_FEM) ; CHKERRQ(ierr) ;
-  z_matComp_FEM=(ZnonZero*)MpfRealloc(z_matComp_FEM,nnz_FEM*sizeof(ZnonZero)); CHKPTRQ(z_matComp_FEM) ;
-  Mpf_printf(MPI_COMM_WORLD, "Number of NNZ after sort = %zu\n", nnz_FEM);
+  // we broadcast the mesh in MPI : z_matComp_FEM, nnz_FEM.
+  ierr = MPI_Bcast(&nnz_FEM, sizeof(size_t), MPI_BYTE, 0, MPI_COMM_WORLD); CHKERRQ(ierr);
+  if (rank) {
+    z_matComp_FEM=(ZnonZero*)MpfRealloc(z_matComp_FEM,nnz_FEM*sizeof(ZnonZero)); CHKPTRQ(z_matComp_FEM) ;
+  }
+  for (size_t off=0 ; off<nnz_FEM*sizeof(ZnonZero) ; off += 1e9) {
+    ierr = MPI_Bcast((char*)z_matComp_FEM + off, MIN(nnz_FEM*sizeof(ZnonZero)-off, 1e9), MPI_BYTE, 0, MPI_COMM_WORLD); CHKERRQ(ierr);
+  }
 
   /* Store in ptr_ordered_FEM the beginning of each line in z_matComp */
-  ptr_ordered_FEM = (size_t*)MpfCalloc(nbPts+1,sizeof(size_t)); CHKPTRQ(ptr_ordered_FEM);
-
+  ptr_ordered_FEM = MpfCalloc(nbPts+1,sizeof(size_t)); CHKPTRQ(ptr_ordered_FEM);
   size_t i, k=0 ;
   for (i=0;i<nnz_FEM;i++)
     while  (k <= z_matComp_FEM[i].i ) {
@@ -268,12 +364,15 @@ int createFemMatrix() {
     k++ ;
   }
 
+  // In case we pass a second time in this routine
+  writeMesh = 0;
+  writeMeshUNV = 0;
+
   return 0;
 }
 
-/* ============================================================================================================ */
 int computeKernelFEM(int i, int j, Z_type *kernel) {
-  ZnonZero z_cle={i, j, {0.}};
+  ZnonZero z_cle={i, j, { 0.}};
   ASSERTA(z_matComp_FEM);
   // Line 'i' begins at position ptr_ordered_FEM[i] and has size ptr_ordered_FEM[i+1]-ptr_ordered_FEM[i]
   ZnonZero *z_res = (ZnonZero*)bsearch(&z_cle, z_matComp_FEM+ptr_ordered_FEM[i], ptr_ordered_FEM[i+1]-ptr_ordered_FEM[i], sizeof(ZnonZero), compare_ZnonZero_Line);
@@ -284,7 +383,6 @@ int computeKernelFEM(int i, int j, Z_type *kernel) {
   return 0;
 }
 
-/* ============================================================================================================ */
 int produitClassiqueFEM(void *sol, MPI_Comm myComm) {
   int ierr, nbProcs, myProc;
   size_t l, jg;
@@ -309,13 +407,6 @@ int produitClassiqueFEM(void *sol, MPI_Comm myComm) {
     size_t i, j, k;
     Z_type Aij;
 
-#ifdef _OPENMP
-    if (omp_get_thread_num()==0)
-#endif
-    {
-      ierr=Mpf_progressBar(myComm, (int)(jg/100)+1, (int)(nnz_FEM/100)+2) ;
-    }
-
     if (z_matComp_FEM[l].j % sparseRHS == 0) {
       Aij=z_matComp_FEM[l].v;
       i=z_matComp_FEM[l].i;
@@ -323,18 +414,25 @@ int produitClassiqueFEM(void *sol, MPI_Comm myComm) {
       for (k=0 ; k<nbRHS ; k++) {
         if (complexALGO) {
 #pragma omp atomic
-          z_sol[i+k*nbPts].r += Aij.r * z_rhs[j+k*nbPts].r - Aij.i * z_rhs[j+k*nbPts].i ;
+          z_sol[(size_t)i+k*nbPts].r += Aij.r * z_rhs[(size_t)j+k*nbPts].r - Aij.i * z_rhs[(size_t)j+k*nbPts].i ;
 #pragma omp atomic
-          z_sol[i+k*nbPts].i += Aij.r * z_rhs[j+k*nbPts].i + Aij.i * z_rhs[j+k*nbPts].r ;
+          z_sol[(size_t)i+k*nbPts].i += Aij.r * z_rhs[(size_t)j+k*nbPts].i + Aij.i * z_rhs[(size_t)j+k*nbPts].r ;
         } else {
 #pragma omp atomic
-          d_sol[i+k*nbPts] += Aij.r * d_rhs[j+k*nbPts] ;
+          d_sol[(size_t)i+k*nbPts] += Aij.r * d_rhs[(size_t)j+k*nbPts] ;
         }
       } /* for (k=0 ... */
     } /* if (z_matComp[l].j ... */
 
-#pragma omp atomic
-    jg++ ;
+    size_t jg_bak;
+#pragma omp atomic capture
+    jg_bak = ++jg ;
+#ifdef _OPENMP
+    if (omp_get_thread_num()==0)
+#endif
+    {
+      ierr=Mpf_progressBar(myComm, (int)(jg_bak/100)+1, (int)(nnz_FEM/100)+2) ;
+    }
   }
 
   ierr=Mpf_progressBar(myComm, (int)(nnz_FEM/100)+2, (int)(nnz_FEM/100)+2) ;
@@ -343,7 +441,7 @@ int produitClassiqueFEM(void *sol, MPI_Comm myComm) {
 
   return 0;
 }
-/* ============================================================================================================ */
+
 /*! \brief Computes a block for the matrix of FEM interactions.
 
   Row and column indices are 1-based and included.
@@ -456,7 +554,7 @@ void computeSparseBlockFEMBEM ( int *LigInf, int *LigSup, int *ColInf, int *ColS
       for (row=LigInf_g-1 ; row<=LigSup_g-1 && row<nbPtsBEM; row++) {
         ierr=computeCoordCylinder(row, &(coord_i[0])); CHKERRA(ierr);
         for (col=ColInf_g-1 ; col<=ColSup_g-1 && col<nbPtsBEM; col++) {
-          Z_type Aij=Mpf_z_zero;
+          double _Complex Aij = 0;
           ierr=computeCoordCylinder(col, &(coord_j[0])); CHKERRA(ierr);
           ierr=computeKernelBEM(&(coord_i[0]), &(coord_j[0]), row==col, &Aij); CHKERRA(ierr);
 
@@ -467,26 +565,25 @@ void computeSparseBlockFEMBEM ( int *LigInf, int *LigSup, int *ColInf, int *ColS
             case SIMPLE_PRECISION:
               s_matComp->i=i;
               s_matComp->j=j;
-              s_matComp->v=(S_type)(Aij.r);
+              s_matComp->v = Aij;
               s_matComp++;
               break;
             case DOUBLE_PRECISION:
               d_matComp->i=i;
               d_matComp->j=j;
-              d_matComp->v=(D_type)(Aij.r);
+              d_matComp->v = Aij;
               d_matComp++;
               break;
             case SIMPLE_COMPLEX:
               c_matComp->i=i;
               c_matComp->j=j;
-              c_matComp->v.r=(S_type)(Aij.r);
-              c_matComp->v.i=(S_type)(Aij.i);
+              c_matComp->cv = Aij;
               c_matComp++;
               break;
             case DOUBLE_COMPLEX:
               z_matComp->i=i;
               z_matComp->j=j;
-              z_matComp->v=Aij;
+              z_matComp->cv = Aij;
               z_matComp++;
               break;
             default:
diff --git a/src/prepareTEST.c b/src/prepareTEST.c
index b11142f271d2505cec81774349b22fc2239bc951..4c7950d72e2a3005d039fb5eb1b3f585fd4ca793 100644
--- a/src/prepareTEST.c
+++ b/src/prepareTEST.c
@@ -21,7 +21,7 @@ int prepareTEST(void) {
 
   /* Cree le second membre du produit mat-vec */
   Mpf_printf(MPI_COMM_WORLD, "Computing RHS...\n") ;
-  rhs=(void*)MpfCalloc((size_t)nbPts*nbRHS, complexALGO ? sizeof(Z_type) : sizeof(D_type)) ;  CHKPTRQ(rhs) ;
+  rhs = MpfCalloc((size_t)nbPts*nbRHS, complexALGO ? sizeof(Z_type) : sizeof(D_type)) ;  CHKPTRQ(rhs) ;
   ierr = computeRhs() ; CHKERRQ(ierr) ;
   if (verboseTest) {
     ierr = displayArray("rhs", rhs, nbPts, nbRHS) ; CHKERRQ(ierr) ;
@@ -43,9 +43,9 @@ int displayArray(char *s, void *f, int m, int n) {
   for (j=0 ; j<n ; j++)
     for (i=0 ; i<m ; i++)
       if (complexALGO)
-        Mpf_printf(MPI_COMM_WORLD, "%s[%d, %d]=(%e, %e)\n", s, i, j, z_f[i+j*m].r, z_f[i+j*m].i) ;
+        Mpf_printf(MPI_COMM_WORLD, "%s[%d, %d]=(%e, %e)\n", s, i, j, z_f[(size_t)j*m+i].r, z_f[(size_t)j*m+i].i) ;
       else
-        Mpf_printf(MPI_COMM_WORLD, "%s[%d, %d]=%e\n", s, i, j, d_f[i+j*m]) ;
+        Mpf_printf(MPI_COMM_WORLD, "%s[%d, %d]=%e\n", s, i, j, d_f[(size_t)j*m+i]) ;
   return 0 ;
 }
 
diff --git a/src/rhs.c b/src/rhs.c
index b2c2a60f3d5e29b3b02f3656aaa01a906714ce89..7b9716ad077a393be98d1146eb8f4762b2b25a4f 100644
--- a/src/rhs.c
+++ b/src/rhs.c
@@ -5,7 +5,7 @@ extern double height;
 int computeRhs(void) {
   double coord[3], OP[3] ;
   int i, j, ierr ;
-  Z_type *z_rhs=rhs ;
+  double complex *z_rhs=rhs ;
   D_type *d_rhs=rhs ;
   double small = simplePrec ? 1.e-20 : 1.e-100;
   double k=2.*M_PI/lambda;
@@ -19,21 +19,21 @@ int computeRhs(void) {
         if (newRHS) {
           if (complexALGO) {
             // OP = fake incident plane wave coming from direction rotating around the object
-            ierr = computeCoord( j*nbPts/nbRHS%nbPts , &(OP[0])) ; CHKERRQ(ierr);
+            ierr = computeCoord( (size_t)j*nbPts/nbRHS%nbPts , &(OP[0])) ; CHKERRQ(ierr);
             OP[2] -= height/2.; // to have plane waves coming from below too
-            double k_ps = k*(coord[0]*OP[0]+coord[1]*OP[1]+coord[2]*OP[2]);
-            z_rhs[i+j*nbPts].r=cos(k_ps);
-            z_rhs[i+j*nbPts].i=sin(k_ps);
+            z_rhs[(size_t)j * nbPts + i] = cexp(
+              I * k * (coord[0] * OP[0] + coord[1] * OP[1] + coord[2] * OP[2]));
           } else {
-            d_rhs[i+j*nbPts]=cos(coord[0]/(j+1)+coord[1]+sin(j)*coord[2])+sin(j) ;
+            d_rhs[(size_t)j*nbPts+i]=cos(coord[0]/(j+1)+coord[1]+sin(j)*coord[2])+sin(j) ;
           }
         } else {
           // Old way of computing RHS. They were actually always rank 2! Not suitable to really test mat-rhs...
           if (complexALGO) {
-            z_rhs[i+j*nbPts].r=cos(coord[0]+coord[1]+coord[2])+sin(j) ;
-            z_rhs[i+j*nbPts].i=coord[0]-coord[1]+coord[2]-cos(j) ;
+            z_rhs[(size_t)j * nbPts + i] =
+                cos(coord[0] + coord[1] + coord[2]) + sin(j) +
+                I * (coord[0] - coord[1] + coord[2] - cos(j));
           } else {
-            d_rhs[i+j*nbPts]=cos(coord[0]+coord[1]+coord[2])+sin(j) ;
+            d_rhs[(size_t)j*nbPts+i]=cos(coord[0]+coord[1]+coord[2])+sin(j) ;
           }
         }
       }
@@ -42,14 +42,13 @@ int computeRhs(void) {
       // instead of zero, otherwise run-time optimisations produce unrealistic timers with FMM)
       for (j=0 ; j<nbRHS ; j++) {
         if (complexALGO) {
-          z_rhs[i+j*nbPts].r=small ;
-          z_rhs[i+j*nbPts].i=small ;
+          z_rhs[(size_t)j * nbPts + i] = small + I * small;
         } else {
-          d_rhs[i+j*nbPts]=small ;
+          d_rhs[(size_t)j*nbPts+i]=small ;
         }
       }
     }
-  
+
   return 0 ;
 }
 
@@ -57,7 +56,8 @@ void computeDenseBlockData (int *LigInf, int *LigSup, int *ColInf, int *ColSup,
   int tailleL__  ;
   int iloc_____  ;
   int jloc_____  ;
-  int il,ic,inxBloc, inxGlob ;
+  int il,ic,inxBloc;
+  size_t inxGlob ;
   S_type *s_bloc=bloc ;
   D_type *d_bloc=bloc ;
   C_type *c_bloc=bloc ;
@@ -74,7 +74,7 @@ void computeDenseBlockData (int *LigInf, int *LigSup, int *ColInf, int *ColSup,
   for (ic = ColInf[0]-1; ic < ColSup[0] ; ic++) {
     for (il = LigInf[0]-1; il < LigSup[0] ; il++) {
       inxBloc = tailleL__*(ic-ColInf[0]+1)  + il-LigInf[0]+1  + iloc_____ + jloc_____ * tailleL__ ;
-      inxGlob = (ic + myCtx->colOffset) * nbPts + (il  + myCtx->rowOffset);
+      inxGlob = (size_t)(ic + myCtx->colOffset) * nbPts + (il  + myCtx->rowOffset);
 
       switch (stype) {
         case SIMPLE_PRECISION:
diff --git a/src/testCHAMELEON.c b/src/testCHAMELEON.c
index 6314823cbf17811a5494ada5ab3f42bbac67990e..cd0c8588543a048d74b94b29819b123d9ac675b9 100644
--- a/src/testCHAMELEON.c
+++ b/src/testCHAMELEON.c
@@ -1,25 +1,22 @@
 #include "main.h"
 
-/* ============================================================================================================ */
 /*! \brief Runs the test using CHAMELEON solver
  */
-int testCHAMELEON(void) {
+int testCHAMELEON(double * relative_error) {
 #ifdef HAVE_CHAMELEON
   int ierr;
-  double temps_initial, temps_final, temps_cpu, eps;
+  double temps_initial, temps_final, temps_cpu;
   size_t vectorSize = (size_t)nbPts * nbRHS * (complexALGO ? 2 : 1);
 
-  /* ===================================================================================== */
   /* Realise le produit matrice vecteur classique : A.rhs -> solCLA */
-  /* ===================================================================================== */
+
   Mpf_printf(MPI_COMM_WORLD, "\n**** Computing Classical product...\n") ;
   void *solCLA=NULL;
   ierr = produitClassique(&solCLA, MPI_COMM_WORLD) ; CHKERRQ(ierr) ;
 
-  /* ===================================================================================== */
   /* Cree la Matrice CHAMELEON */
-  /* ===================================================================================== */
-  testFEMBEM_initChameleon( stype );
+
+  testFEMBEM_initChameleon( (cham_flttype_t)stype );
 
   /* Get the default NB parameter */
   int NB ;
@@ -39,7 +36,7 @@ int testCHAMELEON(void) {
   temps_final = getTime ();
   temps_cpu = temps_final - temps_initial ;
   if ( verboseTest ) {
-    void *lapackA = (void*)MpfCalloc( nbPts * nbPts, sizeof(D_type) ) ; CHKPTRQ(lapackA);
+    void *lapackA = MpfCalloc( (size_t)nbPts * nbPts, sizeof(D_type) ) ; CHKPTRQ(lapackA);
 
     ierr = CHAMELEON_Desc2Lap( ChamUpperLower, descA, lapackA, nbPts ); CHKERRQ(ierr);
     ierr = displayArray("frA", lapackA, nbPts, nbPts) ; CHKERRQ(ierr) ;
@@ -48,15 +45,13 @@ int testCHAMELEON(void) {
   }
   Mpf_printf(MPI_COMM_WORLD,"<PERFTESTS> TpsCpuCHAMELEONCreation = %f \n", temps_cpu) ;
 
-  /* ===================================================================================== */
   /* Switch on the various CHAMELEON tests */
-  /* ===================================================================================== */
+
   switch(testedAlgo) {
     case _gemvCHAMELEON:
 
-      /* ===================================================================================== */
       /* Appelle le produit mat.vec : A.rhs -> solCHAM */
-      /* ===================================================================================== */
+
       Mpf_printf(MPI_COMM_WORLD, "\n**** Computing CHAMELEON product...\n") ;
       temps_initial = getTime ();
 
@@ -91,7 +86,7 @@ int testCHAMELEON(void) {
       temps_cpu = temps_final - temps_initial ;
       Mpf_printf(MPI_COMM_WORLD,"<PERFTESTS> TpsCpuGEMVCHAMELEON = %f \n", temps_cpu) ;
 
-      void *solCHAM = (void*)MpfCalloc( vectorSize, sizeof(D_type) ); CHKPTRQ(solCHAM);
+      void *solCHAM = MpfCalloc( vectorSize, sizeof(D_type) ); CHKPTRQ(solCHAM);
 
       /* convert back to lapack format */
       ierr = CHAMELEON_Desc2Lap( ChamUpperLower, descY, solCHAM, nbPts); CHKERRQ(ierr);
@@ -99,6 +94,7 @@ int testCHAMELEON(void) {
       ierr = CHAMELEON_Desc_Destroy (&descX); CHKERRQ(ierr);
       ierr = CHAMELEON_Desc_Destroy (&descY); CHKERRQ(ierr);
 
+      ASSERTQ(nbPts < INT_MAX/nbRHS);
       ierr = MPI_Bcast(solCHAM, nbRHS*nbPts, Mpf_mpitype[stype], 0, MPI_COMM_WORLD ); CHKERRQ(ierr);
 
       if (simplePrec) {
@@ -111,17 +107,18 @@ int testCHAMELEON(void) {
       }
       /* Compare the two vectors solCLA and solCHAM */
       Mpf_printf(MPI_COMM_WORLD, "\n**** Comparing results...\n") ;
-      ierr = computeRelativeError( solCHAM, solCLA, &eps); CHKERRQ(ierr);
+      ierr = computeRelativeError( solCHAM, solCLA, relative_error); CHKERRQ(ierr);
 
-      Mpf_printf(MPI_COMM_WORLD, "<PERFTESTS> Error = %.4e \n", eps);
-      if (solCHAM) MpfFree(solCHAM) ; solCHAM=NULL ;
+      Mpf_printf(MPI_COMM_WORLD, "<PERFTESTS> Error = %.4e \n", *relative_error);
+      if (solCHAM) MpfFree(solCHAM) ;
+      solCHAM=NULL ;
 
       break;
 
     case _solveCHAMELEON:
-      /* ===================================================================================== */
+
       /* Factorize the CHAMELEON Matrix */
-      /* ===================================================================================== */
+
       Mpf_printf(MPI_COMM_WORLD, "\n\n**** Factorizing CHAMELEON Mat...\n") ;
       if ( generate_traces ) {
         CHAMELEON_Enable( CHAMELEON_PROFILING_MODE );
@@ -138,7 +135,7 @@ int testCHAMELEON(void) {
         CHAMELEON_Disable( CHAMELEON_PROFILING_MODE );
       }
       if ( verboseTest ) {
-        void *lapackA = (void*)MpfCalloc( nbPts * nbPts, sizeof(D_type) ) ; CHKPTRQ(lapackA);
+        void *lapackA = MpfCalloc( (size_t)nbPts * nbPts, sizeof(D_type) ) ; CHKPTRQ(lapackA);
 
         ierr = CHAMELEON_Desc2Lap( ChamUpperLower, descA, lapackA, nbPts ); CHKERRQ(ierr);
         ierr = displayArray("chamA", lapackA, nbPts, nbPts) ; CHKERRQ(ierr) ;
@@ -171,6 +168,7 @@ int testCHAMELEON(void) {
       /* Convert back to lapack */
       ierr = CHAMELEON_Desc2Lap( ChamUpperLower, descB, solCLA, nbPts); CHKERRQ(ierr);
 
+      ASSERTQ(nbPts < INT_MAX/nbRHS);
       ierr = MPI_Bcast(solCLA, nbRHS*nbPts, Mpf_mpitype[stype], 0, MPI_COMM_WORLD ); CHKERRQ(ierr);
       if (simplePrec)
         simpleToDouble(solCLA, vectorSize);
@@ -183,19 +181,18 @@ int testCHAMELEON(void) {
 
       /* Compare the two vectors solCLA and rhs */
       Mpf_printf(MPI_COMM_WORLD, "\n**** Comparing results...\n") ;
-      ierr=computeRelativeError(solCLA, rhs, &eps) ; CHKERRQ(ierr) ;
-      Mpf_printf(MPI_COMM_WORLD, "<PERFTESTS> Error = %.4e \n", eps);
+      ierr=computeRelativeError(solCLA, rhs, relative_error) ; CHKERRQ(ierr) ;
+      Mpf_printf(MPI_COMM_WORLD, "<PERFTESTS> Error = %.4e \n", *relative_error);
       break ;
     default:
       break;
   }
 
-  /* ===================================================================================== */
   /* Finish execution */
-  /* ===================================================================================== */
 
   ierr = CHAMELEON_Desc_Destroy (&descA); CHKERRQ(ierr);
-  if (solCLA) MpfFree(solCLA) ; solCLA=NULL ;
+  if (solCLA) MpfFree(solCLA) ;
+  solCLA=NULL ;
 
 #else
   Mpf_printf(MPI_COMM_WORLD, "No CHAMELEON support.\n") ;
@@ -203,4 +200,3 @@ int testCHAMELEON(void) {
 
   return 0;
 }
-/* ============================================================================================================ */
diff --git a/src/testHCHAMELEON.c b/src/testHCHAMELEON.c
index cfc81db79394ff25edf47490574e233ab9fc2cb6..2baefafc1c14d0302423e0c6cc6e14b041700628 100644
--- a/src/testHCHAMELEON.c
+++ b/src/testHCHAMELEON.c
@@ -13,10 +13,10 @@ int get_worker_id( void )
 
 /*! \brief Runs the test of H-chameleon matrices : gemv, solve
  */
-int testHCHAMELEON(void) {
+int testHCHAMELEON(double * relative_error) {
 #if defined(HAVE_CHAMELEON) && defined(CHAMELEON_USE_HMAT)
   int ierr, rank, NB, np, dims[2] = { 0, 0 };
-  double temps_initial, temps_final, temps_cpu, eps;
+  double temps_initial, temps_final, temps_cpu;
   size_t vectorSize = (size_t)nbPts * nbRHS * (complexALGO ? 2 : 1);
 
   Mpf_printf(MPI_COMM_WORLD,"<PERFTESTS> HAcaAccuracy = %.4e \n", mpf_hmat_settings.acaEpsilon) ;
@@ -27,9 +27,8 @@ int testHCHAMELEON(void) {
 
   ierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank); CHKERRQ(ierr);
 
-  /* ===================================================================================== */
   /* Realise le produit matrice vecteur classique : A.rhs -> solCLA */
-  /* ===================================================================================== */
+
   void *solCLA = NULL;
   if ( check_result ) {
     Mpf_printf(MPI_COMM_WORLD, "\n**** Computing Classical product...\n") ;
@@ -37,10 +36,9 @@ int testHCHAMELEON(void) {
     ierr = produitClassique(&solCLA, MPI_COMM_WORLD) ; CHKERRQ(ierr) ;
   }
 
-  /* ===================================================================================== */
   /* Cree la H-Matrice */
-  /* ===================================================================================== */
-  testFEMBEM_initChameleon( stype );
+
+  testFEMBEM_initChameleon( (cham_flttype_t)stype );
   CHAMELEON_Enable( CHAMELEON_GENERIC ); // Summa is using lacpy that is not supported for hmat yet
 
   /* Get the default NB parameter */
@@ -86,7 +84,7 @@ int testHCHAMELEON(void) {
 
   if ( verboseTest ) {
     CHAM_desc_t *frA = HCHAMELEON_uncompress_matrix( &hdescA );
-    void *lapackA = (void*)MpfCalloc( nbPts * nbPts, sizeof(D_type) ) ; CHKPTRQ(lapackA);
+    void *lapackA = MpfCalloc( (size_t)nbPts * nbPts, sizeof(D_type) ) ; CHKPTRQ(lapackA);
 
     ierr = CHAMELEON_Desc2Lap( ChamUpperLower, frA, lapackA, nbPts ); CHKERRQ(ierr);
     ierr = displayArray("frA", lapackA, nbPts, nbPts) ; CHKERRQ(ierr) ;
@@ -111,14 +109,10 @@ int testHCHAMELEON(void) {
     Mpf_printf(MPI_COMM_WORLD, "Ratio: %g\n", (double)info.compressed_size/info.uncompressed_size);
   }
   /* Release the FEM matrix (can be quite large) */
-  if (z_matComp_FEM) MpfFree(z_matComp_FEM); z_matComp_FEM=NULL;
-  if (ptr_ordered_FEM) MpfFree(ptr_ordered_FEM); ptr_ordered_FEM=NULL;
-  nnz_FEM=0;
-  nbEl_FEM=0;
+  ierr=freeFemMatrix();CHKERRQ(ierr);
 
-  /* ===================================================================================== */
   /* Switch on the various H-CHAMELEON tests */
-  /* ===================================================================================== */
+
   switch(testedAlgo) {
     case _gemmHCHAMELEON:
 	{
@@ -154,15 +148,14 @@ int testHCHAMELEON(void) {
 
     case _gemvHCHAMELEON:
 
-      /* ===================================================================================== */
       /* Appelle le produit mat.vec : A.rhs -> solCHAM */
-      /* ===================================================================================== */
+
       Mpf_printf(MPI_COMM_WORLD, "\n**** Computing H-CHAMELEON product...\n") ;
       void *solCHAM = NULL;
 
       temps_initial = getTime ();
       if ( rank == 0 ) {
-        solCHAM = (void*)MpfCalloc( vectorSize, sizeof(D_type) ); CHKPTRQ(solCHAM);
+        solCHAM = MpfCalloc( vectorSize, sizeof(D_type) ); CHKPTRQ(solCHAM);
 
         if ( simplePrec ) {
           doubleToSimple( rhs, vectorSize );
@@ -227,22 +220,22 @@ int testHCHAMELEON(void) {
         ierr = displayArray("solCHAM", solCHAM, nbPts, nbRHS) ; CHKERRQ(ierr) ;
       }
 
-      /* ===================================================================================== */
       /* Compare les 2 produits matrice-vecteur */
-      /* ===================================================================================== */
+
       if ( check_result ) {
         Mpf_printf(MPI_COMM_WORLD, "\n**** Comparing results...\n") ;
-        ierr = computeRelativeError( solCHAM, solCLA, &eps); CHKERRQ(ierr);
-        Mpf_printf(MPI_COMM_WORLD, "<PERFTESTS> Error = %.4e \n", eps);
+        ierr = computeRelativeError( solCHAM, solCLA, relative_error); CHKERRQ(ierr);
+        Mpf_printf(MPI_COMM_WORLD, "<PERFTESTS> Error = %.4e \n", *relative_error);
       }
-      if ( solCHAM ) MpfFree( solCHAM ); solCHAM = NULL;
+      if ( solCHAM ) MpfFree( solCHAM );
+      solCHAM = NULL;
 
       break;
 
     case _solveHCHAMELEON:
-      /* ===================================================================================== */
+
       /* Factorize the H-matrix */
-      /* ===================================================================================== */
+
       Mpf_printf(MPI_COMM_WORLD, "\n\n**** Factorizing H-CHAMELEON...\n") ;
       if ( generate_traces ) {
         CHAMELEON_Enable( CHAMELEON_PROFILING_MODE );
@@ -262,7 +255,7 @@ int testHCHAMELEON(void) {
 
       if ( verboseTest ) {
         CHAM_desc_t *frA = HCHAMELEON_uncompress_matrix( &hdescA );
-        void *lapackA = (void*)MpfCalloc( nbPts * nbPts, sizeof(D_type) ) ; CHKPTRQ(lapackA);
+        void *lapackA = MpfCalloc( (size_t)nbPts * nbPts, sizeof(D_type) ) ; CHKPTRQ(lapackA);
 
         ierr = CHAMELEON_Desc2Lap( ChamUpperLower, frA, lapackA, nbPts ); CHKERRQ(ierr);
         ierr = displayArray("hChamA", lapackA, nbPts, nbPts) ; CHKERRQ(ierr) ;
@@ -287,9 +280,8 @@ int testHCHAMELEON(void) {
         Mpf_printf(MPI_COMM_WORLD, "Ratio: %g\n", (double)info.compressed_size/info.uncompressed_size);
       }
 
-      /* ===================================================================================== */
       /* Solve the system : A-1.solCLA -> solCLA */
-      /* ===================================================================================== */
+
       if ( check_result ) {
         Mpf_printf(MPI_COMM_WORLD, "\n**** Solve H-CHAMELEON\n") ;
 
@@ -326,6 +318,7 @@ int testHCHAMELEON(void) {
         ierr = CHAMELEON_Desc2Lap( ChamUpperLower, descB, solCLA, nbPts); CHKERRQ(ierr);
         HCHAMELEON_lapmr( 'B', 'N', nbRHS, &hdescA, solCLA );
 
+        ASSERTQ(nbPts < INT_MAX/nbRHS);
         ierr = MPI_Bcast(solCLA, nbRHS*nbPts, Mpf_mpitype[stype], 0, MPI_COMM_WORLD ); CHKERRQ(ierr);
         if ( simplePrec ) {
           simpleToDouble( solCLA, vectorSize );
@@ -339,21 +332,20 @@ int testHCHAMELEON(void) {
 
         /* Compare the two vectors solCLA and rhs */
         Mpf_printf(MPI_COMM_WORLD, "\n**** Comparing results...\n") ;
-        ierr=computeRelativeError(solCLA, rhs, &eps) ; CHKERRQ(ierr) ;
-        Mpf_printf(MPI_COMM_WORLD, "<PERFTESTS> ForwardError = %.4e \n", eps);
-        Mpf_printf(MPI_COMM_WORLD, "<PERFTESTS> ScaledForwardError = %.4e \n", eps / mpf_hmat_settings.acaEpsilon );
+        ierr=computeRelativeError(solCLA, rhs, relative_error) ; CHKERRQ(ierr) ;
+        Mpf_printf(MPI_COMM_WORLD, "<PERFTESTS> ForwardError = %.4e \n", *relative_error);
+        Mpf_printf(MPI_COMM_WORLD, "<PERFTESTS> ScaledForwardError = %.4e \n", *relative_error / mpf_hmat_settings.acaEpsilon );
       }
       break ;
     default:
       break;
   }
 
-  /* ===================================================================================== */
   /* Finish execution */
-  /* ===================================================================================== */
 
   HCHAMELEON_destroy_matrix( &hdescA );
-  if (solCLA) MpfFree(solCLA) ; solCLA=NULL ;
+  if (solCLA) MpfFree(solCLA) ;
+  solCLA=NULL ;
   hmat_tracing_dump("testHCHAMELEON_trace.json");
 
 #else
@@ -367,4 +359,3 @@ int testHCHAMELEON(void) {
 
   return 0;
 }
-/* ============================================================================================================ */
diff --git a/src/testHLIBPRO.c b/src/testHLIBPRO.c
index 8889554fedc3f2ed93ad71af5287423631a5ae60..d33c6406ae5fcff2a29e446ed3157799e04d5cf2 100644
--- a/src/testHLIBPRO.c
+++ b/src/testHLIBPRO.c
@@ -7,14 +7,13 @@
 { char buf[1024]; hlib_error_desc( buf, 1024 ); \
   printf( "\n%s\n\n", buf ); exit(1); } }
 
-/* ============================================================================================================ */
 static double assemblyEpsilon=1e-4;
 static double recompressionEpsilon=1e-4;
 static hlib_lrapx_t compressionMethod=HLIB_LRAPX_ACAPLUS;
 static int maxLeafSize=100;
 static  hmat_factorization_t factorization_type=hmat_factorization_ldlt;
 static int nbCPUs=12;
-/* ============================================================================================================ */
+
 static void ckernelHLIBpro ( const size_t n, const int *rowidx, const size_t m, const int *colidx,
                              hlib_complex_t *matrix, void *user_context ) {
   size_t  rowi, colj;
@@ -26,7 +25,7 @@ static void ckernelHLIBpro ( const size_t n, const int *rowidx, const size_t m,
     }
   }
 }
-/* ============================================================================================================ */
+
 static void kernelHLIBpro ( const size_t n, const int *rowidx, const size_t m, const int *colidx,
                             hlib_real_t *matrix, void *user_context ) {
   size_t  rowi, colj;
@@ -38,7 +37,7 @@ static void kernelHLIBpro ( const size_t n, const int *rowidx, const size_t m, c
     }
   }
 }
-/* ============================================================================================================ */
+
 /*! \brief Init parameters for HLIBpro solver
    */
 int initHLIBPRO(int* argc, char*** argv) {
@@ -94,12 +93,12 @@ int initHLIBPRO(int* argc, char*** argv) {
 
   return 0;
 }
-/* ============================================================================================================ */
+
 /*! \brief Runs the test using HLIBPRO solver
    */
-int testHLIBPRO(void) {
+int testHLIBPRO(double * relative_error) {
   int ierr, rank, sSize ;
-  double temps_initial, temps_final, temps_cpu, eps ;
+  double temps_initial, temps_final, temps_cpu;
 
   Mpf_printf(MPI_COMM_WORLD,"<PERFTESTS> HAssemblyAccuracy = %.4e \n", assemblyEpsilon) ;
   Mpf_printf(MPI_COMM_WORLD,"<PERFTESTS> HAssemblyRecompression = %.4e \n", recompressionEpsilon) ;
@@ -116,16 +115,14 @@ int testHLIBPRO(void) {
 
   ierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank); CHKERRQ(ierr);
 
-  /* ===================================================================================== */
   /* Realise le produit matrice vecteur classique : A.rhs -> solCLA */
-  /* ===================================================================================== */
+
   Mpf_printf(MPI_COMM_WORLD, "\n**** Computing Classical product...\n") ;
   void *solCLA=NULL;
   ierr = produitClassique(&solCLA, MPI_COMM_WORLD) ; CHKERRQ(ierr) ;
 
-  /* ===================================================================================== */
   /* Cree la H-Matrice */
-  /* ===================================================================================== */
+
   temps_initial = getTime ();
   Mpf_printf(MPI_COMM_WORLD, "\n**** Creating HLIBpro Matrix...\n") ;
 
@@ -149,9 +146,9 @@ int testHLIBPRO(void) {
 
   double *points = testedModel==_bem ? createCylinder() : createPipe();
   // HLIBpro demands an array of pointers for the vertices
-  double**vertices = (double**) MpfCalloc( (size_t)nbPts, sizeof(double*) ); CHKPTRQ(vertices);
+  double**vertices = MpfCalloc( (size_t)nbPts, sizeof(double*) ); CHKPTRQ(vertices);
   for ( i = 0; i < nbPts; i++ )
-    vertices[i] = &points[3*i];
+    vertices[i] = &points[(size_t)3*i];
   hlib_coord_t coord = hlib_coord_import( (size_t)nbPts, 3, vertices, NULL, & info ); CHECK_INFO;
   hlib_clustertree_t ct = hlib_clt_build_bsp( coord, HLIB_BSP_CARD_MAX, maxLeafSize, & info ); CHECK_INFO;
   hlib_clt_print_ps( ct, "testHLIBpro_ct", & info ); CHECK_INFO;
@@ -163,7 +160,7 @@ int testHLIBPRO(void) {
 
   Mpf_printf(MPI_COMM_WORLD, "HLIBpro: building BEM matrix using ACA+\n" );
   hlib_acc_t acc = hlib_acc_fixed_eps( assemblyEpsilon );
-  contextTestFEMBEM *myCtx = (contextTestFEMBEM*)MpfCalloc(1, sizeof(contextTestFEMBEM)) ; CHKPTRQ(myCtx);
+  contextTestFEMBEM *myCtx = MpfCalloc(1, sizeof(contextTestFEMBEM)) ; CHKPTRQ(myCtx);
   hlib_matrix_t hmatrix = stype==DOUBLE_COMPLEX ?
         hlib_matrix_build_ccoeff( bct, ckernelHLIBpro, (void*)myCtx, compressionMethod, acc, symMatSolver, & info ) :
         hlib_matrix_build_coeff( bct, kernelHLIBpro, (void*)myCtx, compressionMethod, acc, symMatSolver, & info ); CHECK_INFO;
@@ -192,25 +189,22 @@ int testHLIBPRO(void) {
   hlib_vector_t vec_in = stype==DOUBLE_COMPLEX ? hlib_vector_cbuild( nbPts, & info ) : hlib_vector_build( nbPts, & info ); CHECK_INFO;
   hlib_vector_t vec_out= stype==DOUBLE_COMPLEX ? hlib_vector_cbuild( nbPts, & info ) : hlib_vector_build( nbPts, & info ); CHECK_INFO;
 
-  /* ===================================================================================== */
   /* Switch on the various HLIBpro tests */
-  /* ===================================================================================== */
+
   switch(testedAlgo) {
     case _gemvHLIBPRO:
 
-      /* ===================================================================================== */
       /* Appelle le produit mat.vec : A.rhs -> solHLIBpro */
-      /* ===================================================================================== */
 
       Mpf_printf(MPI_COMM_WORLD, "\n**** Computing HLIBpro product...\n") ;
-      void *solHLIBpro=(void*)MpfCalloc((size_t)nbPts*nbRHS, complexALGO ? sizeof(Z_type) : sizeof(D_type)) ; CHKPTRQ(solHLIBpro) ;
+      void *solHLIBpro = MpfCalloc((size_t)nbPts*nbRHS, complexALGO ? sizeof(Z_type) : sizeof(D_type)) ; CHKPTRQ(solHLIBpro) ;
       temps_initial = getTime ();
 
       for (i=0 ; i<nbRHS ; i++) {
         /* import the data into vec_in */
         stype==DOUBLE_COMPLEX ?
-              hlib_vector_cimport( vec_in, (void*)((char*)rhs+i*nbPts*sSize), & info ) :
-              hlib_vector_import( vec_in, (void*)((char*)rhs+i*nbPts*sSize), & info ); CHECK_INFO;
+              hlib_vector_cimport( vec_in, (void*)((char*)rhs+(size_t)i*nbPts*sSize), & info ) :
+              hlib_vector_import( vec_in, (void*)((char*)rhs+(size_t)i*nbPts*sSize), & info ); CHECK_INFO;
         /* bring into H-ordering */
         hlib_vector_permute( vec_in, hlib_clt_perm_e2i( ct, & info ), & info ); CHECK_INFO;
         /* Do the GEMV */
@@ -219,12 +213,13 @@ int testHLIBPRO(void) {
         hlib_vector_permute( vec_out, hlib_clt_perm_i2e( ct, & info ), & info ); CHECK_INFO;
         /* export the data out of vec_out */
         stype==DOUBLE_COMPLEX ?
-              hlib_vector_cexport( vec_out, (void*)((char*)solHLIBpro+i*nbPts*sSize), & info ):
-              hlib_vector_export( vec_out, (void*)((char*)solHLIBpro+i*nbPts*sSize), & info ); CHECK_INFO;
+              hlib_vector_cexport( vec_out, (void*)((char*)solHLIBpro+(size_t)i*nbPts*sSize), & info ):
+              hlib_vector_export( vec_out, (void*)((char*)solHLIBpro+(size_t)i*nbPts*sSize), & info ); CHECK_INFO;
       }
 
       if (rank) {
-        if (solHLIBpro) MpfFree(solHLIBpro) ; solHLIBpro=NULL ;
+        if (solHLIBpro) MpfFree(solHLIBpro) ;
+        solHLIBpro=NULL ;
       }
       temps_final = getTime ();
       temps_cpu = (temps_final - temps_initial) ;
@@ -233,19 +228,19 @@ int testHLIBPRO(void) {
         ierr = displayArray("solHLIBpro", solHLIBpro, nbPts, nbRHS) ; CHKERRQ(ierr) ;
       }
 
-      /* ===================================================================================== */
       /* Compare les 2 produits matrice-vecteur */
-      /* ===================================================================================== */
+
       Mpf_printf(MPI_COMM_WORLD, "\n**** Comparing results...\n") ;
-      ierr=computeRelativeError(solHLIBpro, solCLA, &eps) ; CHKERRQ(ierr) ;
+      ierr=computeRelativeError(solHLIBpro, solCLA, relative_error); CHKERRQ(ierr);
 
-      Mpf_printf(MPI_COMM_WORLD, "<PERFTESTS> Error = %.4e \n", eps);
-      if (solHLIBpro) MpfFree(solHLIBpro) ; solHLIBpro=NULL ;
+      Mpf_printf(MPI_COMM_WORLD, "<PERFTESTS> Error = %.4e \n", *relative_error);
+      if (solHLIBpro) MpfFree(solHLIBpro) ;
+      solHLIBpro=NULL ;
       break;
     case _solveHLIBPRO:
-      /* ===================================================================================== */
+
       /* Factorize the H-matrix */
-      /* ===================================================================================== */
+
       Mpf_printf(MPI_COMM_WORLD, "\n**** Factorizing HLIBpro...\n") ;
       temps_initial = getTime ();
 
@@ -264,7 +259,6 @@ int testHLIBPRO(void) {
         default:break;
       }
 
-
       temps_final = getTime ();
       temps_cpu = (temps_final - temps_initial) ;
       Mpf_printf(MPI_COMM_WORLD,"<PERFTESTS> TpsCpuFacto = %f \n", temps_cpu) ;
@@ -284,17 +278,16 @@ int testHLIBPRO(void) {
       Mpf_printf(MPI_COMM_WORLD, "Uncompressed size: %ld\n", fullsize/sSize);
       Mpf_printf(MPI_COMM_WORLD, "Ratio: %g\n", (double)bytesize/fullsize);
 
-      /* ===================================================================================== */
       /* Solve the system : A-1.solCLA -> solCLA */
-      /* ===================================================================================== */
+
       Mpf_printf(MPI_COMM_WORLD, "\n**** Solve HLIBpro...\n") ;
       temps_initial = getTime ();
 
       for (i=0 ; i<nbRHS ; i++) {
         /* import the data into vec_in */
         stype==DOUBLE_COMPLEX ?
-              hlib_vector_cimport( vec_in, (void*)((char*)solCLA+i*nbPts*sSize), & info ) :
-              hlib_vector_import( vec_in, (void*)((char*)solCLA+i*nbPts*sSize), & info ); CHECK_INFO;
+              hlib_vector_cimport( vec_in, (void*)((char*)solCLA+(size_t)i*nbPts*sSize), & info ) :
+              hlib_vector_import( vec_in, (void*)((char*)solCLA+(size_t)i*nbPts*sSize), & info ); CHECK_INFO;
         /* bring into H-ordering */
         hlib_vector_permute( vec_in, hlib_clt_perm_e2i( ct, & info ), & info ); CHECK_INFO;
         /* Do the Solve */
@@ -304,21 +297,20 @@ int testHLIBPRO(void) {
         hlib_vector_permute( vec_out, hlib_clt_perm_i2e( ct, & info ), & info ); CHECK_INFO;
         /* export the data out of vec_out */
         stype==DOUBLE_COMPLEX ?
-              hlib_vector_cexport( vec_out, (void*)((char*)solCLA+i*nbPts*sSize), & info ) :
-              hlib_vector_export( vec_out, (void*)((char*)solCLA+i*nbPts*sSize), & info ); CHECK_INFO;
+              hlib_vector_cexport( vec_out, (void*)((char*)solCLA+(size_t)i*nbPts*sSize), & info ) :
+              hlib_vector_export( vec_out, (void*)((char*)solCLA+(size_t)i*nbPts*sSize), & info ); CHECK_INFO;
       }
 
       temps_final = getTime ();
       temps_cpu = (temps_final - temps_initial) ;
       Mpf_printf(MPI_COMM_WORLD,"<PERFTESTS> TpsCpuSolve = %f \n", temps_cpu) ;
 
-      /* ===================================================================================== */
       /* Compare the two vectors solCLA and rhs */
-      /* ===================================================================================== */
+
       Mpf_printf(MPI_COMM_WORLD, "\n**** Comparing results...\n") ;
-      ierr=computeRelativeError(solCLA, rhs, &eps) ; CHKERRQ(ierr) ;
+      ierr=computeRelativeError(solCLA, rhs, relative_error) ; CHKERRQ(ierr) ;
 
-      Mpf_printf(MPI_COMM_WORLD, "<PERFTESTS> Error = %.4e \n", eps);
+      Mpf_printf(MPI_COMM_WORLD, "<PERFTESTS> Error = %.4e \n", *relative_error);
       break;
     default:
       break;
@@ -334,19 +326,21 @@ int testHLIBPRO(void) {
 
   MpfFree(points); points=NULL;
   MpfFree(vertices); vertices=NULL;
-  if (myCtx) MpfFree(myCtx) ; myCtx=NULL ;
+  if (myCtx) MpfFree(myCtx) ;
+  myCtx=NULL ;
 
   hlib_done( & info ); CHECK_INFO;
 
   return 0;
 }
-/* ============================================================================================================ */
+
 #else
 int initHLIBPRO(int* argc, char*** argv) {
   return 0;
 }
-int testHLIBPRO(void) {
+int testHLIBPRO(double* relative_error) {
   Mpf_printf(MPI_COMM_WORLD, "No HLIBpro support.\n") ;
+  *relative_error = 0;
   return 0;
 }
 #endif
diff --git a/src/testHMAT.c b/src/testHMAT.c
index 39792b5cbd92d7ffcfd887a7100a61de303e05b8..b3ff1d9b0f4a6c1cc0fdb22ad3f9fe938e286960 100644
--- a/src/testHMAT.c
+++ b/src/testHMAT.c
@@ -2,11 +2,11 @@
 
 /*! \brief Runs the test of H-matrices : gemv, solve
  */
-int testHMAT(void) {
+int testHMAT(double * relative_error) {
 #ifdef HAVE_HMAT
   enter_context("testHMAT()");
   int ierr, rank ;
-  double temps_initial, temps_final, temps_cpu, eps ;
+  double temps_initial, temps_final, temps_cpu;
   size_t vectorSize = (size_t)nbPts*nbRHS*(complexALGO?2:1);
 
   Mpf_printf(MPI_COMM_WORLD,"<PERFTESTS> HAcaAccuracy = %.4e \n", mpf_hmat_settings.acaEpsilon);
@@ -17,16 +17,14 @@ int testHMAT(void) {
 
   ierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank); CHKERRQ(ierr);
 
-  /* ===================================================================================== */
   /* Realise le produit matrice vecteur classique : A.rhs -> solCLA */
-  /* ===================================================================================== */
+
   Mpf_printf(MPI_COMM_WORLD, "\n**** Computing Classical product...\n") ;
   void *solCLA=NULL;
   ierr = produitClassique(&solCLA, MPI_COMM_WORLD) ; CHKERRQ(ierr) ;
 
-  /* ===================================================================================== */
   /* Cree la H-Matrice */
-  /* ===================================================================================== */
+
   temps_initial = getTime ();
   Mpf_printf(MPI_COMM_WORLD, "\n**** Creating HMAT...\n") ;
 
@@ -36,19 +34,28 @@ int testHMAT(void) {
   // hmat_factorization_none mean the user did not chose a factorization so we
   // must choose one for him
   if(mpf_hmat_settings.factorization_type == hmat_factorization_none) {
-    if(symMatSolver)
-      // use LDLT for real matrices
-      mpf_hmat_settings.factorization_type = complexALGO ? hmat_factorization_llt : hmat_factorization_ldlt;
-    else
-      mpf_hmat_settings.factorization_type = hmat_factorization_lu;
+    if (use_hodlr) {
+      if(symMatSolver)
+        mpf_hmat_settings.factorization_type = hmat_factorization_hodlrsym;
+      else
+        mpf_hmat_settings.factorization_type = hmat_factorization_hodlr;
+    } else {
+      if(symMatSolver)
+        // use LDLT for real matrices
+        mpf_hmat_settings.factorization_type = complexALGO ? hmat_factorization_llt : hmat_factorization_ldlt;
+      else
+        mpf_hmat_settings.factorization_type = hmat_factorization_lu;
+    }
   }
 
-  /* Check the coherency between the factorization chosen in MPF (with --hmat-lu/llt/ldlt)
+  /* Check the coherency between the factorization chosen in MPF (with --hmat-lu/llt/ldlt/hodlr/hodlr-sym)
      and the symmetry of the matrix (chosen with --sym/--nosym/--symcont) */
   if (testedAlgo==_solveHMAT)
     switch(mpf_hmat_settings.factorization_type){
+      case hmat_factorization_hodlrsym:
       case hmat_factorization_ldlt:
       case hmat_factorization_llt:ASSERTQ(symMatSolver); break;
+      // No 'hmat_factorization_hodlr' here, because it works on sym matrices
       case hmat_factorization_lu:ASSERTQ(!symMatSolver); break;
       default:break;
     }
@@ -109,22 +116,17 @@ int testHMAT(void) {
   }
 
   /* Release the FEM matrix (can be quite large) */
-  if (z_matComp_FEM) MpfFree(z_matComp_FEM); z_matComp_FEM=NULL;
-  if (ptr_ordered_FEM) MpfFree(ptr_ordered_FEM); ptr_ordered_FEM=NULL;
-  nnz_FEM=0;
-  nbEl_FEM=0;
+  ierr=freeFemMatrix();CHKERRQ(ierr);
 
-  /* ===================================================================================== */
   /* Switch on the various HMAT tests */
-  /* ===================================================================================== */
+
   switch(testedAlgo) {
     case _gemvHMAT:
 
-      /* ===================================================================================== */
       /* Appelle le produit mat.vec : A.rhs -> solHMAT */
-      /* ===================================================================================== */
+
       Mpf_printf(MPI_COMM_WORLD, "\n**** Computing HMAT product...\n") ;
-      void *solHMAT=(void*)MpfCalloc(vectorSize, sizeof(D_type)) ; CHKPTRQ(solHMAT) ;
+      void *solHMAT = MpfCalloc(vectorSize, sizeof(D_type)) ; CHKPTRQ(solHMAT) ;
       temps_initial = getTime ();
 
       if (simplePrec)
@@ -139,7 +141,8 @@ int testHMAT(void) {
       }
 
       if (rank) {
-        if (solHMAT) MpfFree(solHMAT) ; solHMAT=NULL ;
+        if (solHMAT) MpfFree(solHMAT);
+        solHMAT=NULL;
       }
       temps_final = getTime ();
       temps_cpu = (temps_final - temps_initial) ;
@@ -148,14 +151,14 @@ int testHMAT(void) {
         ierr = displayArray("solHMAT", solHMAT, nbPts, nbRHS) ; CHKERRQ(ierr) ;
       }
 
-      /* ===================================================================================== */
       /* Compare les 2 produits matrice-vecteur */
-      /* ===================================================================================== */
+
       Mpf_printf(MPI_COMM_WORLD, "\n**** Comparing results...\n") ;
-      ierr=computeRelativeError(solHMAT, solCLA, &eps) ; CHKERRQ(ierr) ;
+      ierr=computeRelativeError(solHMAT, solCLA, relative_error); CHKERRQ(ierr);
 
-      Mpf_printf(MPI_COMM_WORLD, "<PERFTESTS> Error = %.4e \n", eps);
-      if (solHMAT) MpfFree(solHMAT) ; solHMAT=NULL ;
+      Mpf_printf(MPI_COMM_WORLD, "<PERFTESTS> Error = %.4e \n", *relative_error);
+      if (solHMAT) MpfFree(solHMAT);
+      solHMAT=NULL ;
       break;
 
     case _solveHMAT:
@@ -169,18 +172,17 @@ int testHMAT(void) {
       if (hmat_residual) {
         hmatrix_orig = hi->copy(hmatrix); // ideally, we do a copy with lower accuracy for factorisation
         /* Vector to duplicate solCLA */
-        solCLA_orig=(void*)MpfCalloc(vectorSize, sizeof(D_type)) ; CHKPTRQ(solCLA_orig) ;
+        solCLA_orig = MpfCalloc(vectorSize, sizeof(D_type)) ; CHKPTRQ(solCLA_orig) ;
         /* Norm of the originaal RHS */
         ierr = computeVecNorm(solCLA, &normOrig); CHKERRQ(ierr);
       }
       if (hmat_refine) {
         /* Vector to accumulate solCLA */
-        solCLA_sum=(void*)MpfCalloc(vectorSize, sizeof(D_type)) ; CHKPTRQ(solCLA_sum) ;
+        solCLA_sum = MpfCalloc(vectorSize, sizeof(D_type)) ; CHKPTRQ(solCLA_sum) ;
       }
 
-      /* ===================================================================================== */
       /* Factorize the H-matrix */
-      /* ===================================================================================== */
+
       Mpf_printf(MPI_COMM_WORLD, "\n**** Factorizing HMAT...\n") ;
       temps_initial = getTime ();
 
@@ -189,7 +191,7 @@ int testHMAT(void) {
       ctx.progress = &mpf_hmat_settings.progress;
       ctx.progress->user_data = (void*)MPI_COMM_WORLD;
       ctx.factorization = mpf_hmat_settings.factorization_type;
-      hi->factorize_generic(hmatrix, &ctx);
+      ierr = hi->factorize_generic(hmatrix, &ctx); CHKERRQ(ierr);
 
       temps_final = getTime ();
       temps_cpu = (temps_final - temps_initial) ;
@@ -246,9 +248,8 @@ int testHMAT(void) {
         Mpf_printf(MPI_COMM_WORLD, "Ratio: %g\n", (double)info.compressed_size/info.uncompressed_size);
       }
 
-      /* ===================================================================================== */
       /* Solve the system : A-1.solCLA -> solCLA */
-      /* ===================================================================================== */
+
       Mpf_printf(MPI_COMM_WORLD, "\n**** Solve HMAT%s...\n", hmat_refine ? " with Iterative Refinement" : "") ;
 
       if ( verboseTest ) {
@@ -267,7 +268,7 @@ _refineLoop: // Iterative Refinement loop
       if (simplePrec)
         doubleToSimple(solCLA, vectorSize);
 
-      ierr = hi->solve_systems(hmatrix, solCLA, nbRHS);
+      ierr = hi->solve_systems(hmatrix, solCLA, nbRHS); CHKERRQ(ierr);
 
       if (simplePrec)
         simpleToDouble(solCLA, vectorSize);
@@ -276,9 +277,8 @@ _refineLoop: // Iterative Refinement loop
       temps_cpu = (temps_final - temps_initial) ;
       Mpf_printf(MPI_COMM_WORLD,"<PERFTESTS> TpsCpuSolve%s = %f \n", postfix_async, temps_cpu) ;
 
-      /* ===================================================================================== */
       /* Compute the residual solCLA_orig-A_orig.solCLA */
-      /* ===================================================================================== */
+
       if (hmat_residual) {
         temps_initial = getTime ();
         if (simplePrec) {
@@ -310,8 +310,8 @@ _refineLoop: // Iterative Refinement loop
           for (k=0 ; k<vectorSize ; k++)
             ((double*)solCLA_sum)[k] += ((double*)solCLA)[k];
         }
-        ierr=computeRelativeError(solCLA_sum, rhs, &eps) ; CHKERRQ(ierr) ;
-        Mpf_printf(MPI_COMM_WORLD, "<PERFTESTS> Error_%d = %.4e \n", nbIter, eps);
+        ierr=computeRelativeError(solCLA_sum, rhs, relative_error) ; CHKERRQ(ierr) ;
+        Mpf_printf(MPI_COMM_WORLD, "<PERFTESTS> Error_%d = %.4e \n", nbIter, *relative_error);
       }
       if (hmat_refine) {
         /* Copy the updated RHS of the system solCLA_orig into solCLA */
@@ -321,7 +321,8 @@ _refineLoop: // Iterative Refinement loop
 
       if (hmatrix_orig) hi->destroy(hmatrix_orig);
       hmatrix_orig=NULL;
-      if (solCLA_orig) MpfFree(solCLA_orig) ; solCLA_orig=NULL ;
+      if (solCLA_orig) MpfFree(solCLA_orig);
+      solCLA_orig=NULL ;
       if (solCLA_sum) { // If solCLA_sum has been allocated, I use it as a replacement for solCLA
         MpfFree(solCLA) ;
         solCLA=solCLA_sum ;
@@ -334,18 +335,17 @@ _refineLoop: // Iterative Refinement loop
         ierr = displayArray("x", solCLA, nbPts, nbRHS) ; CHKERRQ(ierr) ;
       }
 
-      /* ===================================================================================== */
       /* Compare the two vectors solCLA and rhs */
-      /* ===================================================================================== */
+
       Mpf_printf(MPI_COMM_WORLD, "\n**** Comparing results...\n") ;
-      ierr=computeRelativeError(solCLA, rhs, &eps) ; CHKERRQ(ierr) ;
+      ierr=computeRelativeError(solCLA, rhs, relative_error); CHKERRQ(ierr);
 
-      Mpf_printf(MPI_COMM_WORLD, "<PERFTESTS> Error = %.4e \n", eps);
+      Mpf_printf(MPI_COMM_WORLD, "<PERFTESTS> Error = %.4e \n", *relative_error);
       break;
     case _inverseHMAT:
-      /* ===================================================================================== */
+
       /* Inverse the H-matrix */
-      /* ===================================================================================== */
+
       Mpf_printf(MPI_COMM_WORLD, "\n**** Inversing HMAT...\n") ;
       temps_initial = getTime ();
 
@@ -355,19 +355,18 @@ _refineLoop: // Iterative Refinement loop
       temps_cpu = (temps_final - temps_initial) ;
       Mpf_printf(MPI_COMM_WORLD,"<PERFTESTS> TpsCpuInversion%s = %f \n", postfix_async, temps_cpu) ;
 
-      /* ===================================================================================== */
       /* Compute the gemv : A-1.solCLA -> solCLA */
-      /* ===================================================================================== */
+
       Mpf_printf(MPI_COMM_WORLD, "\n**** GEMV HMAT...\n") ;
       temps_initial = getTime ();
 
       if (simplePrec)
         doubleToSimple(solCLA, vectorSize);
 
-      void *solCLA2=(void*)MpfCalloc(vectorSize, sizeof(D_type)) ; CHKPTRQ(solCLA) ;
+      void *solCLA2 = MpfCalloc(vectorSize, sizeof(D_type)) ; CHKPTRQ(solCLA) ;
       memcpy(solCLA2, solCLA, vectorSize*sizeof(D_type));
       ierr = hi->gemv('N', Mpf_pone[stype], hmatrix, solCLA2, Mpf_zero[stype], solCLA, nbRHS);
-      if (solCLA2) MpfFree(solCLA2) ; solCLA2=NULL ;
+      MpfFree(solCLA2);
 
       if (simplePrec)
         simpleToDouble(solCLA, vectorSize);
@@ -376,22 +375,23 @@ _refineLoop: // Iterative Refinement loop
       temps_cpu = (temps_final - temps_initial) ;
       Mpf_printf(MPI_COMM_WORLD,"<PERFTESTS> TpsCpuGEMV%s = %f \n", postfix_async, temps_cpu) ;
 
-      /* ===================================================================================== */
       /* Compare the two vectors solCLA and rhs */
-      /* ===================================================================================== */
+
       Mpf_printf(MPI_COMM_WORLD, "\n**** Comparing results...\n") ;
-      ierr=computeRelativeError(solCLA, rhs, &eps) ; CHKERRQ(ierr) ;
+      ierr=computeRelativeError(solCLA, rhs, relative_error); CHKERRQ(ierr);
 
-      Mpf_printf(MPI_COMM_WORLD, "<PERFTESTS> Error = %.4e \n", eps);
+      Mpf_printf(MPI_COMM_WORLD, "<PERFTESTS> Error = %.4e \n", *relative_error);
       break;
     default:
       break;
   } /* switch(testedAlgo) */
 
   /* Destroys the HMAT structure */
+#ifdef TESTFEMBEM_DUMP_HMAT
   hi->dump_info(hmatrix, "testHMAT_matrix");
+#endif
   HMAT_destroy_matrix( hi, hdesc );
-  if (solCLA) MpfFree(solCLA) ; solCLA=NULL ;
+  MpfFree(solCLA);
   hmat_tracing_dump("testHMAT_trace.json");
 
   leave_context();
diff --git a/src/util.c b/src/util.c
index ea8873279279ecda21cfa10a37b4177adfac26ad..ac7de08ab9a9cbcbb7e25438ffe62913158dbb4f 100644
--- a/src/util.c
+++ b/src/util.c
@@ -424,6 +424,32 @@ int MpfArgGetString( int *Argc, char **argv, int rflag, const char *name, char *
   return 1;
 }
 /* ================================================================================== */
+/*! \brief Equivalent of MPI_Allreduce() with a 64 bit \a count argument
+   */
+int MPI_AllreduceL(const void *sendbuf, void *recvbuf, ssize_t count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) {
+  int ierr, scalsize;
+  size_t pos=0, pak=0;
+  /* MPI maximum message size (in bytes) */
+  static size_t PACKET_SIZE=1e8;
+
+  enter_context("MPI_AllreduceL");
+
+  ierr = MPI_Type_size(datatype, &scalsize); CHKERRQ(ierr);
+  pak = PACKET_SIZE/scalsize; // number of 'datatype' to send in each message
+  while (count>0) {
+    // We communicate by packet of 100 Mbytes
+    void *src = sendbuf==MPI_IN_PLACE ? MPI_IN_PLACE : (void*)((char*)sendbuf+pos) ;
+    void *dest = (void*)((char*)recvbuf+pos) ;
+    ierr = MPI_Allreduce(src, dest, (int)MIN(pak,count), datatype, op, comm); CHKERRQ(ierr);
+    pos += pak*scalsize;
+    count -= pak;
+  }
+
+  leave_context();
+
+  return 0;
+}
+/* ================================================================================== */
 /*! \brief Sort the array mat, remove the zeros and sum the elements placed at the same position
 
   For simple precision only (S)
@@ -842,6 +868,7 @@ int SCAB_Init(int* argc, char*** argv) {
   mpf_hmat_settings.acaEpsilon = 1e-3;
   mpf_hmat_settings.compressionMethod = hmat_compress_aca_plus;
   mpf_hmat_settings.epsilon = 1e-3;
+  mpf_hmat_settings.max_leaf_size = -1;
   if(strcmp(hmat_get_version(), HMAT_VERSION))
     Mpf_printf(MPI_COMM_WORLD, "***\n*** hmat version %s (compiled with version %s)\n", hmat_get_version(), HMAT_VERSION);
   else
@@ -917,6 +944,9 @@ int SCAB_Init(int* argc, char*** argv) {
                mpf_hmat_settings.l0size);
 #endif
 
+  if (MpfArgGetInt(argc, *argv, 1, "--hmat-leaf-size", &mpf_hmat_settings.max_leaf_size))
+    Mpf_printf(MPI_COMM_WORLD,"[HMat] Maximum leaf size: %d\n", mpf_hmat_settings.max_leaf_size);
+
   if (MpfArgHasName( argc, *argv, 1, "--hmat-coarsening" ) > 0) {
     Mpf_printf(MPI_COMM_WORLD,"[HMat] Coarsening of the HMatrix\n");
     settings.coarsening = 1;