diff --git a/Benchmark/axpy/axpy.cpp b/Benchmark/axpy/axpy.cpp
index 448bd33c1cc5116a96f86f6097efe713bc08ee2e..58f1db0e2f2258d1a9adf09d51d12d847a24de43 100644
--- a/Benchmark/axpy/axpy.cpp
+++ b/Benchmark/axpy/axpy.cpp
@@ -253,8 +253,8 @@ int main(int argc, char** argv){
 
     for(auto useMultiprioAndPairs: schedPairConf){
         for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
-                const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
             const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+            const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus);
             const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
             const bool usePrioPairs = std::get<1>(useMultiprioAndPairs);
             const bool useLocality = std::get<2>(useMultiprioAndPairs);
@@ -291,8 +291,8 @@ int main(int argc, char** argv){
         for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
             for(int idxNbBlocks = minnbblocks ; idxNbBlocks <= maxnbblocks ; idxNbBlocks *= 2){
                 for(int idxSize = minblocksize ; idxSize <= maxblocksize ; idxSize *= 2){
-                const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
                     const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+                    const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus);
 
                     file << nbCpus << "," << nbGpus << "," << idxNbBlocks << "," << idxSize << "," 
                     << (useMultiprio?"TRUE":"FALSE") << ","
diff --git a/Benchmark/cholesky_gemm/CMakeLists.txt b/Benchmark/cholesky_gemm/CMakeLists.txt
index 55f4cc58a498d1dc48b602591d176a5a2150bb6c..2db323656e303f80435578c793a4ca84069365ff 100644
--- a/Benchmark/cholesky_gemm/CMakeLists.txt
+++ b/Benchmark/cholesky_gemm/CMakeLists.txt
@@ -14,12 +14,28 @@ find_package(BLAS)
 find_package(LAPACK)
 
 if(BLAS_FOUND AND LAPACK_FOUND)
-	# If libs is related to MKL
-	if(${BLAS_LIBRARIES} MATCHES "mkl")
-	    # We replace "intel_thread" by "sequential"
-		string(REPLACE "intel_thread" "sequential" BLAS_LIBRARIES ${BLAS_LIBRARIES})
-		string(REPLACE "intel_thread" "sequential" LAPACK_LIBRARIES ${LAPACK_LIBRARIES})
-	endif()
+        # If libs are related to MKL
+    if("${BLAS_LIBRARIES}" MATCHES "mkl")
+        # Create new variables to store the modified lists
+        set(MODIFIED_BLAS_LIBRARIES "")
+        set(MODIFIED_LAPACK_LIBRARIES "")
+
+        # Loop over each BLAS library and perform the replacement
+        foreach(lib ${BLAS_LIBRARIES})
+            string(REPLACE "intel_thread" "sequential" modified_lib ${lib})
+            list(APPEND MODIFIED_BLAS_LIBRARIES ${modified_lib})
+        endforeach()
+
+        # Loop over each LAPACK library and perform the replacement
+        foreach(lib ${LAPACK_LIBRARIES})
+            string(REPLACE "intel_thread" "sequential" modified_lib ${lib})
+            list(APPEND MODIFIED_LAPACK_LIBRARIES ${modified_lib})
+        endforeach()
+
+        # Replace original variables with the modified lists
+        set(BLAS_LIBRARIES ${MODIFIED_BLAS_LIBRARIES})
+        set(LAPACK_LIBRARIES ${MODIFIED_LAPACK_LIBRARIES})
+    endif()
 
 	if($ENV{VERBOSE})
 	    MESSAGE(STATUS "Benchmark CHOLESKY_GEMM -- BLAS_LIBRARIES   : ${BLAS_LIBRARIES}")
diff --git a/Benchmark/cholesky_gemm/cholesky-mpi.cpp b/Benchmark/cholesky_gemm/cholesky-mpi.cpp
index 203986780f29affb472033ea5017d621dd41b77c..2236bfa43ab3cd11b18ec70cfb2bace903af3603 100644
--- a/Benchmark/cholesky_gemm/cholesky-mpi.cpp
+++ b/Benchmark/cholesky_gemm/cholesky-mpi.cpp
@@ -71,7 +71,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
     else{
         scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>(nbGpu*SpCudaUtils::GetDefaultNbStreams()));
     }
-    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(SpUtils::DefaultNumThreads(), nbGpu), std::move(scheduler));
+    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(std::max(1 , SpUtils::DefaultNumThreads()-nbGpu), nbGpu), std::move(scheduler));
 #else
     SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers());
 #endif
diff --git a/Benchmark/cholesky_gemm/cholesky.cpp b/Benchmark/cholesky_gemm/cholesky.cpp
index 6e85cb174a53327b43766a09401c95f479b6ca43..e514b44ae641cfe38cb81ef9c4505a4acbefe306 100644
--- a/Benchmark/cholesky_gemm/cholesky.cpp
+++ b/Benchmark/cholesky_gemm/cholesky.cpp
@@ -437,8 +437,8 @@ int main(int argc, char** argv){
         for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
             for(int BlockSize = MinBlockSize ; BlockSize <= MaxBlockSize ; BlockSize *= 2){
                 for(int MatrixSize = MinMatrixSize ; MatrixSize <= MaxMatrixSize ; MatrixSize *= 2){
-                const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
                     const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+                    const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus);
 
                     const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                     const bool usePrioPairs = std::get<1>(useMultiprioAndPairs);
@@ -509,8 +509,8 @@ int main(int argc, char** argv){
         for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
             for(int BlockSize = MinBlockSize ; BlockSize <= MaxBlockSize ; BlockSize *= 2){
                 for(int MatrixSize = MinMatrixSize ; MatrixSize <= MaxMatrixSize ; MatrixSize *= 2){
-                const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
                     const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+                    const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus);
 
                     const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                     const bool usePrioPairs = std::get<1>(useMultiprioAndPairs);
diff --git a/Benchmark/cholesky_gemm/gemm-mpi.cpp b/Benchmark/cholesky_gemm/gemm-mpi.cpp
index d2e01e82b04b012688257ed6c6da964095e9dd3e..8632cc3df2c1b9cf2c379d0a3e4e29f7145572b7 100644
--- a/Benchmark/cholesky_gemm/gemm-mpi.cpp
+++ b/Benchmark/cholesky_gemm/gemm-mpi.cpp
@@ -67,7 +67,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks
     else{
         scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>(nbGpu*SpCudaUtils::GetDefaultNbStreams()));
     }
-    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(SpUtils::DefaultNumThreads(), nbGpu), std::move(scheduler));
+    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(std::max(1, SpUtils::DefaultNumThreads()-nbGpu), nbGpu), std::move(scheduler));
 #else
     SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers());
 #endif
diff --git a/Benchmark/cholesky_gemm/gemm.cpp b/Benchmark/cholesky_gemm/gemm.cpp
index 49f7b34db0b54797240c18637808922e690385ab..27e9c07b2a559816f7f3406060b56cefbc213202 100644
--- a/Benchmark/cholesky_gemm/gemm.cpp
+++ b/Benchmark/cholesky_gemm/gemm.cpp
@@ -269,8 +269,8 @@ int main(int argc, char** argv){
         for(int BlockSize = MinBlockSize ; BlockSize <= MaxBlockSize ; BlockSize *= 2){
             for(int MatrixSize = MinMatrixSize ; MatrixSize <= MaxMatrixSize ; MatrixSize *= 2){
                 for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
-                const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
                     const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+                    const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus);
 
                     const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                     const bool useLocality = std::get<1>(useMultiprioAndPairs);
@@ -342,8 +342,8 @@ int main(int argc, char** argv){
         for(int BlockSize = MinBlockSize ; BlockSize <= MaxBlockSize ; BlockSize *= 2){
             for(int MatrixSize = MinMatrixSize ; MatrixSize <= MaxMatrixSize ; MatrixSize *= 2){
                 for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
-                const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
                     const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+                    const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus);
 
                     const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                     const bool useLocality = std::get<1>(useMultiprioAndPairs);
diff --git a/Benchmark/particles/particles-simu.cpp b/Benchmark/particles/particles-simu.cpp
index 94adf1d79692458b03f25d82b917d5422da23b6d..dafb4ecaa6ca4280b10219904bcf5415d3acf46a 100644
--- a/Benchmark/particles/particles-simu.cpp
+++ b/Benchmark/particles/particles-simu.cpp
@@ -919,8 +919,8 @@ void BenchmarkTest(int argc, char** argv, const TuneResult& inKernelConfig){
     for(auto useMultiprioAndPairs: schedPairConf){
         for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
             for(int idxBlock = MinNbGroups ; idxBlock <= MaxNbGroups ; idxBlock *= 2){
-                const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
                 const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+                const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus);
                 const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                 const bool usePrioPairs = std::get<1>(useMultiprioAndPairs);
                 const bool useLocality = std::get<2>(useMultiprioAndPairs);
@@ -954,8 +954,8 @@ void BenchmarkTest(int argc, char** argv, const TuneResult& inKernelConfig){
     for(auto useMultiprioAndPairs: schedPairConf){
         for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
             for(int idxBlock = MinNbGroups ; idxBlock <= MaxNbGroups ; idxBlock *= 2){
-                const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
                 const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+                const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus);
                 const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                 const bool usePrioPairs = std::get<1>(useMultiprioAndPairs);
                 const bool useLocality = std::get<2>(useMultiprioAndPairs);
diff --git a/Benchmark/particles_openmp/CMakeLists.txt b/Benchmark/particles_openmp/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c6373f41f304cfbfb36d782b93a7840e73f1987d
--- /dev/null
+++ b/Benchmark/particles_openmp/CMakeLists.txt
@@ -0,0 +1,65 @@
+###########################################################################
+# SPECX - Berenger Bramas MPCDF - 2016
+# Under LGPL Licence, please you must read the LICENCE file.
+###########################################################################
+project(BENCHMARK_CP_SPECX_OMP CXX)
+
+ADD_DEFINITIONS(${SPECX_CXX_FLAGS})
+
+if($ENV{VERBOSE})
+    MESSAGE(STATUS "Benchmark -- SPECX_CXX_FLAGS   : ${BENCHMARK_CP_SPECX_CXX_FLAGS}")
+endif()
+
+
+find_package(OpenMP)
+
+if(OpenMP_FOUND)
+    list(APPEND BENCHMARK_CP_SPECX_OMP_LIBRARIES OpenMP::OpenMP_CXX)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+
+    # Find all code files
+    file(	
+        GLOB_RECURSE
+        source_tests_files
+        ./*.cpp
+        )
+
+
+    # Adding the project sources dir as an include dir
+    INCLUDE_DIRECTORIES(
+         ${SPECX_BINARY_DIR}/Src    
+         ${SPECX_SOURCE_DIR}/Src    
+    )
+
+
+    # Add execs - 1 cpp = 1 exec
+    foreach(exec ${source_tests_files})
+        if(${SPECX_COMPILE_WITH_MPI} OR NOT ${exec} MATCHES "-mpi\.")
+            get_filename_component(
+	            execname ${exec}
+	            NAME_WE
+            )
+
+            if($ENV{VERBOSE})
+                MESSAGE(STATUS "Benchmark -- Add ${execname}")
+            endif()
+
+            if(SPECX_COMPILE_WITH_CUDA)
+                set_source_files_properties(${exec} PROPERTIES LANGUAGE CUDA)
+            endif()
+            add_executable(
+		            ${execname}
+		            ${exec}
+	            )
+
+            target_link_libraries(
+                ${execname}
+                specx
+                ${SPECX_LIBRARIES}
+                ${BENCHMARK_CP_SPECX_OMP_LIBRARIES}
+                )
+        endif()
+    endforeach(exec)
+else(OpenMP_FOUND)
+    MESSAGE_STATUS(WARNING "Openmp not found, disable particle simulation")
+endif(OpenMP_FOUND)
diff --git a/Benchmark/particles_openmp/particles-simu-omp.cpp b/Benchmark/particles_openmp/particles-simu-omp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..293f4b457da8a7d02f26f5ab5f84a4c2e09753cd
--- /dev/null
+++ b/Benchmark/particles_openmp/particles-simu-omp.cpp
@@ -0,0 +1,662 @@
+///////////////////////////////////////////////////////////////////////////
+// Specx - Berenger Bramas MPCDF - 2017
+// Under LGPL Licence, please you must read the LICENCE file.
+///////////////////////////////////////////////////////////////////////////
+#include <utility>
+#include <thread>
+#include <chrono>
+#include <iostream>
+
+#include <clsimple.hpp>
+
+#include "Data/SpDataAccessMode.hpp"
+#include "Utils/SpUtils.hpp"
+#include "Task/SpTask.hpp"
+
+#include "Compute/SpComputeEngine.hpp"
+#include "Compute/SpWorkerTeamBuilder.hpp"
+#include "TaskGraph/SpTaskGraph.hpp"
+#include "Config/SpConfig.hpp"
+#include "Utils/SpTimer.hpp"
+
+#include "Scheduler/SpMultiPrioScheduler.hpp"
+
+#include <omp.h>
+
+
+class ParticlesGroup{
+public:
+    enum ValueTypes{
+        PHYSICAL,
+        X,
+        Y,
+        Z,
+        FX,
+        FY,
+        FZ,
+        POTENTIAL,
+        NB_VALUE_TYPES
+    };
+
+private:
+    std::size_t nbParticles;
+    std::vector<double> values[NB_VALUE_TYPES];
+
+public:
+    explicit ParticlesGroup(const std::size_t inNbParticles = 0)
+        : nbParticles(inNbParticles){
+        for(std::size_t idxValueType = 0 ; idxValueType < NB_VALUE_TYPES ; ++idxValueType){
+            values[idxValueType].resize(nbParticles, 0);
+        }
+    }
+
+    ParticlesGroup(const ParticlesGroup&) = default;
+    ParticlesGroup(ParticlesGroup&&) = default;
+    ParticlesGroup& operator=(const ParticlesGroup&) = default;
+    ParticlesGroup& operator=(ParticlesGroup&&) = default;
+
+    void setParticleValues(const std::size_t inIdxParticle,
+                           const std::array<double, NB_VALUE_TYPES>& inValues){
+        assert(inIdxParticle < nbParticles);
+        for(std::size_t idxValueType = 0 ; idxValueType < NB_VALUE_TYPES ; ++idxValueType){
+            values[idxValueType][inIdxParticle] = inValues[idxValueType];
+        }
+    }
+
+    auto getParticle(const std::size_t inIdxParticle) const {
+        std::array<double, NB_VALUE_TYPES> valuesPart;
+        for(std::size_t idxValueType = 0 ; idxValueType < NB_VALUE_TYPES ; ++idxValueType){
+            valuesPart[idxValueType] = values[idxValueType][inIdxParticle];
+        }
+        return valuesPart;
+    }
+
+    auto getNbParticles() const{
+        return nbParticles;
+    }
+
+    void computeSelf(){
+        for(std::size_t idxTarget = 0 ; idxTarget < getNbParticles() ; ++idxTarget){
+            const double tx = double(values[X][idxTarget]);
+            const double ty = double(values[Y][idxTarget]);
+            const double tz = double(values[Z][idxTarget]);
+            const double tv = double(values[PHYSICAL][idxTarget]);
+
+            double  tfx = double(0.);
+            double  tfy = double(0.);
+            double  tfz = double(0.);
+            double  tpo = double(0.);
+
+            for(std::size_t idxSource = idxTarget+1  ; idxSource < nbParticles ; ++idxSource){
+                double dx = tx - double(values[X][idxSource]);
+                double dy = ty - double(values[Y][idxSource]);
+                double dz = tz - double(values[Z][idxSource]);
+
+                double inv_square_distance = double(1) / (dx*dx + dy*dy + dz*dz);
+                const double inv_distance = sqrt(inv_square_distance);
+
+                inv_square_distance *= inv_distance;
+                inv_square_distance *= tv * double(values[PHYSICAL][idxSource]);
+
+                dx *= - inv_square_distance;
+                dy *= - inv_square_distance;
+                dz *= - inv_square_distance;
+
+                tfx += dx;
+                tfy += dy;
+                tfz += dz;
+                tpo += inv_distance * double(values[PHYSICAL][idxSource]);
+
+                values[FX][idxSource] -= dx;
+                values[FY][idxSource] -= dy;
+                values[FZ][idxSource] -= dz;
+                values[POTENTIAL][idxSource] += inv_distance * tv;
+            }
+
+            values[FX][idxTarget] += tfx;
+            values[FY][idxTarget] += tfy;
+            values[FZ][idxTarget] += tfz;
+            values[POTENTIAL][idxTarget] += tpo;
+        }
+    }
+
+    void compute(ParticlesGroup& inOther){
+        for(std::size_t idxTarget = 0 ; idxTarget < inOther.getNbParticles() ; ++idxTarget){
+            const double tx = double(inOther.values[X][idxTarget]);
+            const double ty = double(inOther.values[Y][idxTarget]);
+            const double tz = double(inOther.values[Z][idxTarget]);
+            const double tv = double(inOther.values[PHYSICAL][idxTarget]);
+
+            double  tfx = double(0.);
+            double  tfy = double(0.);
+            double  tfz = double(0.);
+            double  tpo = double(0.);
+
+            for(std::size_t idxSource = 0  ; idxSource < nbParticles ; ++idxSource){
+                double dx = tx - double(values[X][idxSource]);
+                double dy = ty - double(values[Y][idxSource]);
+                double dz = tz - double(values[Z][idxSource]);
+
+                double inv_square_distance = double(1) / (dx*dx + dy*dy + dz*dz);
+                const double inv_distance = sqrt(inv_square_distance);
+
+                inv_square_distance *= inv_distance;
+                inv_square_distance *= tv * double(values[PHYSICAL][idxSource]);
+
+                dx *= - inv_square_distance;
+                dy *= - inv_square_distance;
+                dz *= - inv_square_distance;
+
+                tfx += dx;
+                tfy += dy;
+                tfz += dz;
+                tpo += inv_distance * double(values[PHYSICAL][idxSource]);
+
+                values[FX][idxSource] -= dx;
+                values[FY][idxSource] -= dy;
+                values[FZ][idxSource] -= dz;
+                values[POTENTIAL][idxSource] += inv_distance * tv;
+            }
+
+            inOther.values[FX][idxTarget] += tfx;
+            inOther.values[FY][idxTarget] += tfy;
+            inOther.values[FZ][idxTarget] += tfz;
+            inOther.values[POTENTIAL][idxTarget] += tpo;
+        }
+    }
+
+    /////////////////////////////////////////////////////////////
+
+    #ifdef SPECX_COMPILE_WITH_CUDA  
+    struct DataDescr {
+        std::size_t nbParticles;
+        double* values;
+
+        double* getRawPtr() const{
+            return values;
+        }
+
+        size_t getRawSize() const{
+            return nbParticles*sizeof(float)*NB_VALUE_TYPES;
+        }
+    };
+
+    auto memmovHostToDevice(cudaStream_t stream){
+        double* doubleDevicePtr;
+        CUDA_ASSERT(cudaMalloc(&doubleDevicePtr, nbParticles*NB_VALUE_TYPES*sizeof(double)));
+        for(std::size_t idxValueType = 0 ; idxValueType < NB_VALUE_TYPES ; ++idxValueType){
+            CUDA_ASSERT(cudaMemcpyAsync(&doubleDevicePtr[idxValueType*nbParticles], values[idxValueType].data(), nbParticles*sizeof(double), cudaMemcpyHostToDevice, stream));
+        }
+        return DataDescr{nbParticles, doubleDevicePtr};
+    }
+
+    void memmovDeviceToHost(DataDescr& inDescr, cudaStream_t stream){
+        double* doubleDevicePtr = inDescr.values;
+        for(std::size_t idxValueType = 0 ; idxValueType < NB_VALUE_TYPES ; ++idxValueType){
+            CUDA_ASSERT(cudaMemcpyAsync(values[idxValueType].data(), &doubleDevicePtr[idxValueType*nbParticles], nbParticles*sizeof(double), cudaMemcpyDeviceToHost, stream));
+        }
+        CUDA_ASSERT(cudaFreeAsync(doubleDevicePtr, stream));
+    }
+    #endif
+};
+
+#if defined(SPECX_COMPILE_WITH_CUDA) || defined(SPECX_COMPILE_WITH_HIP)
+template <class T>
+__device__ T CuMin(const T& v1, const T& v2){
+    return v1 < v2 ? v1 : v2;
+}
+
+__global__ void p2p_inner_gpu(void* data, std::size_t size){
+    const std::size_t nbParticles = size/sizeof(double)/ParticlesGroup::NB_VALUE_TYPES;
+    double* values[ParticlesGroup::NB_VALUE_TYPES];
+    for(std::size_t idxValueType = 0 ; idxValueType < ParticlesGroup::NB_VALUE_TYPES ; ++idxValueType){
+        values[idxValueType] = reinterpret_cast<double*>(data)+idxValueType*nbParticles;
+    }
+
+    constexpr std::size_t SHARED_MEMORY_SIZE = 128;
+    const std::size_t nbThreads = blockDim.x*gridDim.x;
+    const std::size_t uniqueId = threadIdx.x + blockIdx.x*blockDim.x;
+    const std::size_t nbIterations = ((nbParticles+nbThreads-1)/nbThreads)*nbThreads;
+
+    for(std::size_t idxTarget = uniqueId ; idxTarget < nbIterations ; idxTarget += nbThreads){
+        const bool threadCompute = (idxTarget<nbParticles);
+
+        double tx;
+        double ty;
+        double tz;
+        double tv;
+
+        if(threadCompute){
+            tx = double(values[ParticlesGroup::X][idxTarget]);
+            ty = double(values[ParticlesGroup::Y][idxTarget]);
+            tz = double(values[ParticlesGroup::Z][idxTarget]);
+            tv = double(values[ParticlesGroup::PHYSICAL][idxTarget]);
+        }
+
+        double  tfx = double(0.);
+        double  tfy = double(0.);
+        double  tfz = double(0.);
+        double  tpo = double(0.);
+
+        for(std::size_t idxCopy = 0 ; idxCopy < nbParticles ; idxCopy += SHARED_MEMORY_SIZE){
+            __shared__ double sourcesX[SHARED_MEMORY_SIZE];
+            __shared__ double sourcesY[SHARED_MEMORY_SIZE];
+            __shared__ double sourcesZ[SHARED_MEMORY_SIZE];
+            __shared__ double sourcesPhys[SHARED_MEMORY_SIZE];
+
+            const std::size_t nbCopies = CuMin(SHARED_MEMORY_SIZE, nbParticles-idxCopy);
+            for(std::size_t idx = threadIdx.x ; idx < nbCopies ; idx += blockDim.x){
+                sourcesX[idx] = values[ParticlesGroup::X][idx+idxCopy];
+                sourcesY[idx] = values[ParticlesGroup::Y][idx+idxCopy];
+                sourcesZ[idx] = values[ParticlesGroup::Z][idx+idxCopy];
+                sourcesPhys[idx] = values[ParticlesGroup::PHYSICAL][idx+idxCopy];
+            }
+
+            __syncthreads();
+
+            if(threadCompute){
+                for(std::size_t otherIndex = 0; otherIndex < nbCopies; ++otherIndex) {
+                    if(idxCopy + otherIndex != idxTarget){
+                        double dx = tx - sourcesX[otherIndex];
+                        double dy = ty - sourcesY[otherIndex];
+                        double dz = tz - sourcesZ[otherIndex];
+
+                        double inv_square_distance = double(1) / (dx*dx + dy*dy + dz*dz);
+                        const double inv_distance = sqrt(inv_square_distance);
+
+                        inv_square_distance *= inv_distance;
+                        inv_square_distance *= tv * sourcesPhys[otherIndex];
+
+                        dx *= - inv_square_distance;
+                        dy *= - inv_square_distance;
+                        dz *= - inv_square_distance;
+
+                        tfx += dx;
+                        tfy += dy;
+                        tfz += dz;
+                        tpo += inv_distance * sourcesPhys[otherIndex];
+                    }
+                }
+            }
+
+            __syncthreads();
+        }
+
+        if( threadCompute ){
+            values[ParticlesGroup::FX][idxTarget] += tfx;
+            values[ParticlesGroup::FY][idxTarget] += tfy;
+            values[ParticlesGroup::FZ][idxTarget] += tfz;
+            values[ParticlesGroup::POTENTIAL][idxTarget] += tpo;
+
+        }
+
+        __syncthreads();
+    }
+}
+
+__global__ void p2p_neigh_gpu(const void* dataSrc, std::size_t sizeSrc,
+                              void* dataTgt, std::size_t sizeTgt){
+    const std::size_t nbParticlesTgt = sizeTgt/sizeof(double)/ParticlesGroup::NB_VALUE_TYPES;
+    double* valuesTgt[ParticlesGroup::NB_VALUE_TYPES];
+    for(std::size_t idxValueType = 0 ; idxValueType < ParticlesGroup::NB_VALUE_TYPES ; ++idxValueType){
+        valuesTgt[idxValueType] = reinterpret_cast<double*>(dataTgt)+idxValueType*nbParticlesTgt;
+    }
+
+    const std::size_t nbParticlesSrc = sizeSrc/sizeof(double)/ParticlesGroup::NB_VALUE_TYPES;
+    const double* valuesSrc[ParticlesGroup::NB_VALUE_TYPES];
+    for(std::size_t idxValueType = 0 ; idxValueType < ParticlesGroup::NB_VALUE_TYPES ; ++idxValueType){
+        valuesSrc[idxValueType] = reinterpret_cast<const double*>(dataSrc)+idxValueType*nbParticlesSrc;
+    }
+
+    constexpr std::size_t SHARED_MEMORY_SIZE = 128;
+    const std::size_t nbThreads = blockDim.x*gridDim.x;
+    const std::size_t uniqueId = threadIdx.x + blockIdx.x*blockDim.x;
+    const std::size_t nbIterations = ((nbParticlesTgt+nbThreads-1)/nbThreads)*nbThreads;
+
+    for(std::size_t idxTarget = uniqueId ; idxTarget < nbIterations ; idxTarget += nbThreads){
+        const bool threadCompute = (idxTarget<nbParticlesTgt);
+
+        double tx;
+        double ty;
+        double tz;
+        double tv;
+
+        if(threadCompute){
+            tx = double(valuesTgt[ParticlesGroup::X][idxTarget]);
+            ty = double(valuesTgt[ParticlesGroup::Y][idxTarget]);
+            tz = double(valuesTgt[ParticlesGroup::Z][idxTarget]);
+            tv = double(valuesTgt[ParticlesGroup::PHYSICAL][idxTarget]);
+        }
+
+        double  tfx = double(0.);
+        double  tfy = double(0.);
+        double  tfz = double(0.);
+        double  tpo = double(0.);
+
+        for(std::size_t idxCopy = 0 ; idxCopy < nbParticlesSrc ; idxCopy += SHARED_MEMORY_SIZE){
+            __shared__ double sourcesX[SHARED_MEMORY_SIZE];
+            __shared__ double sourcesY[SHARED_MEMORY_SIZE];
+            __shared__ double sourcesZ[SHARED_MEMORY_SIZE];
+            __shared__ double sourcesPhys[SHARED_MEMORY_SIZE];
+
+            const std::size_t nbCopies = CuMin(SHARED_MEMORY_SIZE, nbParticlesSrc-idxCopy);
+            for(std::size_t idx = threadIdx.x ; idx < nbCopies ; idx += blockDim.x){
+                sourcesX[idx] = valuesSrc[ParticlesGroup::X][idx+idxCopy];
+                sourcesY[idx] = valuesSrc[ParticlesGroup::Y][idx+idxCopy];
+                sourcesZ[idx] = valuesSrc[ParticlesGroup::Z][idx+idxCopy];
+                sourcesPhys[idx] = valuesSrc[ParticlesGroup::PHYSICAL][idx+idxCopy];
+            }
+
+            __syncthreads();
+
+            if(threadCompute){
+                for(std::size_t otherIndex = 0; otherIndex < nbCopies; ++otherIndex) {
+                    double dx = tx - sourcesX[otherIndex];
+                    double dy = ty - sourcesY[otherIndex];
+                    double dz = tz - sourcesZ[otherIndex];
+
+                    double inv_square_distance = double(1) / (dx*dx + dy*dy + dz*dz);
+                    const double inv_distance = sqrt(inv_square_distance);
+
+                    inv_square_distance *= inv_distance;
+                    inv_square_distance *= tv * sourcesPhys[otherIndex];
+
+                    dx *= - inv_square_distance;
+                    dy *= - inv_square_distance;
+                    dz *= - inv_square_distance;
+
+                    tfx += dx;
+                    tfy += dy;
+                    tfz += dz;
+                    tpo += inv_distance * sourcesPhys[otherIndex];
+                }
+            }
+
+            __syncthreads();
+        }
+
+        if( threadCompute ){
+            valuesTgt[ParticlesGroup::FX][idxTarget] += tfx;
+            valuesTgt[ParticlesGroup::FY][idxTarget] += tfy;
+            valuesTgt[ParticlesGroup::FZ][idxTarget] += tfz;
+            valuesTgt[ParticlesGroup::POTENTIAL][idxTarget] += tpo;
+        }
+
+        __syncthreads();
+    }
+}
+
+#endif
+
+
+#include <random>
+
+template <class NumType = double>
+void FillRandomValues(ParticlesGroup& inGroup, const int inSeed,
+                      const NumType inPhysicalValue = 0.1,
+                      const NumType inMinPos = 0, const NumType inMaxPos = 1){
+    std::mt19937 gen(inSeed);
+    std::uniform_real_distribution<> distrib(inMinPos, inMaxPos);
+
+    std::array<double, ParticlesGroup::NB_VALUE_TYPES> vecPart;
+    std::fill(vecPart.begin(), vecPart.end(), 0);
+    vecPart[ParticlesGroup::PHYSICAL] = inPhysicalValue;
+
+    for(std::size_t idxPart = 0 ; idxPart < inGroup.getNbParticles() ; ++idxPart){
+        vecPart[ParticlesGroup::X] = distrib(gen);
+        vecPart[ParticlesGroup::Y] = distrib(gen);
+        vecPart[ParticlesGroup::Z] = distrib(gen);
+        inGroup.setParticleValues(idxPart, vecPart);
+    }
+}
+
+template <class ValueType = double>
+ValueType ChechAccuracy(const ParticlesGroup& inGroup1, const ParticlesGroup& inGroup2){
+    if(inGroup1.getNbParticles() != inGroup2.getNbParticles()){
+        return std::numeric_limits<ValueType>::infinity();
+    }
+
+    ValueType maxDiff = 0;
+    for(std::size_t idx = 0 ; idx < inGroup1.getNbParticles() ; ++idx){
+        const auto p1 = inGroup1.getParticle(idx);
+        const auto p2 = inGroup2.getParticle(idx);
+
+        for(std::size_t idxVal = 0 ; idxVal < ParticlesGroup::NB_VALUE_TYPES ; ++idxVal){
+            maxDiff = std::max(maxDiff, std::abs((p1[idxVal] - p2[idxVal])/(p1[idxVal] == 0? 1 : p1[idxVal])));
+        }
+    }
+    return maxDiff;
+}
+
+
+struct TuneResult{
+    int nbThreadsInner = 0;
+    int nbBlocksInner = 0;
+    int nbThreadsOuter = 0;
+    int nbBlocksOuter = 0;
+};
+
+
+template <int MaxNbDevices, const bool FavorLocality>
+auto GetPriority(const bool usePrioPairs, const int maxInteractions, const int minInteractions, const int currentInteractions){
+    if(usePrioPairs){
+        const int ratio = 5 * double(currentInteractions-minInteractions)/double(maxInteractions-minInteractions);
+        const int invRatio = 5 - ratio;
+        return SpPriority(SpMultiPrioScheduler<MaxNbDevices,FavorLocality>::GeneratePriorityWorkerPair(invRatio, ratio));
+    }
+    return SpPriority(0);
+}
+
+
+bool isCpuWorker;
+#pragma omp threadprivate(isCpuWorker)
+
+
+#ifdef SPECX_COMPILE_WITH_CUDA
+// Cuda stream
+cudaStream_t threadStream;
+#pragma omp threadprivate(threadStream)
+#endif
+
+template <int MaxNbDevices, const bool FavorLocality>
+auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPartsPerGroup,
+                const int NbGroups, const int nbCpu, const int nbGpu, const TuneResult& inKernelConfig,
+                const int maxInteractions, const int minInteractions){
+
+    std::vector<ParticlesGroup> particleGroups(NbGroups);
+
+    {
+        std::random_device rd;
+        std::uniform_int_distribution<int> dist(MinPartsPerGroup, MaxPartsPerGroup);
+        for(auto& group : particleGroups){
+            const int nbParts = dist(rd);
+            group = ParticlesGroup(nbParts);
+        }
+    }  
+
+    std::vector<double> minMaxAvg(3);
+    minMaxAvg[0] = std::numeric_limits<double>::max();
+    minMaxAvg[1] = std::numeric_limits<double>::min();
+    minMaxAvg[2] = 0;
+
+    for(int idxLoop = 0 ; idxLoop < NbLoops ; ++idxLoop){
+        #pragma omp parallel
+        {
+            isCpuWorker = true;
+            #ifdef SPECX_COMPILE_WITH_CUDA
+            if(nbCpu <= omp_get_thread_num() ){
+                isCpuWorker = false;
+                const int deviceId = (omp_get_thread_num() - nbCpu)/SpCudaUtils::GetNbDevices();
+                CUDA_ASSERT(cudaSetDevice(deviceId));
+                CUDA_ASSERT(cudaStreamCreate(&threadStream));
+            }
+            #endif
+
+            #pragma omp master
+            {
+                SpTimer timer;
+                #pragma omp taskgroup
+                {
+
+                    for(int idxGroup1 = 0 ; idxGroup1 < int(particleGroups.size()) ; ++idxGroup1){
+                        auto& group1 = particleGroups[idxGroup1];
+                        const int priority = GetPriority<MaxNbDevices,FavorLocality>(false, maxInteractions, minInteractions, group1.getNbParticles()*group1.getNbParticles()).getPriority();
+                        #pragma omp task depend(mutexinout:group1) firstprivate(group1) priority(priority) default(none)
+                        {
+                            if(isCpuWorker){
+                                group1.computeSelf();
+                            }
+                            #ifdef SPECX_COMPILE_WITH_CUDA
+                            else{
+                                auto descr = group1.memmovHostToDevice(threadStream);
+                                p2p_inner_gpu<<<inKernelConfig.nbBlocksInner,inKernelConfig.nbThreadsInner,0,threadStream>>>
+                                                 (descr.getRawPtr(), descr.getRawSize());
+                                group1.memmovDeviceToHost(descr, threadStream);
+                                CUDA_ASSERT(cudaStreamSynchronize(threadStream));
+                            }
+                            #endif
+                        }
+                    }
+
+                    for(int idxGroup1 = 0 ; idxGroup1 < int(particleGroups.size()) ; ++idxGroup1){
+                        auto& group1 = particleGroups[idxGroup1];
+
+                        const int idxGroup2 = (idxGroup1+1)%particleGroups.size();
+                        auto& group2 = particleGroups[idxGroup2];
+
+                        const int priority = GetPriority<MaxNbDevices,FavorLocality>(false, maxInteractions, minInteractions, group1.getNbParticles()*group2.getNbParticles()).getPriority();
+
+                        #pragma omp task depend(mutexinout:group1, group2) firstprivate(group1, group2) priority(priority) default(none)
+                        {
+                            if(isCpuWorker){
+                                group1.compute(group2);
+                            }
+                            #ifdef SPECX_COMPILE_WITH_CUDA
+                            else{
+                                auto descr1 = group1.memmovHostToDevice(threadStream);
+                                auto descr2 = group2.memmovHostToDevice(threadStream);
+                                p2p_neigh_gpu<<<inKernelConfig.nbBlocksOuter,inKernelConfig.nbThreadsOuter,0,threadStream>>>
+                                                (descr2.getRawPtr(), descr2.getRawSize(), descr1.getRawPtr(), descr1.getRawSize());
+                                p2p_neigh_gpu<<<inKernelConfig.nbBlocksOuter,inKernelConfig.nbThreadsOuter,0,threadStream>>>
+                                                (descr1.getRawPtr(), descr1.getRawSize(), descr2.getRawPtr(), descr2.getRawSize());
+                                group1.memmovDeviceToHost(descr1, threadStream);
+                                group2.memmovDeviceToHost(descr2, threadStream);
+                                CUDA_ASSERT(cudaStreamSynchronize(threadStream));
+                            }
+                            #endif
+                        }
+                    }
+                }
+
+                timer.stop();
+
+                minMaxAvg[0] = std::min(minMaxAvg[0], timer.getElapsed());
+                minMaxAvg[1] = std::max(minMaxAvg[1], timer.getElapsed());
+                minMaxAvg[2] += timer.getElapsed();
+            }
+
+            #pragma omp barrier
+
+            #ifdef SPECX_COMPILE_WITH_CUDA
+            if(isCpuWorker == false){
+                CUDA_ASSERT(cudaStreamDestroy(threadStream));
+            }
+            #endif
+        }
+    }
+
+    minMaxAvg[2] /= NbLoops;
+
+    return minMaxAvg;
+}
+
+void BenchmarkTest(int argc, char** argv, const TuneResult& inKernelConfig){
+    CLsimple args("Particles", argc, argv);
+
+    args.addParameterNoArg({"help"}, "help");
+
+    int NbLoops = 100;
+    args.addParameter<int>({"lp" ,"nbloops"}, "NbLoops", NbLoops, 100);
+
+    int MinPartsPerGroup;
+    args.addParameter<int>({"minp"}, "MinPartsPerGroup", MinPartsPerGroup, 100);
+
+    int MaxPartsPerGroup;
+    args.addParameter<int>({"maxp"}, "MaxPartsPerGroup", MaxPartsPerGroup, 300);
+
+    int MinNbGroups;
+    args.addParameter<int>({"minnbgroups"}, "MinNbGroups", MinNbGroups, 10);
+
+    int MaxNbGroups;
+    args.addParameter<int>({"maxnbgroups"}, "MaxNbGroups", MaxNbGroups, 10);
+
+    std::string outputDir = "./";
+    args.addParameter<std::string>({"od"}, "outputdir", outputDir, outputDir);
+
+    args.parse();
+
+    if(!args.isValid() || args.hasKey("help")){
+      // Print the help
+      args.printHelp(std::cout);
+      return;
+    }
+
+    const int maxInteractions = MaxPartsPerGroup*MaxPartsPerGroup;
+    const int minInteractions = MinPartsPerGroup*MinPartsPerGroup;
+
+    assert(MinPartsPerGroup <= MaxPartsPerGroup);
+    assert(MinNbGroups <= MaxNbGroups);
+
+#ifdef SPECX_COMPILE_WITH_CUDA  
+    SpCudaUtils::PrintInfo(); 
+    const int nbMaxGpus = SpCudaUtils::GetNbDevices();
+#else    
+    const int nbMaxGpus = 0;
+#endif 
+
+    std::vector<std::vector<double>> allDurations;
+
+    for(int idxGpu = -1 ; idxGpu <= nbMaxGpus ; ++idxGpu){
+        for(int idxBlock = MinNbGroups ; idxBlock <= MaxNbGroups ; idxBlock *= 2){
+            const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
+            const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+
+            std::cout << "NbGpu = " << idxGpu << " BlockSize = " << idxBlock << std::endl;              
+            const auto minMaxAvg = BenchCore<8,true>(NbLoops, MinPartsPerGroup,
+                                                    MaxPartsPerGroup, idxBlock, nbCpus, nbGpus, inKernelConfig,
+                                                    maxInteractions, minInteractions);
+            allDurations.push_back(minMaxAvg);
+            std::cout << " - Min = " << minMaxAvg[0] << " Max = " << minMaxAvg[1] << " Avg = " << minMaxAvg[2] << std::endl;  
+        }
+    }
+
+    std::ofstream file(outputDir + "/particle-simu-omp.csv");
+    if(!file.is_open()){
+        std::cerr << "Cannot open file " << outputDir + "/particle-simu-omp.csv" << std::endl;
+        return;
+    }
+
+    file << "NbCpu,NbGpu,BlockSize,MinDuration,MaxDuration,AvgDuration" << std::endl;
+    int idxDuration = 0;
+    for(int idxGpu = -1 ; idxGpu <= nbMaxGpus ; ++idxGpu){
+        for(int idxBlock = MinNbGroups ; idxBlock <= MaxNbGroups ; idxBlock *= 2){
+            const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
+            const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+
+            file << nbCpus << "," << nbGpus << "," << idxBlock << "," 
+                    << allDurations[idxDuration][0] << "," 
+                    << allDurations[idxDuration][1] << "," 
+                    << allDurations[idxDuration][2] << std::endl;
+            idxDuration += 1;
+        }
+    }
+}
+
+
+int main(int argc, char** argv){
+
+    TuneResult tuneConfig{256, 128, 256, 128};
+    BenchmarkTest(argc, argv, tuneConfig);
+
+    return 0;
+}
diff --git a/Src/Compute/SpWorkerTeamBuilder.hpp b/Src/Compute/SpWorkerTeamBuilder.hpp
index 9c02b7d60401a189cf06bd7ed60fd5417159de1f..833e618675b2ab97a714e61d91e677b2a0dc3d67 100644
--- a/Src/Compute/SpWorkerTeamBuilder.hpp
+++ b/Src/Compute/SpWorkerTeamBuilder.hpp
@@ -42,9 +42,11 @@ static small_vector<std::unique_ptr<SpWorker>> TeamOfCudaWorkers(const int nbWor
     return res;
 }
 
-static small_vector<std::unique_ptr<SpWorker>> TeamOfCpuCudaWorkers(const int nbCpuWorkers = SpUtils::DefaultNumThreads(),
+static small_vector<std::unique_ptr<SpWorker>> TeamOfCpuCudaWorkers(const int nbCpuWorkersInit = -1,
                                              int nbCudaWorkers = SpCudaUtils::GetNbDevices(),
                                              const int nbWorkerPerCudas = SpCudaUtils::GetDefaultNbStreams()) {
+    const int nbCpuWorkers = (nbCpuWorkersInit != -1 ? nbCpuWorkersInit : std::max(1, SpUtils::DefaultNumThreads()-nbCudaWorkers));
+
     if(SpCudaUtils::GetNbDevices() < nbCudaWorkers){
         std::cout << "[SPECX] The number of devices asked ("
                   << nbCudaWorkers << ") is above the real number of devices ("
@@ -76,7 +78,7 @@ static auto TeamOfCpuGpuWorkers(Args&& ... args) {
 }
 #endif
 #ifdef SPECX_COMPILE_WITH_HIP
-static small_vector<std::unique_ptr<SpWorker>> TeamOfHipWorkers(const int nbWorkerPerHips = SpHipUtils::GetDefaultNbStreams(),
+static small_vector<std::unique_ptr<SpWorker>> TeamOfHipWorkers(const int nbWorkerPerHipsInit = SpHipUtils::GetDefaultNbStreams(),
                                              int nbHipWorkers = SpHipUtils::GetNbDevices()) {
     if(SpHipUtils::GetNbDevices() < nbHipWorkers){
         std::cout << "[SPECX] The number of devices asked ("
@@ -99,9 +101,10 @@ static small_vector<std::unique_ptr<SpWorker>> TeamOfHipWorkers(const int nbWork
     return res;
 }
 
-static small_vector<std::unique_ptr<SpWorker>> TeamOfCpuHipWorkers(const int nbCpuWorkers = SpUtils::DefaultNumThreads(),
+static small_vector<std::unique_ptr<SpWorker>> TeamOfCpuHipWorkers(const int nbCpuWorkers = -1,
                                              const int nbWorkerPerHips = SpHipUtils::GetDefaultNbStreams(),
                                              int nbHipWorkers = SpHipUtils::GetNbDevices()) {
+    const int nbCpuWorkers = (nbCpuWorkersInit != -1 ? nbCpuWorkersInit : std::max(1, SpUtils::DefaultNumThreads()-nbWorkerPerHips));
     if(SpHipUtils::GetNbDevices() < nbHipWorkers){
         std::cout << "[SPECX] The number of devices asked ("
                   << nbHipWorkers << ") is above the real number of devices ("