diff --git a/Batch/run-benchs.sh b/Batch/run-benchs.sh
index 5c4c62d8ef99c5687a9a58a8954bc0807a9529bb..f9d9d1224f069b07559d5ed8ab600c0142d892f7 100644
--- a/Batch/run-benchs.sh
+++ b/Batch/run-benchs.sh
@@ -43,8 +43,6 @@ function main(){
         return 1
     fi
     
-    cd "/home/bramas/spetabaru-project/results/"
-
     # Create a directory to store the results
     # with the format results_[date]_[time]
     results_dir="results-$PREFIX-$(date +%Y%m%d_%H%M%S)"
@@ -56,18 +54,18 @@ function main(){
     NB_LOOPS=10
 
     # AXPY
-    "$RUN_DIR/Benchmark/axpy/axpy" --lp=$NB_LOOPS --minnbb=16 --maxnbb=256 --minbs=128 --maxbs=65536 --gputh=256 --od="$results_dir" >> "$results_dir/output_axpy.txt"
+    "$RUN_DIR/Benchmark/axpy/axpy" --lp=$NB_LOOPS --minnbb=256 --maxnbb=1024 --minbs=131072 --maxbs=524288 --gputh=256 --od="$results_dir" >> "$results_dir/output_axpy.txt"
     remove_core_files
 
     # Cholesky/gemm
-    "$RUN_DIR/Benchmark/cholesky_gemm/cholesky" --lp=$NB_LOOPS --minms=4096 --maxms=8192 --minbs=128 --maxbs=512 --od="$results_dir" >> "$results_dir/output_cholesky.txt"
+    "$RUN_DIR/Benchmark/cholesky_gemm/cholesky" --lp=$NB_LOOPS --minms=4096 --maxms=16384 --minbs=128 --maxbs=512 --od="$results_dir" >> "$results_dir/output_cholesky.txt"
     remove_core_files
 
-    "$RUN_DIR/Benchmark/cholesky_gemm/gemm" --lp=$NB_LOOPS --minms=4096 --maxms=8192 --minbs=128 --maxbs=512 --od="$results_dir" >> "$results_dir/output_gemm.txt"
+    "$RUN_DIR/Benchmark/cholesky_gemm/gemm" --lp=$NB_LOOPS --minms=4096 --maxms=16384 --minbs=128 --maxbs=512 --od="$results_dir" >> "$results_dir/output_gemm.txt"
     remove_core_files
 
     # Particles
-    "$RUN_DIR/Benchmark/particles/particles-simu" --lp=$NB_LOOPS --minp=500 --maxp=8000 --minnbgroups=128 --maxnbgroups=512 --od="$results_dir" >> "$results_dir/output_particles.txt"
+    "$RUN_DIR/Benchmark/particles/particles-simu" --lp=$NB_LOOPS --minp=500 --maxp=512 --minnbgroups=128 --maxnbgroups=1024 --od="$results_dir" >> "$results_dir/output_particles.txt"
     remove_core_files
 }
 
diff --git a/Benchmark/axpy/axpy.cpp b/Benchmark/axpy/axpy.cpp
index de4f1b31507807b95fb331ea63a8b809184b0c46..448bd33c1cc5116a96f86f6097efe713bc08ee2e 100644
--- a/Benchmark/axpy/axpy.cpp
+++ b/Benchmark/axpy/axpy.cpp
@@ -76,7 +76,8 @@ __global__ void cu_axpy(int n, NumType a, NumType *x, NumType *y, NumType *out)
 #endif
 
 
-auto BenchmarkTest(const int NbLoops, const int nbGpu, const int nbblocks, const int blocksize, const int gpunbthreads,
+template <int MaxNbDevices, const bool FavorLocality>
+auto BenchmarkTest(const int NbLoops, const int nbCpu, const int nbGpu, const int nbblocks, const int blocksize, const int gpunbthreads,
                     const bool useMultiPrioScheduler){   
     std::vector<Vector<float>> x(nbblocks, Vector<float>(blocksize, 1));
     std::vector<Vector<float>> y(nbblocks, Vector<float>(blocksize, 1));
@@ -89,11 +90,11 @@ auto BenchmarkTest(const int NbLoops, const int nbGpu, const int nbblocks, const
         scheduler = std::unique_ptr<SpAbstractScheduler>(new SpHeterogeneousPrioScheduler());
     }
     else{
-        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler());
+        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>(nbGpu*SpCudaUtils::GetDefaultNbStreams()));
     }
-    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(SpUtils::DefaultNumThreads(), nbGpu), std::move(scheduler));
+    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(nbCpu, nbGpu), std::move(scheduler));
 #else
-    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers());
+    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers(nbCpu));
 #endif
 
     std::vector<double> minMaxAvg(3);
@@ -123,6 +124,7 @@ auto BenchmarkTest(const int NbLoops, const int nbGpu, const int nbblocks, const
                     const int gpunbblocks = (size + gpunbthreads-1)/gpunbthreads;
                     cu_axpy<float><<<gpunbblocks, gpunbthreads,0,SpCudaUtils::GetCurrentStream()>>>
                         (size, a, (float*)paramX.getRawPtr(), (float*)paramY.getRawPtr(), (float*)paramZ.getRawPtr());
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
                 })
 #endif
 #ifdef SPECX_COMPILE_WITH_HIP
@@ -133,6 +135,7 @@ auto BenchmarkTest(const int NbLoops, const int nbGpu, const int nbblocks, const
                     const int gpunbblocks = (size + gpunbthreads-1)/gpunbthreads;
                     hipLaunchKernelGGL( cu_axpy<float>, gpunbblocks, gpunbthreads,0,SpHipUtils::GetCurrentStream(),
                         size, a, (float*)paramX.getRawPtr(), (float*)paramY.getRawPtr(), (float*)paramZ.getRawPtr());
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
                 })
 #endif
             );
@@ -242,13 +245,27 @@ int main(int argc, char** argv){
 
     std::vector<std::vector<double>> allDurations;
 
-    for(bool useMultiprio: std::vector<bool>{true, false}){
-        for(int idxGpu = 0 ; idxGpu <= nbGpus ; ++idxGpu){
+    const auto schedPairConf = std::vector<std::tuple<bool,bool,bool>>{std::make_tuple(false, false,false),
+                                                                 std::make_tuple(true, false, false),
+                                                                 std::make_tuple(true, true, false),
+                                                                 std::make_tuple(true, true, true),
+                                                                 std::make_tuple(true, false, true)};
+
+    for(auto useMultiprioAndPairs: schedPairConf){
+        for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
+            const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
+            const int nbGpusUsed = (idxGpu == -1 ? 0 : idxGpu);
+            const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
+            const bool usePrioPairs = std::get<1>(useMultiprioAndPairs);
+            const bool useLocality = std::get<2>(useMultiprioAndPairs);
             std::cout << " - Gpu = " << idxGpu << " Multiprio = " << (useMultiprio?"TRUE":"FALSE") << std::endl;
             for(int idxNbBlocks = minnbblocks ; idxNbBlocks <= maxnbblocks ; idxNbBlocks *= 2){
                 for(int idxSize = minblocksize ; idxSize <= maxblocksize ; idxSize *= 2){
+
                     std::cout << "  - NbBlocks = " << idxNbBlocks << " BlockSize = " << idxSize << std::endl;
-                    const auto minMaxAvg = BenchmarkTest(NbLoops, idxGpu, idxNbBlocks, idxSize, gpunbthreads, useMultiprio);
+                    const auto minMaxAvg = (useLocality ?
+                                            BenchmarkTest<8,true>(NbLoops, nbCpus, nbGpusUsed, idxNbBlocks, idxSize, gpunbthreads, useMultiprio)
+                                            : BenchmarkTest<8,false>(NbLoops, nbCpus, nbGpusUsed, idxNbBlocks, idxSize, gpunbthreads, useMultiprio));
                     std::cout << "     - Duration = " << minMaxAvg[0] << " " << minMaxAvg[1] << " " << minMaxAvg[2] << std::endl;
                     std::cout << "     - Transfers = " << minMaxAvg[3] << " " << minMaxAvg[4] << " " << minMaxAvg[5] << " " << minMaxAvg[6] << " " << minMaxAvg[7] << std::endl;
                     std::cout << "     - End" << std::endl;
@@ -265,16 +282,22 @@ int main(int argc, char** argv){
         return 1;
     }
 
-    file << "NbGpu,NbBlocks,BlockSize,Multiprio,PrioPair,FavorLocality,MinDuration,MaxDuration,AvgDuration,TotalTransfer,MaxTransfer,DeviceToHostTransfer,HostToDeviceTransfer,DeviceToDeviceTransfer" << std::endl;
+    file << "NbCpus,NbGpu,NbBlocks,BlockSize,Multiprio,PrioPair,FavorLocality,MinDuration,MaxDuration,AvgDuration,TotalTransfer,MaxTransfer,DeviceToHostTransfer,HostToDeviceTransfer,DeviceToDeviceTransfer" << std::endl;
     int idxDuration = 0;
-    for(bool useMultiprio: std::vector<bool>{true, false}){
-        for(int idxGpu = 0 ; idxGpu <= nbGpus ; ++idxGpu){
+    for(auto useMultiprioAndPairs: schedPairConf){
+        const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
+        const bool usePrioPairs = std::get<1>(useMultiprioAndPairs);
+        const bool useLocality = std::get<2>(useMultiprioAndPairs);
+        for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
             for(int idxNbBlocks = minnbblocks ; idxNbBlocks <= maxnbblocks ; idxNbBlocks *= 2){
                 for(int idxSize = minblocksize ; idxSize <= maxblocksize ; idxSize *= 2){
-                    file << idxGpu << "," << idxNbBlocks << "," << idxSize << "," 
-                        << (useMultiprio?"TRUE":"FALSE") << ","
-                        << "FALSE" << ","
-                        << "FALSE" << ","
+                    const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
+                    const int nbGpusUsed = (idxGpu == -1 ? 0 : idxGpu);
+
+                    file << nbCpus << "," << nbGpusUsed << "," << idxNbBlocks << "," << idxSize << ","
+                        << (useMultiprio?"TRUE":"FALSE") << ","
+                        << (usePrioPairs?"TRUE":"FALSE") << ","
+                        << (useLocality?"TRUE":"FALSE") << ","
                         << allDurations[idxDuration][0] << "," 
                         << allDurations[idxDuration][1] << "," 
                         << allDurations[idxDuration][2] << ","
diff --git a/Benchmark/cholesky_gemm/CMakeLists.txt b/Benchmark/cholesky_gemm/CMakeLists.txt
index c743c852249f13f08024ec0ba70091cbedea257e..55f4cc58a498d1dc48b602591d176a5a2150bb6c 100644
--- a/Benchmark/cholesky_gemm/CMakeLists.txt
+++ b/Benchmark/cholesky_gemm/CMakeLists.txt
@@ -14,6 +14,13 @@ find_package(BLAS)
 find_package(LAPACK)
 
 if(BLAS_FOUND AND LAPACK_FOUND)
+	# If the BLAS/LAPACK libraries are MKL-based
+	if("${BLAS_LIBRARIES}" MATCHES "mkl")
+	    # Replace "intel_thread" by "sequential" (quote the lists so the ";" separators survive)
+		string(REPLACE "intel_thread" "sequential" BLAS_LIBRARIES "${BLAS_LIBRARIES}")
+		string(REPLACE "intel_thread" "sequential" LAPACK_LIBRARIES "${LAPACK_LIBRARIES}")
+	endif()
+
 	if($ENV{VERBOSE})
 	    MESSAGE(STATUS "Benchmark CHOLESKY_GEMM -- BLAS_LIBRARIES   : ${BLAS_LIBRARIES}")
 	    MESSAGE(STATUS "Benchmark CHOLESKY_GEMM -- BLAS_LINKER_FLAGS   : ${BLAS_LINKER_FLAGS}")
diff --git a/Benchmark/cholesky_gemm/cholesky-mpi.cpp b/Benchmark/cholesky_gemm/cholesky-mpi.cpp
index b30af01dea959c69534b205e60d2f13eed3a1fc8..203986780f29affb472033ea5017d621dd41b77c 100644
--- a/Benchmark/cholesky_gemm/cholesky-mpi.cpp
+++ b/Benchmark/cholesky_gemm/cholesky-mpi.cpp
@@ -69,7 +69,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
         scheduler = std::unique_ptr<SpAbstractScheduler>(new SpHeterogeneousPrioScheduler());
     }
     else{
-        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>());
+        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>(nbGpu*SpCudaUtils::GetDefaultNbStreams()));
     }
     SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(SpUtils::DefaultNumThreads(), nbGpu), std::move(scheduler));
 #else
@@ -165,6 +165,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                         handle.solverBuffer,
                         handle.solverBuffSize,
                         handle.cuinfo));
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
     #ifndef NDEBUG
                     int info;
                     CUDA_ASSERT(cudaMemcpy(&info, handle.cuinfo, sizeof(int), cudaMemcpyDeviceToHost));
@@ -183,6 +184,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                             handle.solverBuffer,
                             handle.solverBuffSize,
                             handle.cuinfo));
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
     #ifndef NDEBUG
                         int info;
                         HIP_ASSERT(hipMemcpy(&info, handle.cuinfo, sizeof(int), hipMemcpyDeviceToHost));
@@ -216,6 +218,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                                                     CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT,
                                 inBlockDim, inBlockDim, &one, (const double*)paramA.getRawPtr(), inBlockDim,
                                 (double*)paramB.getRawPtr(), inBlockDim));
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
                     })
         #endif
         #ifdef SPECX_COMPILE_WITH_HIP
@@ -226,6 +229,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                                                     HIPBLAS_OP_T, HIPBLAS_DIAG_NON_UNIT,
                                 inBlockDim, inBlockDim, &one, (const double*)paramA.getRawPtr(), inBlockDim,
                                 (double*)paramB.getRawPtr(), inBlockDim));
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
                     })
         #endif
                         ).setTaskName(std::string("TRSM -- (R-")+std::to_string(k)+","+std::to_string(k)+") (W-"+std::to_string(m)+","+std::to_string(k)+")");
@@ -256,6 +260,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                         CUBLAS_ASSERT( cublasDsyrk( handle.blasHandle, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
                                 inBlockDim, inBlockDim, &minusone, (const double*)paramA.getRawPtr(), inBlockDim,
                                 &one, (double*)paramC.getRawPtr(), inBlockDim ) );
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
                     })
         #endif
         #ifdef SPECX_COMPILE_WITH_HIP
@@ -267,6 +272,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                         HIPBLAS_ASSERT(hipblasDsyrk( handle.blasHandle, HIPBLAS_FILL_MODE_UPPER, HIPBLAS_OP_N,
                                 inBlockDim, inBlockDim, &minusone, (const double*)paramA.getRawPtr(), inBlockDim,
                                 &one, (double*)paramC.getRawPtr(), inBlockDim ) );
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
                     })
         #endif
                         ).setTaskName(std::string("SYRK -- (R-")+std::to_string(n)+","+std::to_string(k)+") (W-"+std::to_string(n)+","+std::to_string(n)+")");
@@ -296,6 +302,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                                     inBlockDim, inBlockDim, inBlockDim, &one, (const double*)paramA.getRawPtr(), inBlockDim,
                                     (const double*)paramB.getRawPtr(), inBlockDim,
                                     &minusone, (double*)paramC.getRawPtr(), inBlockDim ) );
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
                         })
             #endif
             #ifdef SPECX_COMPILE_WITH_HIP
@@ -307,6 +314,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                                     inBlockDim, inBlockDim, inBlockDim, &one, (const double*)paramA.getRawPtr(), inBlockDim,
                                     (const double*)paramB.getRawPtr(), inBlockDim,
                                     &minusone, (double*)paramC.getRawPtr(), inBlockDim ) );
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
                         })
             #endif
                             ).setTaskName(std::string("GEMM -- (R-")+std::to_string(m)+","+std::to_string(k)+")(R-"+std::to_string(n)+","+std::to_string(k)+")(W-"+std::to_string(m)+","+std::to_string(n)+")");
diff --git a/Benchmark/cholesky_gemm/cholesky.cpp b/Benchmark/cholesky_gemm/cholesky.cpp
index 26764d6314e757cd1e3e66f3a39881864af58dc7..6e85cb174a53327b43766a09401c95f479b6ca43 100644
--- a/Benchmark/cholesky_gemm/cholesky.cpp
+++ b/Benchmark/cholesky_gemm/cholesky.cpp
@@ -58,7 +58,7 @@ thread_local HipHandles handle;
 
 template <int MaxNbDevices, const bool FavorLocality>
 auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const int inMatrixDim, const int inBlockDim,
-                           const int nbGpu, const bool useMultiPrioScheduler, const bool usePrioPairs){
+                           const int nbCpu, const int nbGpu, const bool useMultiPrioScheduler, const bool usePrioPairs){
     const int nbBlocks = (inMatrixDim+inBlockDim-1)/inBlockDim;
 
 #if defined(SPECX_COMPILE_WITH_CUDA) || defined(SPECX_COMPILE_WITH_HIP)
@@ -67,11 +67,11 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
         scheduler = std::unique_ptr<SpAbstractScheduler>(new SpHeterogeneousPrioScheduler());
     }
     else{
-        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>());
+        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>(nbGpu*SpCudaUtils::GetDefaultNbStreams()));
     }
-    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(SpUtils::DefaultNumThreads(), nbGpu), std::move(scheduler));
+    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(nbCpu, nbGpu), std::move(scheduler));
 #else
-    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers());
+    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers(nbCpu));
 #endif
 
 #ifdef SPECX_COMPILE_WITH_CUDA
@@ -162,6 +162,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                     handle.solverBuffer,
                     handle.solverBuffSize,
                     handle.cuinfo));
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
     #ifndef NDEBUG
                 int info;
                 CUDA_ASSERT(cudaMemcpy(&info, handle.cuinfo, sizeof(int), cudaMemcpyDeviceToHost));
@@ -180,6 +181,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                     handle.solverBuffer,
                     handle.solverBuffSize,
                     handle.cuinfo));
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
     #ifndef NDEBUG
                 int info;
                 HIP_ASSERT(hipMemcpy(&info, handle.cuinfo, sizeof(int), hipMemcpyDeviceToHost));
@@ -206,6 +208,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                                                 CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT,
                             inBlockDim, inBlockDim, &one, (const double*)paramA.getRawPtr(), inBlockDim,
                             (double*)paramB.getRawPtr(), inBlockDim));
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
                 })
         #endif
         #ifdef SPECX_COMPILE_WITH_HIP
@@ -216,6 +219,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                             HIPBLAS_OP_T, HIPBLAS_DIAG_NON_UNIT,
                             inBlockDim, inBlockDim, &one, (const double*)paramA.getRawPtr(), inBlockDim,
                             (double*)paramB.getRawPtr(), inBlockDim));
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
                 })
         #endif
                         ).setTaskName(std::string("TRSM -- (R-")+std::to_string(k)+","+std::to_string(k)+") (W-"+std::to_string(m)+","+std::to_string(k)+")");
@@ -239,6 +243,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                     CUBLAS_ASSERT( cublasDsyrk( handle.blasHandle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N,
                             inBlockDim, inBlockDim, &minusone, (const double*)paramA.getRawPtr(), inBlockDim,
                             &one, (double*)paramC.getRawPtr(), inBlockDim ) );
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
                 })
         #endif
         #ifdef SPECX_COMPILE_WITH_HIP
@@ -249,6 +254,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                     HIPBLAS_ASSERT(hipblasDsyrk( handle.blasHandle, HIPBLAS_FILL_MODE_LOWER, HIPBLAS_OP_N,
                             inBlockDim, inBlockDim, &minusone, (const double*)paramA.getRawPtr(), inBlockDim,
                             &one, (double*)paramC.getRawPtr(), inBlockDim ) );
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
                 })
         #endif
                         ).setTaskName(std::string("SYRK -- (R-")+std::to_string(n)+","+std::to_string(k)+") (W-"+std::to_string(n)+","+std::to_string(n)+")");
@@ -271,6 +277,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                                 inBlockDim, inBlockDim, inBlockDim, &one, (const double*)paramA.getRawPtr(), inBlockDim,
                                 (const double*)paramB.getRawPtr(), inBlockDim,
                                 &minusone, (double*)paramC.getRawPtr(), inBlockDim ) );
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
                     })
             #endif
             #ifdef SPECX_COMPILE_WITH_HIP
@@ -282,6 +289,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                                 inBlockDim, inBlockDim, inBlockDim, &one, (const double*)paramA.getRawPtr(), inBlockDim,
                                 (const double*)paramB.getRawPtr(), inBlockDim,
                                 &minusone, (double*)paramC.getRawPtr(), inBlockDim ) );
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
                     })
             #endif
                             ).setTaskName(std::string("GEMM -- (R-")+std::to_string(m)+","+std::to_string(k)+")(R-"+std::to_string(n)+","+std::to_string(k)+")(W-"+std::to_string(m)+","+std::to_string(n)+")");
@@ -426,9 +434,12 @@ int main(int argc, char** argv){
                                                                  std::make_tuple(true, false, true)};
 
     for(auto useMultiprioAndPairs: schedPairConf){
-        for(int idxGpu = 0 ; idxGpu <= nbGpus ; ++idxGpu){
+        for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
             for(int BlockSize = MinBlockSize ; BlockSize <= MaxBlockSize ; BlockSize *= 2){
                 for(int MatrixSize = MinMatrixSize ; MatrixSize <= MaxMatrixSize ; MatrixSize *= 2){
+                    const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
+                    const int nbGpusUsed = (idxGpu == -1 ? 0 : idxGpu);
+
                     const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                     const bool usePrioPairs = std::get<1>(useMultiprioAndPairs);
                     const bool useLocality = std::get<2>(useMultiprioAndPairs);
@@ -460,9 +471,9 @@ int main(int argc, char** argv){
                     /////////////////////////////////////////////////////////
                     const auto minMaxAvg = (useLocality ? 
                                                 choleskyFactorization<8,true>(NbLoops, blocks.get(), MatrixSize, BlockSize, 
-                                                                idxGpu, useMultiprio, usePrioPairs) :
+                                                                nbCpus, nbGpusUsed, useMultiprio, usePrioPairs) :
                                                 choleskyFactorization<8,false>(NbLoops, blocks.get(), MatrixSize, BlockSize, 
-                                                                idxGpu, useMultiprio, usePrioPairs));
+                                                                nbCpus, nbGpusUsed, useMultiprio, usePrioPairs));
                     allDurations.push_back(minMaxAvg);
                     std::cout << "     - Duration = " << minMaxAvg[0] << " " << minMaxAvg[1] << " " << minMaxAvg[2] << std::endl;
                     std::cout << "     - Transfers = " << minMaxAvg[3] << " " << minMaxAvg[4] << " " << minMaxAvg[5] << " " << minMaxAvg[6] << " " << minMaxAvg[7] << std::endl;
@@ -492,17 +503,20 @@ int main(int argc, char** argv){
         return 1;
     }
 
-    file << "NbGpu,MatrixSize,BlockSize,Multiprio,PrioPair,FavorLocality,MinDuration,MaxDuration,AvgDuration,TotalTransfer,MaxTransfer,DeviceToHostTransfer,HostToDeviceTransfer,DeviceToDeviceTransfer" << std::endl;
+    file << "NbCpu,NbGpu,MatrixSize,BlockSize,Multiprio,PrioPair,FavorLocality,MinDuration,MaxDuration,AvgDuration,TotalTransfer,MaxTransfer,DeviceToHostTransfer,HostToDeviceTransfer,DeviceToDeviceTransfer" << std::endl;
     int idxDuration = 0;
     for(auto useMultiprioAndPairs: schedPairConf){
-        for(int idxGpu = 0 ; idxGpu <= nbGpus ; ++idxGpu){
+        for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
             for(int BlockSize = MinBlockSize ; BlockSize <= MaxBlockSize ; BlockSize *= 2){
                 for(int MatrixSize = MinMatrixSize ; MatrixSize <= MaxMatrixSize ; MatrixSize *= 2){
+                    const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
+                    const int nbGpusUsed = (idxGpu == -1 ? 0 : idxGpu);
+
                     const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                     const bool usePrioPairs = std::get<1>(useMultiprioAndPairs);
                     const bool useLocality = std::get<2>(useMultiprioAndPairs);
 
-                    file << idxGpu << "," << MatrixSize << "," << BlockSize << "," 
+                    file << nbCpus << "," << nbGpusUsed << "," << MatrixSize << "," << BlockSize << ","
                         << (useMultiprio?"TRUE":"FALSE") << ","
                         << (usePrioPairs?"TRUE":"FALSE") << ","
                         << (useLocality?"TRUE":"FALSE") << ","
diff --git a/Benchmark/cholesky_gemm/gemm-mpi.cpp b/Benchmark/cholesky_gemm/gemm-mpi.cpp
index 6ce8348396184e87898d49caabf110bd7eab2618..d2e01e82b04b012688257ed6c6da964095e9dd3e 100644
--- a/Benchmark/cholesky_gemm/gemm-mpi.cpp
+++ b/Benchmark/cholesky_gemm/gemm-mpi.cpp
@@ -65,7 +65,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks
         scheduler = std::unique_ptr<SpAbstractScheduler>(new SpHeterogeneousPrioScheduler());
     }
     else{
-        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>());
+        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>(nbGpu*SpCudaUtils::GetDefaultNbStreams()));
     }
     SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(SpUtils::DefaultNumThreads(), nbGpu), std::move(scheduler));
 #else
@@ -138,7 +138,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks
         for(int k = 0 ; k < processBlockDim ; ++k){
             for(int i = 0 ; i < processBlockDim ; ++i){
                 for(int j = 0 ; j < processBlockDim ; ++j){
-                    tg.task(SpPriority(1), SpCommutativeWrite(blocksC[i*processBlockDim+j]),
+                    tg.task(SpPriority(1), SpWrite(blocksC[i*processBlockDim+j]),
                             SpRead(blocksA[k*processBlockDim+j]), SpRead(blocksB[i*processBlockDim+k]),
                         SpCpu([inBlockDim](SpBlas::Block& blockC, const SpBlas::Block& blockA, const SpBlas::Block& blockB){
                             SpBlas::gemm( SpBlas::Transa::NORMAL, SpBlas::Transa::NORMAL,
@@ -155,6 +155,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks
                                     inBlockDim, inBlockDim, inBlockDim, &alphaBeta, (const double*)paramA.getRawPtr(), inBlockDim,
                                     (const double*)paramB.getRawPtr(), inBlockDim,
                                     &alphaBeta, (double*)paramC.getRawPtr(), inBlockDim ) );
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
                         })
                 #endif
                 #ifdef SPECX_COMPILE_WITH_HIP
@@ -166,6 +167,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks
                                     inBlockDim, inBlockDim, inBlockDim, &alphaBeta, (const double*)paramA.getRawPtr(), inBlockDim,
                                     (const double*)paramB.getRawPtr(), inBlockDim,
                                     &alphaBeta, (double*)paramC.getRawPtr(), inBlockDim ) );
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
                         })
                 #endif
                     ).setTaskName(std::string("GEMM -- (")+std::to_string(i)+","+std::to_string(j)+")");
@@ -186,7 +188,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks
                 for(int k = 0 ; k < processBlockDim ; ++k){
                     for(int i = 0 ; i < processBlockDim ; ++i){
                         for(int j = 0 ; j < processBlockDim ; ++j){
-                            tg.task(SpPriority(1), SpCommutativeWrite(blocksC[i*processBlockDim+j]),
+                            tg.task(SpPriority(1), SpWrite(blocksC[i*processBlockDim+j]),
                                     SpRead(blocksA[k*processBlockDim+j]), SpRead(buffersB[i*processBlockDim+k]),
                                 SpCpu([inBlockDim](SpBlas::Block& blockC, const SpBlas::Block& blockA, const SpBlas::Block& blockB){
                                     SpBlas::gemm( SpBlas::Transa::NORMAL, SpBlas::Transa::NORMAL,
@@ -203,6 +205,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks
                                             inBlockDim, inBlockDim, inBlockDim, &alphaBeta, (const double*)paramA.getRawPtr(), inBlockDim,
                                             (const double*)paramB.getRawPtr(), inBlockDim,
                                             &alphaBeta, (double*)paramC.getRawPtr(), inBlockDim ) );
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
                                 })
                         #endif
                         #ifdef SPECX_COMPILE_WITH_HIP
@@ -214,6 +217,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks
                                             inBlockDim, inBlockDim, inBlockDim, &alphaBeta, (const double*)paramA.getRawPtr(), inBlockDim,
                                             (const double*)paramB.getRawPtr(), inBlockDim,
                                             &alphaBeta, (double*)paramC.getRawPtr(), inBlockDim ) );
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
                                 })
                         #endif
                             ).setTaskName(std::string("GEMM -- (")+std::to_string(i)+","+std::to_string(j)+")");
@@ -236,7 +240,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks
                 for(int k = 0 ; k < processBlockDim ; ++k){
                     for(int i = 0 ; i < processBlockDim ; ++i){
                         for(int j = 0 ; j < processBlockDim ; ++j){
-                            tg.task(SpPriority(1), SpCommutativeWrite(blocksC[i*processBlockDim+j]),
+                            tg.task(SpPriority(1), SpWrite(blocksC[i*processBlockDim+j]),
                                     SpRead(buffersA[k*processBlockDim+j]), SpRead(blocksB[i*processBlockDim+k]),
                                 SpCpu([inBlockDim](SpBlas::Block& blockC, const SpBlas::Block& blockA, const SpBlas::Block& blockB){
                                     SpBlas::gemm( SpBlas::Transa::NORMAL, SpBlas::Transa::NORMAL,
@@ -253,6 +257,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks
                                             inBlockDim, inBlockDim, inBlockDim, &alphaBeta, (const double*)paramA.getRawPtr(), inBlockDim,
                                             (const double*)paramB.getRawPtr(), inBlockDim,
                                             &alphaBeta, (double*)paramC.getRawPtr(), inBlockDim ) );
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
                                 })
                         #endif
                         #ifdef SPECX_COMPILE_WITH_HIP
@@ -264,6 +269,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks
                                             inBlockDim, inBlockDim, inBlockDim, &alphaBeta, (const double*)paramA.getRawPtr(), inBlockDim,
                                             (const double*)paramB.getRawPtr(), inBlockDim,
                                             &alphaBeta, (double*)paramC.getRawPtr(), inBlockDim ) );
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
                                 })
                         #endif
                             ).setTaskName(std::string("GEMM -- (")+std::to_string(i)+","+std::to_string(j)+")");
diff --git a/Benchmark/cholesky_gemm/gemm.cpp b/Benchmark/cholesky_gemm/gemm.cpp
index 0098fc30624fa229786f49e48c7b75923c24d974..49f7b34db0b54797240c18637808922e690385ab 100644
--- a/Benchmark/cholesky_gemm/gemm.cpp
+++ b/Benchmark/cholesky_gemm/gemm.cpp
@@ -46,8 +46,9 @@ thread_local hipblasHandle_t handle;
 template <int MaxNbDevices, const bool FavorLocality>
 auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocksA[], const SpBlas::Block blocksB[],
                            const int inMatrixDim, const int inBlockDim,
-                           const int nbGpu, const bool useMultiPrioScheduler){
+                           const int nbCpus, const int nbGpu, const bool useMultiPrioScheduler){
     const int nbBlocks = (inMatrixDim+inBlockDim-1)/inBlockDim;
+    const bool exportTrace = false;
 
 #if defined(SPECX_COMPILE_WITH_CUDA) || defined(SPECX_COMPILE_WITH_HIP)
     std::unique_ptr<SpAbstractScheduler> scheduler;
@@ -55,11 +56,11 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks
         scheduler = std::unique_ptr<SpAbstractScheduler>(new SpHeterogeneousPrioScheduler());
     }
     else{
-        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>());
+        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>(nbGpu*SpCudaUtils::GetDefaultNbStreams()));
     }
-    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(SpUtils::DefaultNumThreads(), nbGpu), std::move(scheduler));
+    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(nbCpus, nbGpu), std::move(scheduler));
 #else
-    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers());
+    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers(nbCpus));
 #endif
 
 #ifdef SPECX_COMPILE_WITH_CUDA
@@ -99,7 +100,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks
         for(int i = 0 ; i < nbBlocks ; ++i){
             for(int j = 0 ; j < nbBlocks ; ++j){
                 for(int k = 0 ; k < nbBlocks ; ++k){
-                    tg.task(SpPriority(1), SpCommutativeWrite(blocksC[i*nbBlocks+j]),
+                    tg.task(SpPriority(1), SpWrite(blocksC[i*nbBlocks+j]),
                             SpRead(blocksA[k*nbBlocks+j]), SpRead(blocksB[i*nbBlocks+k]),
                         SpCpu([inBlockDim](SpBlas::Block& blockC, const SpBlas::Block& blockA, const SpBlas::Block& blockB){
                             SpBlas::gemm( SpBlas::Transa::NORMAL, SpBlas::Transa::NORMAL,
@@ -116,6 +117,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks
                                     inBlockDim, inBlockDim, inBlockDim, &alphaBeta, (const double*)paramA.getRawPtr(), inBlockDim,
                                     (const double*)paramB.getRawPtr(), inBlockDim,
                                     &alphaBeta, (double*)paramC.getRawPtr(), inBlockDim ) );
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
                         })
                 #endif
                 #ifdef SPECX_COMPILE_WITH_HIP
@@ -127,6 +129,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks
                                         inBlockDim, inBlockDim, inBlockDim, &alphaBeta, (const double*)paramA.getRawPtr(), inBlockDim,
                                         (const double*)paramB.getRawPtr(), inBlockDim,
                                         &alphaBeta, (double*)paramC.getRawPtr(), inBlockDim ) );
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
                             })
                 #endif
                     ).setTaskName(std::string("GEMM -- (")+std::to_string(i)+","+std::to_string(j)+")");
@@ -138,6 +141,11 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks
         tg.waitAllTasks();
         timer.stop();
 
+        if(exportTrace){
+            static int cptTrace = 0;
+            tg.generateTrace("./trace" + std::to_string(cptTrace++) + "-" + std::to_string(idxLoop) + ".svg");
+        }
+
 #if defined(SPECX_COMPILE_WITH_CUDA) || defined(SPECX_COMPILE_WITH_HIP)
         for(int i = 0 ; i < nbBlocks ; ++i){
             for(int j = 0 ; j < nbBlocks ; ++j){
@@ -260,7 +268,10 @@ int main(int argc, char** argv){
     for(auto useMultiprioAndPairs: schedPairConf){
         for(int BlockSize = MinBlockSize ; BlockSize <= MaxBlockSize ; BlockSize *= 2){
             for(int MatrixSize = MinMatrixSize ; MatrixSize <= MaxMatrixSize ; MatrixSize *= 2){
-                for(int idxGpu = 0 ; idxGpu <= nbGpus ; ++idxGpu){
+                for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
+                    const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
+                    const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+
                     const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                     const bool useLocality = std::get<1>(useMultiprioAndPairs);
 
@@ -293,9 +304,9 @@ int main(int argc, char** argv){
                     auto blocksC = SpBlas::matrixToBlock(matrixC.get(), MatrixSize, BlockSize);
                     const auto minMaxAvg = (useLocality ? 
                                             gemm<8,true>(NbLoops, blocksC.get(), blocksA.get(), blocksB.get(), 
-                                                MatrixSize, BlockSize, idxGpu, useMultiprio):
+                                                MatrixSize, BlockSize, nbCpus, nbGpus, useMultiprio):
                                             gemm<8,false>(NbLoops, blocksC.get(), blocksA.get(), blocksB.get(), 
-                                                MatrixSize, BlockSize, idxGpu, useMultiprio));
+                                                MatrixSize, BlockSize, nbCpus, nbGpus, useMultiprio));
                     allDurations.push_back(minMaxAvg);
                     std::cout << "     - Duration = " << minMaxAvg[0] << " " << minMaxAvg[1] << " " << minMaxAvg[2] << std::endl;
                     std::cout << "     - Transfers = " << minMaxAvg[3] << " " << minMaxAvg[4] << " " << minMaxAvg[5] << " " << minMaxAvg[6] << " " << minMaxAvg[7] << std::endl;
@@ -325,16 +336,19 @@ int main(int argc, char** argv){
         return 1;
     }
 
-    file << "NbGpu,MatrixSize,BlockSize,Multiprio,PrioPair,FavorLocality,MinDuration,MaxDuration,AvgDuration,TotalTransfer,MaxTransfer,DeviceToHostTransfer,HostToDeviceTransfer,DeviceToDeviceTransfer" << std::endl;
+    file << "NbCpu,NbGpu,MatrixSize,BlockSize,Multiprio,PrioPair,FavorLocality,MinDuration,MaxDuration,AvgDuration,TotalTransfer,MaxTransfer,DeviceToHostTransfer,HostToDeviceTransfer,DeviceToDeviceTransfer" << std::endl;
     int idxDuration = 0;
     for(auto useMultiprioAndPairs: schedPairConf){
         for(int BlockSize = MinBlockSize ; BlockSize <= MaxBlockSize ; BlockSize *= 2){
             for(int MatrixSize = MinMatrixSize ; MatrixSize <= MaxMatrixSize ; MatrixSize *= 2){
-                for(int idxGpu = 0 ; idxGpu <= nbGpus ; ++idxGpu){
+                for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
+                    const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
+                    const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+
                     const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                     const bool useLocality = std::get<1>(useMultiprioAndPairs);
 
-                    file << idxGpu << "," << MatrixSize << "," << BlockSize << "," 
+                    file << nbCpus << "," << nbGpus << "," << MatrixSize << "," << BlockSize << "," 
                         << (useMultiprio?"TRUE":"FALSE") << ","
                         << "FALSE" << ","
                         << (useLocality?"TRUE":"FALSE") << ","
diff --git a/Benchmark/particles/particles-simu-mpi.cpp b/Benchmark/particles/particles-simu-mpi.cpp
index 8094384d74066246a418cd365f1ff26d2d1fdcfe..a5afc99ed2cc8223941d6210885550677d5f834f 100644
--- a/Benchmark/particles/particles-simu-mpi.cpp
+++ b/Benchmark/particles/particles-simu-mpi.cpp
@@ -531,12 +531,14 @@ void AccuracyTest(){
             , SpCuda([](SpDeviceDataView<ParticlesGroup> paramA) {
                 [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
                 p2p_inner_gpu<<<10,10,0,SpCudaUtils::GetCurrentStream()>>>(paramA.getRawPtr(), paramA.getRawSize());
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
             })
         #endif
         #ifdef SPECX_COMPILE_WITH_HIP
             , SpHip([](SpDeviceDataView<ParticlesGroup> paramA) {
                 [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
                 hipLaunchKernelGGL(p2p_inner_gpu, dim3(10), dim3(10), 0, SpHipUtils::GetCurrentStream(), paramA.getRawPtr(), paramA.getRawSize());
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
             })
         #endif
     );
@@ -549,12 +551,14 @@ void AccuracyTest(){
             , SpCuda([](SpDeviceDataView<ParticlesGroup> paramA) {
                 [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
                 p2p_inner_gpu<<<10,10,0,SpCudaUtils::GetCurrentStream()>>>(paramA.getRawPtr(), paramA.getRawSize());
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
             })
         #endif
         #ifdef SPECX_COMPILE_WITH_HIP
             , SpHip([](SpDeviceDataView<ParticlesGroup> paramA) {
                 [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
                 hipLaunchKernelGGL(p2p_inner_gpu, dim3(10), dim3(10), 0, SpHipUtils::GetCurrentStream(), paramA.getRawPtr(), paramA.getRawSize());
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
             })
         #endif
     );
@@ -571,6 +575,7 @@ void AccuracyTest(){
                                                                          paramA.getRawPtr(), paramA.getRawSize());
                 p2p_neigh_gpu<<<10,10,0,SpCudaUtils::GetCurrentStream()>>>(paramA.getRawPtr(), paramA.getRawSize(),
                                                                          paramB.getRawPtr(), paramB.getRawSize());
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
             })
         #endif
         #ifdef SPECX_COMPILE_WITH_HIP
@@ -581,6 +586,7 @@ void AccuracyTest(){
                                                                          paramA.getRawPtr(), paramA.getRawSize());
                 hipLaunchKernelGGL(p2p_neigh_gpu, dim3(10), dim3(10), 0, SpHipUtils::GetCurrentStream(), paramA.getRawPtr(), paramA.getRawSize(),
                                                                          paramB.getRawPtr(), paramB.getRawSize());
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
             })
         #endif
     );
@@ -784,7 +790,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
         scheduler = std::unique_ptr<SpAbstractScheduler>(new SpHeterogeneousPrioScheduler());
     }
     else{
-        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>());
+        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>(nbGpu*SpCudaUtils::GetDefaultNbStreams()));
     }
     SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(), std::move(scheduler));
 #else
@@ -819,6 +825,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
                         [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
                         p2p_inner_gpu<<<inKernelConfig.nbBlocksInner,inKernelConfig.nbThreadsInner,0,SpCudaUtils::GetCurrentStream()>>>
                                        (paramA.getRawPtr(), paramA.getRawSize());
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
                     })
                 #endif
                 #ifdef SPECX_COMPILE_WITH_HIP
@@ -826,6 +833,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
                         [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
                         hipLaunchKernelGGL(p2p_inner_gpu, dim3(inKernelConfig.nbBlocksInner), dim3(inKernelConfig.nbThreadsInner), 0, SpHipUtils::GetCurrentStream(),
                                      paramA.getRawPtr(), paramA.getRawSize());
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
                     })
                 #endif
             );
@@ -847,6 +855,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
                                          (paramB.getRawPtr(), paramB.getRawSize(), paramA.getRawPtr(), paramA.getRawSize());
                             p2p_neigh_gpu<<<inKernelConfig.nbBlocksOuter,inKernelConfig.nbThreadsOuter,0,SpCudaUtils::GetCurrentStream()>>>
                                          (paramA.getRawPtr(), paramA.getRawSize(), paramB.getRawPtr(), paramB.getRawSize());
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
                         })
                     #endif
                     #ifdef SPECX_COMPILE_WITH_HIP
@@ -857,6 +866,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
                                          paramB.getRawPtr(), paramB.getRawSize(), paramA.getRawPtr(), paramA.getRawSize());
                             hipLaunchKernelGGL(p2p_neigh_gpu, dim3(inKernelConfig.nbBlocksOuter), dim3(inKernelConfig.nbThreadsOuter), 0, SpHipUtils::GetCurrentStream(),
                                          paramA.getRawPtr(), paramA.getRawSize(), paramB.getRawPtr(), paramB.getRawSize());
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
                         })
                     #endif
                 );
@@ -874,6 +884,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
                     [[maybe_unused]] const std::size_t nbParticlesB = paramB.data().getNbParticles();
                     p2p_neigh_gpu<<<inKernelConfig.nbBlocksOuter,inKernelConfig.nbThreadsOuter,0,SpCudaUtils::GetCurrentStream()>>>
                                  (paramB.getRawPtr(), paramB.getRawSize(), paramA.getRawPtr(), paramA.getRawSize());
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
                 })
             #endif
             #ifdef SPECX_COMPILE_WITH_HIP
@@ -882,6 +893,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
                     [[maybe_unused]] const std::size_t nbParticlesB = paramB.data().getNbParticles();
                     hipLaunchKernelGGL(p2p_neigh_gpu, dim3(inKernelConfig.nbBlocksOuter), dim3(inKernelConfig.nbThreadsOuter), 0, SpHipUtils::GetCurrentStream(),
                                  paramB.getRawPtr(), paramB.getRawSize(), paramA.getRawPtr(), paramA.getRawSize());
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
                 })
             #endif
         );
@@ -897,6 +909,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
                     [[maybe_unused]] const std::size_t nbParticlesB = paramB.data().getNbParticles();
                     p2p_neigh_gpu<<<inKernelConfig.nbBlocksOuter,inKernelConfig.nbThreadsOuter,0,SpCudaUtils::GetCurrentStream()>>>
                                  (paramB.getRawPtr(), paramB.getRawSize(), paramA.getRawPtr(), paramA.getRawSize());
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
                 })
             #endif
             #ifdef SPECX_COMPILE_WITH_HIP
@@ -905,6 +918,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
                     [[maybe_unused]] const std::size_t nbParticlesB = paramB.data().getNbParticles();
                     hipLaunchKernelGGL(p2p_neigh_gpu, dim3(inKernelConfig.nbBlocksOuter), dim3(inKernelConfig.nbThreadsOuter), 0, SpHipUtils::GetCurrentStream(),
                                  paramB.getRawPtr(), paramB.getRawSize(), paramA.getRawPtr(), paramA.getRawSize());
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
                 })
             #endif
         );
diff --git a/Benchmark/particles/particles-simu.cpp b/Benchmark/particles/particles-simu.cpp
index 567490257caf5942b3bc556018fe7a0769d9911d..94adf1d79692458b03f25d82b917d5422da23b6d 100644
--- a/Benchmark/particles/particles-simu.cpp
+++ b/Benchmark/particles/particles-simu.cpp
@@ -465,6 +465,7 @@ void AccuracyTest(){
             , SpCuda([](SpDeviceDataView<ParticlesGroup> paramA) {
                 [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
                 p2p_inner_gpu<<<10,10,0,SpCudaUtils::GetCurrentStream()>>>(paramA.getRawPtr(), paramA.getRawSize());
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
             })
         #endif
         #ifdef SPECX_COMPILE_WITH_HIP
@@ -472,6 +473,7 @@ void AccuracyTest(){
                 [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
                 hipLaunchKernelGGL(p2p_inner_gpu, 10, 10, 0, SpHipUtils::GetCurrentStream(), 
                         paramA.getRawPtr(), paramA.getRawSize());
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
             })
         #endif
     );
@@ -484,6 +486,7 @@ void AccuracyTest(){
             , SpCuda([](SpDeviceDataView<ParticlesGroup> paramA) {
                 [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
                 p2p_inner_gpu<<<10,10,0,SpCudaUtils::GetCurrentStream()>>>(paramA.getRawPtr(), paramA.getRawSize());
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
             })
         #endif
         #ifdef SPECX_COMPILE_WITH_HIP
@@ -491,6 +494,7 @@ void AccuracyTest(){
                 [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
                 hipLaunchKernelGGL(p2p_inner_gpu, 10, 10, 0, SpHipUtils::GetCurrentStream(),
                         paramA.getRawPtr(), paramA.getRawSize());
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
             })
         #endif
     );
@@ -507,6 +511,7 @@ void AccuracyTest(){
                                                                          paramA.getRawPtr(), paramA.getRawSize());
                 p2p_neigh_gpu<<<10,10,0,SpCudaUtils::GetCurrentStream()>>>(paramA.getRawPtr(), paramA.getRawSize(),
                                                                          paramB.getRawPtr(), paramB.getRawSize());
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
             })
         #endif
         #ifdef SPECX_COMPILE_WITH_HIP
@@ -519,6 +524,7 @@ void AccuracyTest(){
                 hipLaunchKernelGGL(p2p_neigh_gpu, 10, 10, 0, SpHipUtils::GetCurrentStream(),
                                         paramA.getRawPtr(), paramA.getRawSize(),
                                          paramB.getRawPtr(), paramB.getRawSize());
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
             })
         #endif
     );
@@ -698,7 +704,7 @@ auto GetPriority(const bool usePrioPairs, const int maxInteractions, const int m
 
 template <int MaxNbDevices, const bool FavorLocality>
 auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPartsPerGroup,
-                const int NbGroups, const int nbGpu, const bool useMultiPrioScheduler,
+                const int NbGroups, const int nbCpu, const int nbGpu, const bool useMultiPrioScheduler,
                 const bool usePrioPairs, const TuneResult& inKernelConfig,
                 const int maxInteractions, const int minInteractions){
 
@@ -719,11 +725,11 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
         scheduler = std::unique_ptr<SpAbstractScheduler>(new SpHeterogeneousPrioScheduler());
     }
     else{
-        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>());
+        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>(nbGpu*SpCudaUtils::GetDefaultNbStreams()));
     }
-    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(SpUtils::DefaultNumThreads(), nbGpu), std::move(scheduler));
+    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(nbCpu, nbGpu), std::move(scheduler));
 #else
-    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers());
+    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers(nbCpu));
 #endif
 
     std::vector<double> minMaxAvg(3);
@@ -749,6 +755,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
                         [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
                         p2p_inner_gpu<<<inKernelConfig.nbBlocksInner,inKernelConfig.nbThreadsInner,0,SpCudaUtils::GetCurrentStream()>>>
                                        (paramA.getRawPtr(), paramA.getRawSize());
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
                     })
                 #endif
                 #ifdef SPECX_COMPILE_WITH_HIP
@@ -756,6 +763,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
                         [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
                         hipLaunchKernelGGL(p2p_inner_gpu, inKernelConfig.nbBlocksInner, inKernelConfig.nbThreadsInner, 0, SpHipUtils::GetCurrentStream(),
                                            paramA.getRawPtr(), paramA.getRawSize());
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
                     })
                 #endif
             );
@@ -780,6 +788,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
                                      (paramB.getRawPtr(), paramB.getRawSize(), paramA.getRawPtr(), paramA.getRawSize());
                         p2p_neigh_gpu<<<inKernelConfig.nbBlocksOuter,inKernelConfig.nbThreadsOuter,0,SpCudaUtils::GetCurrentStream()>>>
                                      (paramA.getRawPtr(), paramA.getRawSize(), paramB.getRawPtr(), paramB.getRawSize());
+                            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
                     })
                 #endif
                 #ifdef SPECX_COMPILE_WITH_HIP
@@ -790,6 +799,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
                                             paramB.getRawPtr(), paramB.getRawSize(), paramA.getRawPtr(), paramA.getRawSize());
                         hipLaunchKernelGGL(p2p_neigh_gpu, inKernelConfig.nbBlocksOuter, inKernelConfig.nbThreadsOuter, 0, SpHipUtils::GetCurrentStream(),
                                             paramA.getRawPtr(), paramA.getRawSize(), paramB.getRawPtr(), paramB.getRawSize());
+                                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
                     })
                 #endif
             );
@@ -907,8 +917,10 @@ void BenchmarkTest(int argc, char** argv, const TuneResult& inKernelConfig){
                                                                  std::make_tuple(true, false, true)};
 
     for(auto useMultiprioAndPairs: schedPairConf){
-        for(int idxGpu = 0 ; idxGpu <= nbGpus ; ++idxGpu){
+        for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
             for(int idxBlock = MinNbGroups ; idxBlock <= MaxNbGroups ; idxBlock *= 2){
+                const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
+                const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
                 const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                 const bool usePrioPairs = std::get<1>(useMultiprioAndPairs);
                 const bool useLocality = std::get<2>(useMultiprioAndPairs);
@@ -919,10 +931,10 @@ void BenchmarkTest(int argc, char** argv, const TuneResult& inKernelConfig){
                               << " Favor Loc = " << useLocality << std::endl;              
                 const auto minMaxAvg = (useLocality ? 
                                         BenchCore<8,true>(NbLoops, MinPartsPerGroup,
-                                            MaxPartsPerGroup, idxBlock, idxGpu, useMultiprio, usePrioPairs, inKernelConfig,
+                                            MaxPartsPerGroup, idxBlock, nbCpus, nbGpus, useMultiprio, usePrioPairs, inKernelConfig,
 					    maxInteractions, minInteractions):
                                         BenchCore<8,false>(NbLoops, MinPartsPerGroup,
-                                            MaxPartsPerGroup, idxBlock, idxGpu, useMultiprio, usePrioPairs, inKernelConfig,
+                                            MaxPartsPerGroup, idxBlock, nbCpus, nbGpus, useMultiprio, usePrioPairs, inKernelConfig,
 					    maxInteractions, minInteractions));
                 allDurations.push_back(minMaxAvg);
                 std::cout << " - Min = " << minMaxAvg[0] << " Max = " << minMaxAvg[1] << " Avg = " << minMaxAvg[2] << std::endl;  
@@ -937,16 +949,18 @@ void BenchmarkTest(int argc, char** argv, const TuneResult& inKernelConfig){
         return;
     }
 
-    file << "NbGpu,BlockSize,Multiprio,Multiprio,PrioPair,MinDuration,MaxDuration,AvgDuration,TotalTransfer,MaxTransfer,DeviceToHostTransfer,HostToDeviceTransfer,DeviceToDeviceTransfer" << std::endl;
+    file << "NbCpu,NbGpu,BlockSize,Multiprio,PrioPair,FavorLocality,MinDuration,MaxDuration,AvgDuration,TotalTransfer,MaxTransfer,DeviceToHostTransfer,HostToDeviceTransfer,DeviceToDeviceTransfer" << std::endl;
     int idxDuration = 0;
     for(auto useMultiprioAndPairs: schedPairConf){
-        for(int idxGpu = 0 ; idxGpu <= nbGpus ; ++idxGpu){
+        for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
             for(int idxBlock = MinNbGroups ; idxBlock <= MaxNbGroups ; idxBlock *= 2){
+                const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
+                const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
                 const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                 const bool usePrioPairs = std::get<1>(useMultiprioAndPairs);
                 const bool useLocality = std::get<2>(useMultiprioAndPairs);
 
-                file << idxGpu << "," << idxBlock << "," 
+                file << nbCpus << "," << nbGpus << "," << idxBlock << "," 
                     << (useMultiprio?"TRUE":"FALSE") << ","
                         << (usePrioPairs?"TRUE":"FALSE") << ","
                         << (useLocality?"TRUE":"FALSE") << ","
diff --git a/Src/Cuda/SpCudaMemManager.cpp b/Src/Cuda/SpCudaMemManager.cpp
index a9a72be3147a018a7c3c786e4894aa2841a78bbb..0264a950a1498e52315c1531fab7760dd4965997 100644
--- a/Src/Cuda/SpCudaMemManager.cpp
+++ b/Src/Cuda/SpCudaMemManager.cpp
@@ -1,4 +1,3 @@
 #include "SpCudaMemManager.hpp"
 
 std::vector<SpCudaManager::SpCudaMemManager> SpCudaManager::Managers = SpCudaManager::BuildManagers();
-std::mutex SpCudaManager::CudaMutex;
diff --git a/Src/Cuda/SpCudaMemManager.hpp b/Src/Cuda/SpCudaMemManager.hpp
index 22e7153a1e516a300faee7575bde4d8354605f6a..6820ed700965228615a8d1acff609e623693ad5d 100644
--- a/Src/Cuda/SpCudaMemManager.hpp
+++ b/Src/Cuda/SpCudaMemManager.hpp
@@ -33,18 +33,8 @@ class SpCudaManager {
         int useCount = 0;
     };
 
-    static std::mutex CudaMutex;
-
 public:
 
-    static void Lock(){// Do finer lock TODO
-        CudaMutex.lock();
-    }
-
-    static void Unlock(){
-        CudaMutex.unlock();
-    }
-
     class SpCudaMemManager : public SpAbstractDeviceMemManager {
         const int id;
         std::unordered_map<void*, HandleDescr> handles;
@@ -62,11 +52,22 @@ public:
         size_t hostToDeviceTransfers;
         size_t deviceToDeviceTransfers;
 
+        std::unique_ptr<std::mutex> cudaMutex;
+
     public:
+        void lock(){// Do finer lock TODO
+            cudaMutex->lock();
+        }
+
+        void unlock(){
+            cudaMutex->unlock();
+        }
+
         explicit SpCudaMemManager(const int inId)
             : id(inId), deferCopier(new SpConsumerThread), remainingMemory(0),
                 totalAllocatedMemory(0), currentAllocatedMemory(0), maxAllocatedMemory(0),
-                deviceToHostTransfers(0), hostToDeviceTransfers(0), deviceToDeviceTransfers(0){
+                deviceToHostTransfers(0), hostToDeviceTransfers(0), deviceToDeviceTransfers(0),
+                cudaMutex(new std::mutex){
 
             deferCopier->submitJobAndWait([this]{
                 SpCudaUtils::UseDevice(id);
@@ -106,7 +107,7 @@ public:
         SpCudaMemManager(SpCudaMemManager&&) = default;
 
         SpCudaMemManager& operator=(const SpCudaMemManager&) = delete;
-        SpCudaMemManager& operator=(SpCudaMemManager&&) = delete;
+        SpCudaMemManager& operator=(SpCudaMemManager&&) = default;
 
         void incrDeviceDataUseCount(void* key) override {
             assert(handles.find(key) != handles.end());
@@ -123,7 +124,8 @@ public:
             handles[key].useCount -= 1;
         }
 
-        bool hasBeenRemoved(void* key){
+        bool hasBeenRemovedLocked(void* key){
+            const std::lock_guard<std::mutex> lock(*cudaMutex);
             return (handles.find(key) == handles.end());
         }
 
@@ -164,10 +166,14 @@ public:
 
             if(SpCudaUtils::CurrentWorkerIsCuda() && SpCudaUtils::CurrentCudaId() == id){
                 CUDA_ASSERT(cudaMallocAsync(&data.ptr, inByteSize, SpCudaUtils::GetCurrentStream()));
+                // Should not be needed, but remove it and it will fail
+                // CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
             }
             else{
                 deferCopier->submitJobAndWait([&,this]{
                     CUDA_ASSERT(cudaMallocAsync(&data.ptr, inByteSize, extraStream));
+                    // Should not be needed, but remove it and it will fail
+                    // CUDA_ASSERT(cudaStreamSynchronize(extraStream));
                 });
             }
 
@@ -206,11 +212,11 @@ public:
             handles.erase(key);
 
             if(SpCudaUtils::CurrentWorkerIsCuda() && SpCudaUtils::CurrentCudaId() == id){
-                CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
+                // CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
             }
             else{
                 deferCopier->submitJobAndWait([&,this]{
-                    CUDA_ASSERT(cudaStreamSynchronize(extraStream));
+                    // CUDA_ASSERT(cudaStreamSynchronize(extraStream));
                 });
             }
             
@@ -222,10 +228,14 @@ public:
                     && allBlocks[inPtrDev].size <= inByteSize);
             if(SpCudaUtils::CurrentWorkerIsCuda() && SpCudaUtils::CurrentCudaId() == id){
                 CUDA_ASSERT(cudaMemsetAsync(inPtrDev, val, inByteSize, SpCudaUtils::GetCurrentStream()));
+                // Should not be needed, but remove it and it will fail
+                // CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
             }
             else{
                 deferCopier->submitJobAndWait([&,this]{
                     CUDA_ASSERT(cudaMemsetAsync(inPtrDev, val, inByteSize, extraStream));
+                    // Should not be needed, but remove it and it will fail
+                    // CUDA_ASSERT(cudaStreamSynchronize(extraStream));
                 });
             }
         }
@@ -237,13 +247,18 @@ public:
             if(SpCudaUtils::CurrentWorkerIsCuda() && SpCudaUtils::CurrentCudaId() == id){
                 CUDA_ASSERT(cudaMemcpyAsync(inPtrDev, inPtrHost, inByteSize, cudaMemcpyHostToDevice,
                                         SpCudaUtils::GetCurrentStream()));
+                // Should not be needed, but remove it and it will fail
+                // CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
             }
             else{
                 deferCopier->submitJobAndWait([&,this]{
                     CUDA_ASSERT(cudaMemcpyAsync(inPtrDev, inPtrHost, inByteSize, cudaMemcpyHostToDevice,
                                             extraStream));
+                    // Should not be needed, but remove it and it will fail
+                    // CUDA_ASSERT(cudaStreamSynchronize(extraStream));
                 });
             }
+
             hostToDeviceTransfers += inByteSize;
         }
 
@@ -254,14 +269,17 @@ public:
             if(SpCudaUtils::CurrentWorkerIsCuda() && SpCudaUtils::CurrentCudaId() == id){
                 CUDA_ASSERT(cudaMemcpyAsync(inPtrHost, inPtrDev, inByteSize, cudaMemcpyDeviceToHost,
                                         SpCudaUtils::GetCurrentStream()));
+                // Should not be needed, but remove it and it will fail
+                // CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
             }
             else{
                 deferCopier->submitJobAndWait([&,this]{
                     CUDA_ASSERT(cudaMemcpyAsync(inPtrHost, inPtrDev, inByteSize, cudaMemcpyDeviceToHost,
                                             extraStream));
-                    CUDA_ASSERT(cudaStreamSynchronize(extraStream));
+                    // CUDA_ASSERT(cudaStreamSynchronize(extraStream));
                 });
             }
+
             deviceToHostTransfers += inByteSize;
         }
 
@@ -276,14 +294,17 @@ public:
             if(SpCudaUtils::CurrentWorkerIsCuda() && SpCudaUtils::CurrentCudaId() == id){
                 CUDA_ASSERT(cudaMemcpyPeerAsync(inPtrDevDest, id, inPtrDevSrc, srcId, inByteSize,
                                             SpCudaUtils::GetCurrentStream()));
+                // Should not be needed, but remove it and it will fail
+                // CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
             }
             else{
                 deferCopier->submitJobAndWait([&,this]{
                     CUDA_ASSERT(cudaMemcpyPeerAsync(inPtrDevDest, id, inPtrDevSrc, srcId, inByteSize,
                                                 extraStream));
-                    CUDA_ASSERT(cudaStreamSynchronize(extraStream));
+                    // CUDA_ASSERT(cudaStreamSynchronize(extraStream));
                 });
             }
+
             deviceToDeviceTransfers += inByteSize;
         }
 
diff --git a/Src/Data/SpDataHandle.hpp b/Src/Data/SpDataHandle.hpp
index 218c8aa10641decd6165e48075c9c595eef13a5d..0ec681a557021e52e4cf6af071f8f31ceddb1070 100644
--- a/Src/Data/SpDataHandle.hpp
+++ b/Src/Data/SpDataHandle.hpp
@@ -109,7 +109,9 @@ public:
         assert(cpuDataOk == true);
         for(int idxGpu = 0 ; idxGpu < int(copies.size()) ; ++idxGpu){
             if(copies[idxGpu].ptr){
+                memManagers[idxGpu].lock();
                 deviceDataOp->freeGroup(memManagers[idxGpu], this, ptrToData);
+                memManagers[idxGpu].unlock();
                 copies[idxGpu] = SpDeviceData();
             }
         }
@@ -122,7 +124,9 @@ public:
         cpuDataOk = false;
         for(int idxGpu = 0 ; idxGpu < int(copies.size()) ; ++idxGpu){
             if(idxGpu != gpuId && copies[idxGpu].ptr){
+                memManagers[idxGpu].lock();
                 deviceDataOp->freeGroup(memManagers[idxGpu], this, ptrToData);
+                memManagers[idxGpu].unlock();
                 copies[idxGpu] = SpDeviceData();
             }
         }
@@ -148,7 +152,9 @@ public:
         assert(gpuId < memManagers.size() && gpuId < int(copies.size()));
         assert(copies[gpuId].ptr);
         syncCpuDataIfNeeded(memManagers);
+        //memManagers[gpuId].lock();
         deviceDataOp->freeGroup(memManagers[gpuId], this, ptrToData);
+        //memManagers[gpuId].unlock();
         copies[gpuId] = SpDeviceData();
     }
 
@@ -160,11 +166,12 @@ public:
 #elif defined(SPECX_COMPILE_WITH_HIP)
         assert(gpuId < SpConfig::SpMaxNbHips);
 #endif
-        if(copies[gpuId].ptr == nullptr || memManagers[gpuId].hasBeenRemoved(this)){
+        if(copies[gpuId].ptr == nullptr || memManagers[gpuId].hasBeenRemovedLocked(this)){
             copies[gpuId].ptr = nullptr;
             auto idxGpuSrcIter = std::find_if(copies.begin(), copies.end(), [](auto iter) -> bool {
                 return iter.ptr != nullptr;
             });
+            memManagers[gpuId].lock();
             if(!deviceDataOp->hasEnoughSpace(memManagers[gpuId], this, ptrToData)){
                 auto candidates = deviceDataOp->candidatesToBeRemoved(memManagers[gpuId], this, ptrToData);
                 for(auto toRemove : candidates){
@@ -175,6 +182,8 @@ public:
                 }
             }
             copies[gpuId] = deviceDataOp->allocate(memManagers[gpuId], this, ptrToData);
+            memManagers[gpuId].unlock();
+
             if(idxGpuSrcIter != copies.end()){
                 const int otherGpu = int(std::distance(copies.begin(), idxGpuSrcIter));
                 if(memManagers[gpuId].isConnectedTo(otherGpu)){
diff --git a/Src/Scheduler/SpMultiPrioScheduler.hpp b/Src/Scheduler/SpMultiPrioScheduler.hpp
index 066e702b8fb1a634a9252768d4d0484108dc3e6a..31bd59ae141883da021abae03b2229d374f198f6 100644
--- a/Src/Scheduler/SpMultiPrioScheduler.hpp
+++ b/Src/Scheduler/SpMultiPrioScheduler.hpp
@@ -42,6 +42,8 @@ class SpMultiPrioScheduler : public SpAbstractScheduler{
     std::priority_queue<TaskWrapper, small_vector<TaskWrapper>, ComparePrio > taskQueuesCPU;
 
     std::atomic<int> nbTasks[int(SpWorkerTypes::Type::NB_WORKER_TYPES)];
+    std::atomic<int> nbCommonTasks;
+    int nbGpuWorkers;
 
     void decrTaskCounter(SpAbstractTask* task){
         const bool hasCpuCallable = task->hasCallableOfType(SpCallableType::CPU);
@@ -60,13 +62,25 @@ class SpMultiPrioScheduler : public SpAbstractScheduler{
             nbTasks[int(SpWorkerTypes::Type::HIP_WORKER)] -= 1;
 #endif
         }
+        if(hasCpuCallable && hasGpuCallable){
+            nbCommonTasks -= 1;
+        }
+    }
+
+    bool CpuCanTakeTask() const {
+#if defined(SPECX_COMPILE_WITH_CUDA) || defined(SPECX_COMPILE_WITH_HIP)
+        return nbCommonTasks > 80 * nbGpuWorkers
+        || nbTasks[int(SpWorkerTypes::Type::CPU_WORKER)] != nbCommonTasks;
+#endif
+        return true;
     }
 
 public:
-    explicit SpMultiPrioScheduler() {
+    explicit SpMultiPrioScheduler(const int inNbGpuWorkers) : nbGpuWorkers(inNbGpuWorkers) {
         for(int idx = 0 ; idx < int(SpWorkerTypes::Type::NB_WORKER_TYPES) ; ++idx){
             nbTasks[idx] = 0;
         }
+        nbCommonTasks = 0;
     }
 
     // No copy or move
@@ -106,6 +120,9 @@ public:
             nbTasks[int(SpWorkerTypes::Type::HIP_WORKER)] += 1;
 #endif
         }
+        if(hasCpuCallable && hasGpuCallable){
+            nbCommonTasks += 1;
+        }
 
         std::shared_ptr<std::atomic<bool>> taken = std::make_shared<std::atomic<bool>>(false);
 
@@ -145,17 +162,19 @@ public:
         assert(deviceId == -1 || wt != SpWorkerTypes::Type::CPU_WORKER);
 
         if(deviceId == -1){
-            mutexReadyTasksCPU.lock();
-            while(taskQueuesCPU.size()){
-                TaskWrapper taskWrapper = taskQueuesCPU.top();
-                taskQueuesCPU.pop();
-                if(taskWrapper.taken->exchange(true) == false){
-                    mutexReadyTasksCPU.unlock();
-                    decrTaskCounter(taskWrapper.task);
-                    return taskWrapper.task;
+            if(CpuCanTakeTask()){
+                mutexReadyTasksCPU.lock();
+                while(taskQueuesCPU.size()){
+                    TaskWrapper taskWrapper = taskQueuesCPU.top();
+                    taskQueuesCPU.pop();
+                    if(taskWrapper.taken->exchange(true) == false){
+                        mutexReadyTasksCPU.unlock();
+                        decrTaskCounter(taskWrapper.task);
+                        return taskWrapper.task;
+                    }
                 }
+                mutexReadyTasksCPU.unlock();
             }
-            mutexReadyTasksCPU.unlock();
         }
         else{
             mutexReadyTasksGPU[deviceId].lock();
diff --git a/Src/Task/SpTask.hpp b/Src/Task/SpTask.hpp
index d82f1002482d020e17b6edcae228f591267639ad..30e54ad3baa83f6195cf1f57b0c7ab4c00eb56c3 100644
--- a/Src/Task/SpTask.hpp
+++ b/Src/Task/SpTask.hpp
@@ -101,7 +101,6 @@ class SpTask : public SpAbstractTaskWithReturn<RetType> {
     
     void preTaskExecution([[maybe_unused]] SpCallableType ct) final {
 #ifdef SPECX_COMPILE_WITH_CUDA
-        SpCudaManager::Lock();
        std::size_t extraHandlesOffset = 0;
         
         SpUtils::foreach_in_tuple(
@@ -143,7 +142,9 @@ class SpTask : public SpAbstractTaskWithReturn<RetType> {
                     else{
                         SpCudaUtils::SyncCurrentStream();
                     }
+                    SpCudaManager::Managers[cudaId].lock();
                     SpCudaManager::Managers[cudaId].incrDeviceDataUseCount(h);
+                    SpCudaManager::Managers[cudaId].unlock();
                     std::get<index>(cudaCallableArgs).reset(dataObj.ptr, dataObj.size);
                     if constexpr(SpDeviceDataUtils::class_has_setDataDescriptor<decltype(std::get<index>(cudaCallableArgs))>::value){
                         std::get<index>(cudaCallableArgs).setDataDescriptor(dataObj.viewPtr);
@@ -160,9 +161,7 @@ class SpTask : public SpAbstractTaskWithReturn<RetType> {
             }
         }, this->getDataDependencyTupleRef());
 
-        SpCudaManager::Unlock();
 #elif defined(SPECX_COMPILE_WITH_HIP) // SPECX_COMPILE_WITH_CUDA
-        SpHipManager::Lock();
         std::size_t extraHandlesOffset = 0;
 
         SpUtils::foreach_in_tuple(
@@ -221,7 +220,6 @@ class SpTask : public SpAbstractTaskWithReturn<RetType> {
                 }
             }, this->getDataDependencyTupleRef());
 
-        SpHipManager::Unlock();
 #endif // SPECX_COMPILE_WITH_HIP
     }
 
@@ -266,13 +264,11 @@ class SpTask : public SpAbstractTaskWithReturn<RetType> {
             // Syn only if we the task was on GPU
             SpCudaUtils::SyncCurrentStream();
         }
-        SpCudaManager::Lock();
 #elif defined(SPECX_COMPILE_WITH_HIP)
         if(ct == SpCallableType::HIP){
             // Syn only if we the task was on GPU
             SpHipUtils::SyncCurrentStream();
         }
-        SpHipManager::Lock();
 #endif // SPECX_COMPILE_WITH_HIP
 
         std::size_t extraHandlesOffset = 0;
@@ -298,16 +294,16 @@ class SpTask : public SpAbstractTaskWithReturn<RetType> {
 #if defined(SPECX_COMPILE_WITH_CUDA)
                 else if(ct == SpCallableType::CUDA){
                     const int cudaId = SpCudaUtils::CurrentCudaId();
-                    h->lock();
+                    SpCudaManager::Managers[cudaId].lock();
                     SpCudaManager::Managers[cudaId].decrDeviceDataUseCount(h);
-                    h->unlock();
+                    SpCudaManager::Managers[cudaId].unlock();
                 }
 #elif defined(SPECX_COMPILE_WITH_HIP)
                 else if(ct == SpCallableType::HIP){
                     const int hipId = SpHipUtils::CurrentHipId();
-                    h->lock();
+                    SpHipManager::Managers[hipId].lock();
                     SpHipManager::Managers[hipId].decrDeviceDataUseCount(h);
-                    h->unlock();
+                    SpHipManager::Managers[hipId].unlock();
                 }
 #endif
                 else{
@@ -315,12 +311,6 @@ class SpTask : public SpAbstractTaskWithReturn<RetType> {
                 }
             }
         }, this->getDataDependencyTupleRef());
-
-#if defined(SPECX_COMPILE_WITH_CUDA)
-        SpCudaManager::Unlock();
-#elif defined(SPECX_COMPILE_WITH_HIP)
-        SpHipManager::Unlock();
-#endif
 #endif // SPECX_COMPILE_WITH_CUDA SPECX_COMPILE_WITH_HIP
     }