diff --git a/Benchmark/axpy/axpy.cpp b/Benchmark/axpy/axpy.cpp index 448bd33c1cc5116a96f86f6097efe713bc08ee2e..58f1db0e2f2258d1a9adf09d51d12d847a24de43 100644 --- a/Benchmark/axpy/axpy.cpp +++ b/Benchmark/axpy/axpy.cpp @@ -253,8 +253,8 @@ int main(int argc, char** argv){ for(auto useMultiprioAndPairs: schedPairConf){ for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){ - const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads()); const int nbGpus = (idxGpu == -1 ? 0 : idxGpu); + const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus); const bool useMultiprio = std::get<0>(useMultiprioAndPairs); const bool usePrioPairs = std::get<1>(useMultiprioAndPairs); const bool useLocality = std::get<2>(useMultiprioAndPairs); @@ -291,8 +291,8 @@ int main(int argc, char** argv){ for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){ for(int idxNbBlocks = minnbblocks ; idxNbBlocks <= maxnbblocks ; idxNbBlocks *= 2){ for(int idxSize = minblocksize ; idxSize <= maxblocksize ; idxSize *= 2){ - const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads()); const int nbGpus = (idxGpu == -1 ? 0 : idxGpu); + const int nbCpus = std::max(1 , (idxGpu == -1 ? 
1 : SpUtils::DefaultNumThreads())-nbGpus); file << nbCpus << "," << nbGpus << "," << idxNbBlocks << "," << idxSize << "," << (useMultiprio?"TRUE":"FALSE") << "," diff --git a/Benchmark/cholesky_gemm/cholesky-mpi.cpp b/Benchmark/cholesky_gemm/cholesky-mpi.cpp index 203986780f29affb472033ea5017d621dd41b77c..2236bfa43ab3cd11b18ec70cfb2bace903af3603 100644 --- a/Benchmark/cholesky_gemm/cholesky-mpi.cpp +++ b/Benchmark/cholesky_gemm/cholesky-mpi.cpp @@ -71,7 +71,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const else{ scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>(nbGpu*SpCudaUtils::GetDefaultNbStreams())); } - SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(SpUtils::DefaultNumThreads(), nbGpu), std::move(scheduler)); + SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(std::max(1 , SpUtils::DefaultNumThreads()-nbGpu), nbGpu), std::move(scheduler)); #else SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers()); #endif diff --git a/Benchmark/cholesky_gemm/cholesky.cpp b/Benchmark/cholesky_gemm/cholesky.cpp index 6e85cb174a53327b43766a09401c95f479b6ca43..e514b44ae641cfe38cb81ef9c4505a4acbefe306 100644 --- a/Benchmark/cholesky_gemm/cholesky.cpp +++ b/Benchmark/cholesky_gemm/cholesky.cpp @@ -437,8 +437,8 @@ int main(int argc, char** argv){ for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){ for(int BlockSize = MinBlockSize ; BlockSize <= MaxBlockSize ; BlockSize *= 2){ for(int MatrixSize = MinMatrixSize ; MatrixSize <= MaxMatrixSize ; MatrixSize *= 2){ - const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads()); const int nbGpus = (idxGpu == -1 ? 0 : idxGpu); + const int nbCpus = std::max(1 , (idxGpu == -1 ? 
1 : SpUtils::DefaultNumThreads())-nbGpus); const bool useMultiprio = std::get<0>(useMultiprioAndPairs); const bool usePrioPairs = std::get<1>(useMultiprioAndPairs); @@ -509,8 +509,8 @@ int main(int argc, char** argv){ for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){ for(int BlockSize = MinBlockSize ; BlockSize <= MaxBlockSize ; BlockSize *= 2){ for(int MatrixSize = MinMatrixSize ; MatrixSize <= MaxMatrixSize ; MatrixSize *= 2){ - const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads()); const int nbGpus = (idxGpu == -1 ? 0 : idxGpu); + const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus); const bool useMultiprio = std::get<0>(useMultiprioAndPairs); const bool usePrioPairs = std::get<1>(useMultiprioAndPairs); diff --git a/Benchmark/cholesky_gemm/gemm-mpi.cpp b/Benchmark/cholesky_gemm/gemm-mpi.cpp index d2e01e82b04b012688257ed6c6da964095e9dd3e..8632cc3df2c1b9cf2c379d0a3e4e29f7145572b7 100644 --- a/Benchmark/cholesky_gemm/gemm-mpi.cpp +++ b/Benchmark/cholesky_gemm/gemm-mpi.cpp @@ -67,7 +67,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks else{ scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>(nbGpu*SpCudaUtils::GetDefaultNbStreams())); } - SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(SpUtils::DefaultNumThreads(), nbGpu), std::move(scheduler)); + SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(std::max(1, SpUtils::DefaultNumThreads()-nbGpu), nbGpu), std::move(scheduler)); #else SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers()); #endif diff --git a/Benchmark/cholesky_gemm/gemm.cpp b/Benchmark/cholesky_gemm/gemm.cpp index 49f7b34db0b54797240c18637808922e690385ab..27e9c07b2a559816f7f3406060b56cefbc213202 100644 --- a/Benchmark/cholesky_gemm/gemm.cpp +++ b/Benchmark/cholesky_gemm/gemm.cpp @@ -269,8 +269,8 @@ int main(int argc, char** argv){ for(int BlockSize = MinBlockSize ; BlockSize <= 
MaxBlockSize ; BlockSize *= 2){ for(int MatrixSize = MinMatrixSize ; MatrixSize <= MaxMatrixSize ; MatrixSize *= 2){ for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){ - const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads()); const int nbGpus = (idxGpu == -1 ? 0 : idxGpu); + const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus); const bool useMultiprio = std::get<0>(useMultiprioAndPairs); const bool useLocality = std::get<1>(useMultiprioAndPairs); @@ -342,8 +342,8 @@ int main(int argc, char** argv){ for(int BlockSize = MinBlockSize ; BlockSize <= MaxBlockSize ; BlockSize *= 2){ for(int MatrixSize = MinMatrixSize ; MatrixSize <= MaxMatrixSize ; MatrixSize *= 2){ for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){ - const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads()); const int nbGpus = (idxGpu == -1 ? 0 : idxGpu); + const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus); const bool useMultiprio = std::get<0>(useMultiprioAndPairs); const bool useLocality = std::get<1>(useMultiprioAndPairs); diff --git a/Benchmark/particles/particles-simu.cpp b/Benchmark/particles/particles-simu.cpp index 94adf1d79692458b03f25d82b917d5422da23b6d..dafb4ecaa6ca4280b10219904bcf5415d3acf46a 100644 --- a/Benchmark/particles/particles-simu.cpp +++ b/Benchmark/particles/particles-simu.cpp @@ -919,8 +919,8 @@ void BenchmarkTest(int argc, char** argv, const TuneResult& inKernelConfig){ for(auto useMultiprioAndPairs: schedPairConf){ for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){ for(int idxBlock = MinNbGroups ; idxBlock <= MaxNbGroups ; idxBlock *= 2){ - const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads()); const int nbGpus = (idxGpu == -1 ? 0 : idxGpu); + const int nbCpus = std::max(1 , (idxGpu == -1 ? 
1 : SpUtils::DefaultNumThreads())-nbGpus); const bool useMultiprio = std::get<0>(useMultiprioAndPairs); const bool usePrioPairs = std::get<1>(useMultiprioAndPairs); const bool useLocality = std::get<2>(useMultiprioAndPairs); @@ -954,8 +954,8 @@ void BenchmarkTest(int argc, char** argv, const TuneResult& inKernelConfig){ for(auto useMultiprioAndPairs: schedPairConf){ for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){ for(int idxBlock = MinNbGroups ; idxBlock <= MaxNbGroups ; idxBlock *= 2){ - const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads()); const int nbGpus = (idxGpu == -1 ? 0 : idxGpu); + const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus); const bool useMultiprio = std::get<0>(useMultiprioAndPairs); const bool usePrioPairs = std::get<1>(useMultiprioAndPairs); const bool useLocality = std::get<2>(useMultiprioAndPairs); diff --git a/Src/Compute/SpWorkerTeamBuilder.hpp b/Src/Compute/SpWorkerTeamBuilder.hpp index 9c02b7d60401a189cf06bd7ed60fd5417159de1f..833e618675b2ab97a714e61d91e677b2a0dc3d67 100644 --- a/Src/Compute/SpWorkerTeamBuilder.hpp +++ b/Src/Compute/SpWorkerTeamBuilder.hpp @@ -42,9 +42,11 @@ static small_vector<std::unique_ptr<SpWorker>> TeamOfCudaWorkers(const int nbWor return res; } -static small_vector<std::unique_ptr<SpWorker>> TeamOfCpuCudaWorkers(const int nbCpuWorkers = SpUtils::DefaultNumThreads(), +static small_vector<std::unique_ptr<SpWorker>> TeamOfCpuCudaWorkers(const int nbCpuWorkersInit = -1, int nbCudaWorkers = SpCudaUtils::GetNbDevices(), const int nbWorkerPerCudas = SpCudaUtils::GetDefaultNbStreams()) { + const int nbCpuWorkers = (nbCpuWorkersInit != -1 ? nbCpuWorkersInit : std::max(1, SpUtils::DefaultNumThreads()-nbCudaWorkers)); + if(SpCudaUtils::GetNbDevices() < nbCudaWorkers){ std::cout << "[SPECX] The number of devices asked (" << nbCudaWorkers << ") is above the real number of devices (" @@ -76,7 +78,7 @@ static auto TeamOfCpuGpuWorkers(Args&& ... 
args) { } #endif #ifdef SPECX_COMPILE_WITH_HIP -static small_vector<std::unique_ptr<SpWorker>> TeamOfHipWorkers(const int nbWorkerPerHips = SpHipUtils::GetDefaultNbStreams(), +static small_vector<std::unique_ptr<SpWorker>> TeamOfHipWorkers(const int nbWorkerPerHips = SpHipUtils::GetDefaultNbStreams(), int nbHipWorkers = SpHipUtils::GetNbDevices()) { if(SpHipUtils::GetNbDevices() < nbHipWorkers){ std::cout << "[SPECX] The number of devices asked (" @@ -99,9 +101,10 @@ static small_vector<std::unique_ptr<SpWorker>> TeamOfHipWorkers(const int nbWork return res; } -static small_vector<std::unique_ptr<SpWorker>> TeamOfCpuHipWorkers(const int nbCpuWorkers = SpUtils::DefaultNumThreads(), +static small_vector<std::unique_ptr<SpWorker>> TeamOfCpuHipWorkers(const int nbCpuWorkersInit = -1, const int nbWorkerPerHips = SpHipUtils::GetDefaultNbStreams(), int nbHipWorkers = SpHipUtils::GetNbDevices()) { + const int nbCpuWorkers = (nbCpuWorkersInit != -1 ? nbCpuWorkersInit : std::max(1, SpUtils::DefaultNumThreads()-nbHipWorkers)); /* -1 selects the automatic count: default thread count minus one per requested HIP device, never fewer than one CPU worker */ if(SpHipUtils::GetNbDevices() < nbHipWorkers){ std::cout << "[SPECX] The number of devices asked (" << nbHipWorkers << ") is above the real number of devices ("