diff --git a/Batch/run-benchs.sh b/Batch/run-benchs.sh
index 5c4c62d8ef99c5687a9a58a8954bc0807a9529bb..f9d9d1224f069b07559d5ed8ab600c0142d892f7 100644
--- a/Batch/run-benchs.sh
+++ b/Batch/run-benchs.sh
@@ -43,8 +43,6 @@ function main(){
         return 1
     fi
 
-    cd "/home/bramas/spetabaru-project/results/"
-
     # Create a directory to store the results
     # with the format results_[date]_[time]
     results_dir="results-$PREFIX-$(date +%Y%m%d_%H%M%S)"
@@ -56,18 +54,18 @@ function main(){
     NB_LOOPS=10
 
     # AXPY
-    "$RUN_DIR/Benchmark/axpy/axpy" --lp=$NB_LOOPS --minnbb=16 --maxnbb=256 --minbs=128 --maxbs=65536 --gputh=256 --od="$results_dir" >> "$results_dir/output_axpy.txt"
+    "$RUN_DIR/Benchmark/axpy/axpy" --lp=$NB_LOOPS --minnbb=256 --maxnbb=1024 --minbs=131072 --maxbs=524288 --gputh=256 --od="$results_dir" >> "$results_dir/output_axpy.txt"
    remove_core_files
 
    # Cholesky/gemm
-    "$RUN_DIR/Benchmark/cholesky_gemm/cholesky" --lp=$NB_LOOPS --minms=4096 --maxms=8192 --minbs=128 --maxbs=512 --od="$results_dir" >> "$results_dir/output_cholesky.txt"
+    "$RUN_DIR/Benchmark/cholesky_gemm/cholesky" --lp=$NB_LOOPS --minms=4096 --maxms=16384 --minbs=128 --maxbs=512 --od="$results_dir" >> "$results_dir/output_cholesky.txt"
    remove_core_files
 
-    "$RUN_DIR/Benchmark/cholesky_gemm/gemm" --lp=$NB_LOOPS --minms=4096 --maxms=8192 --minbs=128 --maxbs=512 --od="$results_dir" >> "$results_dir/output_gemm.txt"
+    "$RUN_DIR/Benchmark/cholesky_gemm/gemm" --lp=$NB_LOOPS --minms=4096 --maxms=16384 --minbs=128 --maxbs=512 --od="$results_dir" >> "$results_dir/output_gemm.txt"
    remove_core_files
 
    # Particles
-    "$RUN_DIR/Benchmark/particles/particles-simu" --lp=$NB_LOOPS --minp=500 --maxp=8000 --minnbgroups=128 --maxnbgroups=512 --od="$results_dir" >> "$results_dir/output_particles.txt"
+    "$RUN_DIR/Benchmark/particles/particles-simu" --lp=$NB_LOOPS --minp=500 --maxp=512 --minnbgroups=128 --maxnbgroups=1024 --od="$results_dir" >> "$results_dir/output_particles.txt"
    remove_core_files
 }
diff --git a/Benchmark/axpy/axpy.cpp b/Benchmark/axpy/axpy.cpp
index de4f1b31507807b95fb331ea63a8b809184b0c46..448bd33c1cc5116a96f86f6097efe713bc08ee2e 100644
--- a/Benchmark/axpy/axpy.cpp
+++ b/Benchmark/axpy/axpy.cpp
@@ -76,7 +76,8 @@ __global__ void cu_axpy(int n, NumType a, NumType *x, NumType *y, NumType *out)
 #endif
 
-auto BenchmarkTest(const int NbLoops, const int nbGpu, const int nbblocks, const int blocksize, const int gpunbthreads,
+template <int MaxNbDevices, const bool FavorLocality>
+auto BenchmarkTest(const int NbLoops, const int nbCpu, const int nbGpu, const int nbblocks, const int blocksize, const int gpunbthreads,
                    const bool useMultiPrioScheduler){
     std::vector<Vector<float>> x(nbblocks, Vector<float>(blocksize, 1));
     std::vector<Vector<float>> y(nbblocks, Vector<float>(blocksize, 1));
@@ -89,11 +90,11 @@ auto BenchmarkTest(const int NbLoops, const int nbGpu, const int nbblocks, const
         scheduler = std::unique_ptr<SpAbstractScheduler>(new SpHeterogeneousPrioScheduler());
     }
     else{
-        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler());
+        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>(nbGpu*SpCudaUtils::GetDefaultNbStreams()));
     }
-    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(SpUtils::DefaultNumThreads(), nbGpu), std::move(scheduler));
+    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(nbCpu, nbGpu), std::move(scheduler));
 #else
-    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers());
+    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers(nbCpu));
 #endif
 
     std::vector<double> minMaxAvg(3);
@@ -123,6 +124,7 @@ auto BenchmarkTest(const int NbLoops, const int nbGpu, const int nbblocks, const
                     const int gpunbblocks = (size + gpunbthreads-1)/gpunbthreads;
                     cu_axpy<float><<<gpunbblocks, gpunbthreads,0,SpCudaUtils::GetCurrentStream()>>>
                             (size, a, (float*)paramX.getRawPtr(), (float*)paramY.getRawPtr(), (float*)paramZ.getRawPtr());
+                    CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
                 })
 #endif
 #ifdef SPECX_COMPILE_WITH_HIP
@@ -133,6 +135,7 @@ auto BenchmarkTest(const int NbLoops, const int nbGpu, const int nbblocks, const
                     const int gpunbblocks = (size + gpunbthreads-1)/gpunbthreads;
                     hipLaunchKernelGGL( cu_axpy<float>, gpunbblocks, gpunbthreads,0,SpHipUtils::GetCurrentStream(),
                             size, a, (float*)paramX.getRawPtr(), (float*)paramY.getRawPtr(), (float*)paramZ.getRawPtr());
+                    HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
                 })
 #endif
         );
@@ -242,13 +245,27 @@ int main(int argc, char** argv){
 
     std::vector<std::vector<double>> allDurations;
 
-    for(bool useMultiprio: std::vector<bool>{true, false}){
-        for(int idxGpu = 0 ; idxGpu <= nbGpus ; ++idxGpu){
+    const auto schedPairConf = std::vector<std::tuple<bool,bool,bool>>{std::make_tuple(false, false,false),
+                                                                      std::make_tuple(true, false, false),
+                                                                      std::make_tuple(true, true, false),
+                                                                      std::make_tuple(true, true, true),
+                                                                      std::make_tuple(true, false, true)};
+
+    for(auto useMultiprioAndPairs: schedPairConf){
+        for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
+            const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
+            const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+            const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
+            const bool usePrioPairs = std::get<1>(useMultiprioAndPairs);
+            const bool useLocality = std::get<2>(useMultiprioAndPairs);
             std::cout << " - Gpu = " << idxGpu << " Multiprio = " << (useMultiprio?"TRUE":"FALSE") << std::endl;
             for(int idxNbBlocks = minnbblocks ; idxNbBlocks <= maxnbblocks ; idxNbBlocks *= 2){
                 for(int idxSize = minblocksize ; idxSize <= maxblocksize ; idxSize *= 2){
+                    std::cout << " - NbBlocks = " << idxNbBlocks << " BlockSize = " << idxSize << std::endl;
-                    const auto minMaxAvg = BenchmarkTest(NbLoops, idxGpu, idxNbBlocks, idxSize, gpunbthreads, useMultiprio);
+                    const auto minMaxAvg = (useLocality ?
+                                               BenchmarkTest<8,true>(NbLoops, nbCpus, nbGpus, idxNbBlocks, idxSize, gpunbthreads, useMultiprio)
+                                               : BenchmarkTest<8,false>(NbLoops, nbCpus, nbGpus, idxNbBlocks, idxSize, gpunbthreads, useMultiprio));
                     std::cout << " - Duration = " << minMaxAvg[0] << " " << minMaxAvg[1] << " " << minMaxAvg[2] << std::endl;
                     std::cout << " - Transfers = " << minMaxAvg[3] << " " << minMaxAvg[4] << " " << minMaxAvg[5] << " " << minMaxAvg[6] << " " << minMaxAvg[7] << std::endl;
                     std::cout << " - End" << std::endl;
@@ -265,16 +282,22 @@ int main(int argc, char** argv){
         return 1;
     }
 
-    file << "NbGpu,NbBlocks,BlockSize,Multiprio,PrioPair,FavorLocality,MinDuration,MaxDuration,AvgDuration,TotalTransfer,MaxTransfer,DeviceToHostTransfer,HostToDeviceTransfer,DeviceToDeviceTransfer" << std::endl;
+    file << "NbCpus,NbGpu,NbBlocks,BlockSize,Multiprio,PrioPair,FavorLocality,MinDuration,MaxDuration,AvgDuration,TotalTransfer,MaxTransfer,DeviceToHostTransfer,HostToDeviceTransfer,DeviceToDeviceTransfer" << std::endl;
 
     int idxDuration = 0;
-    for(bool useMultiprio: std::vector<bool>{true, false}){
-        for(int idxGpu = 0 ; idxGpu <= nbGpus ; ++idxGpu){
+    for(auto useMultiprioAndPairs: schedPairConf){
+        const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
+        const bool usePrioPairs = std::get<1>(useMultiprioAndPairs);
+        const bool useLocality = std::get<2>(useMultiprioAndPairs);
+        for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
             for(int idxNbBlocks = minnbblocks ; idxNbBlocks <= maxnbblocks ; idxNbBlocks *= 2){
                 for(int idxSize = minblocksize ; idxSize <= maxblocksize ; idxSize *= 2){
-                    file << idxGpu << "," << idxNbBlocks << "," << idxSize << ","
-                         << (useMultiprio?"TRUE":"FALSE") << ","
-                         << "FALSE" << ","
-                         << "FALSE" << ","
+                    const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
+                    const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+
+                    file << nbCpus << "," << nbGpus << "," << idxNbBlocks << "," << idxSize << ","
+                         << (useMultiprio?"TRUE":"FALSE") << ","
+                         << (usePrioPairs?"TRUE":"FALSE") << ","
+                         << (useLocality?"TRUE":"FALSE") << ","
                          << allDurations[idxDuration][0] << ","
                          << allDurations[idxDuration][1] << ","
                          << allDurations[idxDuration][2] << ","
diff --git a/Benchmark/cholesky_gemm/CMakeLists.txt b/Benchmark/cholesky_gemm/CMakeLists.txt
index c743c852249f13f08024ec0ba70091cbedea257e..55f4cc58a498d1dc48b602591d176a5a2150bb6c 100644
--- a/Benchmark/cholesky_gemm/CMakeLists.txt
+++ b/Benchmark/cholesky_gemm/CMakeLists.txt
@@ -14,6 +14,13 @@ find_package(BLAS)
 find_package(LAPACK)
 
 if(BLAS_FOUND AND LAPACK_FOUND)
+    # If libs is related to MKL
+    if(${BLAS_LIBRARIES} MATCHES "mkl")
+        # We replace "intel_thread" by "sequential"
+        string(REPLACE "intel_thread" "sequential" BLAS_LIBRARIES ${BLAS_LIBRARIES})
+        string(REPLACE "intel_thread" "sequential" LAPACK_LIBRARIES ${LAPACK_LIBRARIES})
+    endif()
+
     if($ENV{VERBOSE})
         MESSAGE(STATUS "Benchmark CHOLESKY_GEMM -- BLAS_LIBRARIES : ${BLAS_LIBRARIES}")
         MESSAGE(STATUS "Benchmark CHOLESKY_GEMM -- BLAS_LINKER_FLAGS : ${BLAS_LINKER_FLAGS}")
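Note: the benchmark drivers now sweep a fixed set of (Multiprio, PrioPair, FavorLocality) scheduler configurations, plus an extra idxGpu == -1 point that acts as a sequential baseline (one CPU worker, zero GPUs). A minimal standalone sketch of how the configuration tuples decode; the device count and thread count below are placeholders for the real nbGpus and SpUtils::DefaultNumThreads():

    #include <iostream>
    #include <tuple>
    #include <vector>

    int main(){
        // Same configurations as in the benchmarks: (useMultiprio, usePrioPairs, useLocality).
        const std::vector<std::tuple<bool,bool,bool>> schedPairConf{
            std::make_tuple(false, false, false), // heterogeneous-prio scheduler
            std::make_tuple(true,  false, false), // multi-prio
            std::make_tuple(true,  true,  false), // multi-prio + priority pairs
            std::make_tuple(true,  true,  true),  // multi-prio + pairs + locality
            std::make_tuple(true,  false, true)}; // multi-prio + locality
        const int nbGpus = 2;          // placeholder device count
        const int defaultThreads = 8;  // placeholder for SpUtils::DefaultNumThreads()
        for(const auto& conf : schedPairConf){
            // idxGpu == -1 encodes the sequential baseline: one CPU worker, no GPU.
            for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
                const int nbCpuWorkers = (idxGpu == -1 ? 1 : defaultThreads);
                const int nbGpuWorkers = (idxGpu == -1 ? 0 : idxGpu);
                std::cout << nbCpuWorkers << " CPU / " << nbGpuWorkers << " GPU,"
                          << " multiprio=" << std::get<0>(conf)
                          << " pairs=" << std::get<1>(conf)
                          << " locality=" << std::get<2>(conf) << "\n";
            }
        }
        return 0;
    }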
(W-"+std::to_string(m)+","+std::to_string(k)+")"); @@ -256,6 +260,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const CUBLAS_ASSERT( cublasDsyrk( handle.blasHandle, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, inBlockDim, inBlockDim, &minusone, (const double*)paramA.getRawPtr(), inBlockDim, &one, (double*)paramC.getRawPtr(), inBlockDim ) ); + CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream())); }) #endif #ifdef SPECX_COMPILE_WITH_HIP @@ -267,6 +272,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const HIPBLAS_ASSERT(hipblasDsyrk( handle.blasHandle, HIPBLAS_FILL_MODE_UPPER, HIPBLAS_OP_N, inBlockDim, inBlockDim, &minusone, (const double*)paramA.getRawPtr(), inBlockDim, &one, (double*)paramC.getRawPtr(), inBlockDim ) ); + HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream())); }) #endif ).setTaskName(std::string("SYRK -- (R-")+std::to_string(n)+","+std::to_string(k)+") (W-"+std::to_string(n)+","+std::to_string(n)+")"); @@ -296,6 +302,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const inBlockDim, inBlockDim, inBlockDim, &one, (const double*)paramA.getRawPtr(), inBlockDim, (const double*)paramB.getRawPtr(), inBlockDim, &minusone, (double*)paramC.getRawPtr(), inBlockDim ) ); + CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream())); }) #endif #ifdef SPECX_COMPILE_WITH_HIP @@ -307,6 +314,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const inBlockDim, inBlockDim, inBlockDim, &one, (const double*)paramA.getRawPtr(), inBlockDim, (const double*)paramB.getRawPtr(), inBlockDim, &minusone, (double*)paramC.getRawPtr(), inBlockDim ) ); + HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream())); }) #endif ).setTaskName(std::string("GEMM -- (R-")+std::to_string(m)+","+std::to_string(k)+")(R-"+std::to_string(n)+","+std::to_string(k)+")(W-"+std::to_string(m)+","+std::to_string(n)+")"); diff --git a/Benchmark/cholesky_gemm/cholesky.cpp b/Benchmark/cholesky_gemm/cholesky.cpp index 26764d6314e757cd1e3e66f3a39881864af58dc7..6e85cb174a53327b43766a09401c95f479b6ca43 100644 --- a/Benchmark/cholesky_gemm/cholesky.cpp +++ b/Benchmark/cholesky_gemm/cholesky.cpp @@ -58,7 +58,7 @@ thread_local HipHandles handle; template <int MaxNbDevices, const bool FavorLocality> auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const int inMatrixDim, const int inBlockDim, - const int nbGpu, const bool useMultiPrioScheduler, const bool usePrioPairs){ + const int nbCpu, const int nbGpu, const bool useMultiPrioScheduler, const bool usePrioPairs){ const int nbBlocks = (inMatrixDim+inBlockDim-1)/inBlockDim; #if defined(SPECX_COMPILE_WITH_CUDA) || defined(SPECX_COMPILE_WITH_HIP) @@ -67,11 +67,11 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const scheduler = std::unique_ptr<SpAbstractScheduler>(new SpHeterogeneousPrioScheduler()); } else{ - scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>()); + scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>(nbGpu*SpCudaUtils::GetDefaultNbStreams())); } - SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(SpUtils::DefaultNumThreads(), nbGpu), std::move(scheduler)); + SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(nbCpu, nbGpu), std::move(scheduler)); #else - SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers()); + SpComputeEngine 
diff --git a/Benchmark/cholesky_gemm/cholesky.cpp b/Benchmark/cholesky_gemm/cholesky.cpp
index 26764d6314e757cd1e3e66f3a39881864af58dc7..6e85cb174a53327b43766a09401c95f479b6ca43 100644
--- a/Benchmark/cholesky_gemm/cholesky.cpp
+++ b/Benchmark/cholesky_gemm/cholesky.cpp
@@ -58,7 +58,7 @@ thread_local HipHandles handle;
 template <int MaxNbDevices, const bool FavorLocality>
 auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const int inMatrixDim, const int inBlockDim,
-                           const int nbGpu, const bool useMultiPrioScheduler, const bool usePrioPairs){
+                           const int nbCpu, const int nbGpu, const bool useMultiPrioScheduler, const bool usePrioPairs){
     const int nbBlocks = (inMatrixDim+inBlockDim-1)/inBlockDim;
 
 #if defined(SPECX_COMPILE_WITH_CUDA) || defined(SPECX_COMPILE_WITH_HIP)
@@ -67,11 +67,11 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
         scheduler = std::unique_ptr<SpAbstractScheduler>(new SpHeterogeneousPrioScheduler());
     }
     else{
-        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>());
+        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>(nbGpu*SpCudaUtils::GetDefaultNbStreams()));
     }
-    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(SpUtils::DefaultNumThreads(), nbGpu), std::move(scheduler));
+    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(nbCpu, nbGpu), std::move(scheduler));
 #else
-    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers());
+    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers(nbCpu));
 #endif
 
 #ifdef SPECX_COMPILE_WITH_CUDA
@@ -162,6 +162,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                                                  handle.solverBuffer,
                                                  handle.solverBuffSize,
                                                  handle.cuinfo));
+                CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
 #ifndef NDEBUG
                 int info;
                 CUDA_ASSERT(cudaMemcpy(&info, handle.cuinfo, sizeof(int), cudaMemcpyDeviceToHost));
@@ -180,6 +181,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                                                  handle.solverBuffer,
                                                  handle.solverBuffSize,
                                                  handle.cuinfo));
+                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
 #ifndef NDEBUG
                 int info;
                 HIP_ASSERT(hipMemcpy(&info, handle.cuinfo, sizeof(int), hipMemcpyDeviceToHost));
@@ -206,6 +208,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                                         CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, inBlockDim, inBlockDim, &one,
                                         (const double*)paramA.getRawPtr(), inBlockDim,
                                         (double*)paramB.getRawPtr(), inBlockDim));
+                CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
             })
 #endif
 #ifdef SPECX_COMPILE_WITH_HIP
@@ -216,6 +219,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                                         HIPBLAS_OP_T, HIPBLAS_DIAG_NON_UNIT, inBlockDim, inBlockDim, &one,
                                         (const double*)paramA.getRawPtr(), inBlockDim,
                                         (double*)paramB.getRawPtr(), inBlockDim));
+                HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
             })
 #endif
         ).setTaskName(std::string("TRSM -- (R-")+std::to_string(k)+","+std::to_string(k)+") (W-"+std::to_string(m)+","+std::to_string(k)+")");
@@ -239,6 +243,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                     CUBLAS_ASSERT( cublasDsyrk( handle.blasHandle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N,
                                         inBlockDim, inBlockDim, &minusone, (const double*)paramA.getRawPtr(), inBlockDim,
                                         &one, (double*)paramC.getRawPtr(), inBlockDim ) );
+                    CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
                 })
 #endif
 #ifdef SPECX_COMPILE_WITH_HIP
@@ -249,6 +254,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                     HIPBLAS_ASSERT(hipblasDsyrk( handle.blasHandle, HIPBLAS_FILL_MODE_LOWER, HIPBLAS_OP_N,
                                         inBlockDim, inBlockDim, &minusone, (const double*)paramA.getRawPtr(), inBlockDim,
                                         &one, (double*)paramC.getRawPtr(), inBlockDim ) );
+                    HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
                 })
 #endif
             ).setTaskName(std::string("SYRK -- (R-")+std::to_string(n)+","+std::to_string(k)+") (W-"+std::to_string(n)+","+std::to_string(n)+")");
@@ -271,6 +277,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                                         inBlockDim, inBlockDim, inBlockDim, &one, (const double*)paramA.getRawPtr(), inBlockDim,
                                         (const double*)paramB.getRawPtr(), inBlockDim,
                                         &minusone, (double*)paramC.getRawPtr(), inBlockDim ) );
+                    CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
                 })
 #endif
 #ifdef SPECX_COMPILE_WITH_HIP
@@ -282,6 +289,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
                                         inBlockDim, inBlockDim, inBlockDim, &one, (const double*)paramA.getRawPtr(), inBlockDim,
                                         (const double*)paramB.getRawPtr(), inBlockDim,
                                         &minusone, (double*)paramC.getRawPtr(), inBlockDim ) );
+                    HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
                 })
 #endif
             ).setTaskName(std::string("GEMM -- (R-")+std::to_string(m)+","+std::to_string(k)+")(R-"+std::to_string(n)+","+std::to_string(k)+")(W-"+std::to_string(m)+","+std::to_string(n)+")");
@@ -426,9 +434,12 @@ int main(int argc, char** argv){
                                            std::make_tuple(true, false, true)};
 
     for(auto useMultiprioAndPairs: schedPairConf){
-        for(int idxGpu = 0 ; idxGpu <= nbGpus ; ++idxGpu){
+        for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
             for(int BlockSize = MinBlockSize ; BlockSize <= MaxBlockSize ; BlockSize *= 2){
                 for(int MatrixSize = MinMatrixSize ; MatrixSize <= MaxMatrixSize ; MatrixSize *= 2){
+                    const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
+                    const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+
                     const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                     const bool usePrioPairs = std::get<1>(useMultiprioAndPairs);
                     const bool useLocality = std::get<2>(useMultiprioAndPairs);
@@ -460,9 +471,9 @@ int main(int argc, char** argv){
                     /////////////////////////////////////////////////////////
                     const auto minMaxAvg = (useLocality ?
                                                choleskyFactorization<8,true>(NbLoops, blocks.get(), MatrixSize, BlockSize,
-                                                                             idxGpu, useMultiprio, usePrioPairs) :
+                                                                             nbCpus, nbGpus, useMultiprio, usePrioPairs) :
                                                choleskyFactorization<8,false>(NbLoops, blocks.get(), MatrixSize, BlockSize,
-                                                                              idxGpu, useMultiprio, usePrioPairs));
+                                                                              nbCpus, nbGpus, useMultiprio, usePrioPairs));
                     allDurations.push_back(minMaxAvg);
                     std::cout << " - Duration = " << minMaxAvg[0] << " " << minMaxAvg[1] << " " << minMaxAvg[2] << std::endl;
                     std::cout << " - Transfers = " << minMaxAvg[3] << " " << minMaxAvg[4] << " " << minMaxAvg[5] << " " << minMaxAvg[6] << " " << minMaxAvg[7] << std::endl;
@@ -492,17 +503,20 @@ int main(int argc, char** argv){
         return 1;
     }
 
-    file << "NbGpu,MatrixSize,BlockSize,Multiprio,PrioPair,FavorLocality,MinDuration,MaxDuration,AvgDuration,TotalTransfer,MaxTransfer,DeviceToHostTransfer,HostToDeviceTransfer,DeviceToDeviceTransfer" << std::endl;
+    file << "NbCpu,NbGpu,MatrixSize,BlockSize,Multiprio,PrioPair,FavorLocality,MinDuration,MaxDuration,AvgDuration,TotalTransfer,MaxTransfer,DeviceToHostTransfer,HostToDeviceTransfer,DeviceToDeviceTransfer" << std::endl;
 
     int idxDuration = 0;
     for(auto useMultiprioAndPairs: schedPairConf){
-        for(int idxGpu = 0 ; idxGpu <= nbGpus ; ++idxGpu){
+        for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
             for(int BlockSize = MinBlockSize ; BlockSize <= MaxBlockSize ; BlockSize *= 2){
                 for(int MatrixSize = MinMatrixSize ; MatrixSize <= MaxMatrixSize ; MatrixSize *= 2){
+                    const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
+                    const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+
                     const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                     const bool usePrioPairs = std::get<1>(useMultiprioAndPairs);
                     const bool useLocality = std::get<2>(useMultiprioAndPairs);
-                    file << idxGpu << "," << MatrixSize << "," << BlockSize << ","
+                    file << nbCpus << "," << nbGpus << "," << MatrixSize << "," << BlockSize << ","
                          << (useMultiprio?"TRUE":"FALSE") << ","
                          << (usePrioPairs?"TRUE":"FALSE") << ","
                          << (useLocality?"TRUE":"FALSE") << ","
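Note: SpMultiPrioScheduler is now constructed with nbGpu*SpCudaUtils::GetDefaultNbStreams(), i.e. one GPU worker per stream per device; that count feeds the CPU-throttling heuristic added in SpMultiPrioScheduler.hpp at the end of the patch. A hedged sketch of the arithmetic, with a placeholder stream count standing in for the runtime's configured value:

    #include <iostream>

    // Placeholder for SpCudaUtils::GetDefaultNbStreams(); the real value comes
    // from the Specx runtime configuration.
    constexpr int kDefaultNbStreamsPerDevice = 4;

    int gpuWorkerCount(int nbGpu){
        // One GPU worker drives one stream, so the scheduler sees
        // nbGpu * streams-per-device workers competing for GPU-capable tasks.
        return nbGpu * kDefaultNbStreamsPerDevice;
    }

    int main(){
        for(int nbGpu = 0 ; nbGpu <= 2 ; ++nbGpu){
            std::cout << nbGpu << " GPU(s) -> " << gpuWorkerCount(nbGpu)
                      << " GPU workers\n";
        }
        return 0;
    }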
&alphaBeta, (const double*)paramA.getRawPtr(), inBlockDim, (const double*)paramB.getRawPtr(), inBlockDim, &alphaBeta, (double*)paramC.getRawPtr(), inBlockDim ) ); + CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream())); }) #endif #ifdef SPECX_COMPILE_WITH_HIP @@ -214,6 +217,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks inBlockDim, inBlockDim, inBlockDim, &alphaBeta, (const double*)paramA.getRawPtr(), inBlockDim, (const double*)paramB.getRawPtr(), inBlockDim, &alphaBeta, (double*)paramC.getRawPtr(), inBlockDim ) ); + HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream())); }) #endif ).setTaskName(std::string("GEMM -- (")+std::to_string(i)+","+std::to_string(j)+")"); @@ -236,7 +240,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks for(int k = 0 ; k < processBlockDim ; ++k){ for(int i = 0 ; i < processBlockDim ; ++i){ for(int j = 0 ; j < processBlockDim ; ++j){ - tg.task(SpPriority(1), SpCommutativeWrite(blocksC[i*processBlockDim+j]), + tg.task(SpPriority(1), SpWrite(blocksC[i*processBlockDim+j]), SpRead(buffersA[k*processBlockDim+j]), SpRead(blocksB[i*processBlockDim+k]), SpCpu([inBlockDim](SpBlas::Block& blockC, const SpBlas::Block& blockA, const SpBlas::Block& blockB){ SpBlas::gemm( SpBlas::Transa::NORMAL, SpBlas::Transa::NORMAL, @@ -253,6 +257,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks inBlockDim, inBlockDim, inBlockDim, &alphaBeta, (const double*)paramA.getRawPtr(), inBlockDim, (const double*)paramB.getRawPtr(), inBlockDim, &alphaBeta, (double*)paramC.getRawPtr(), inBlockDim ) ); + CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream())); }) #endif #ifdef SPECX_COMPILE_WITH_HIP @@ -264,6 +269,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks inBlockDim, inBlockDim, inBlockDim, &alphaBeta, (const double*)paramA.getRawPtr(), inBlockDim, (const double*)paramB.getRawPtr(), inBlockDim, &alphaBeta, (double*)paramC.getRawPtr(), inBlockDim ) ); + HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream())); }) #endif ).setTaskName(std::string("GEMM -- (")+std::to_string(i)+","+std::to_string(j)+")"); diff --git a/Benchmark/cholesky_gemm/gemm.cpp b/Benchmark/cholesky_gemm/gemm.cpp index 0098fc30624fa229786f49e48c7b75923c24d974..49f7b34db0b54797240c18637808922e690385ab 100644 --- a/Benchmark/cholesky_gemm/gemm.cpp +++ b/Benchmark/cholesky_gemm/gemm.cpp @@ -46,8 +46,9 @@ thread_local hipblasHandle_t handle; template <int MaxNbDevices, const bool FavorLocality> auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocksA[], const SpBlas::Block blocksB[], const int inMatrixDim, const int inBlockDim, - const int nbGpu, const bool useMultiPrioScheduler){ + const int nbCpus, const int nbGpu, const bool useMultiPrioScheduler){ const int nbBlocks = (inMatrixDim+inBlockDim-1)/inBlockDim; + const bool exportTrace = false; #if defined(SPECX_COMPILE_WITH_CUDA) || defined(SPECX_COMPILE_WITH_HIP) std::unique_ptr<SpAbstractScheduler> scheduler; @@ -55,11 +56,11 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks scheduler = std::unique_ptr<SpAbstractScheduler>(new SpHeterogeneousPrioScheduler()); } else{ - scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>()); + scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>(nbGpu*SpCudaUtils::GetDefaultNbStreams())); } - 
diff --git a/Benchmark/cholesky_gemm/gemm.cpp b/Benchmark/cholesky_gemm/gemm.cpp
index 0098fc30624fa229786f49e48c7b75923c24d974..49f7b34db0b54797240c18637808922e690385ab 100644
--- a/Benchmark/cholesky_gemm/gemm.cpp
+++ b/Benchmark/cholesky_gemm/gemm.cpp
@@ -46,8 +46,9 @@ thread_local hipblasHandle_t handle;
 template <int MaxNbDevices, const bool FavorLocality>
 auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocksA[], const SpBlas::Block blocksB[],
           const int inMatrixDim, const int inBlockDim,
-          const int nbGpu, const bool useMultiPrioScheduler){
+          const int nbCpus, const int nbGpu, const bool useMultiPrioScheduler){
     const int nbBlocks = (inMatrixDim+inBlockDim-1)/inBlockDim;
+    const bool exportTrace = false;
 
 #if defined(SPECX_COMPILE_WITH_CUDA) || defined(SPECX_COMPILE_WITH_HIP)
     std::unique_ptr<SpAbstractScheduler> scheduler;
@@ -55,11 +56,11 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks
         scheduler = std::unique_ptr<SpAbstractScheduler>(new SpHeterogeneousPrioScheduler());
     }
     else{
-        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>());
+        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>(nbGpu*SpCudaUtils::GetDefaultNbStreams()));
     }
-    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(SpUtils::DefaultNumThreads(), nbGpu), std::move(scheduler));
+    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(nbCpus, nbGpu), std::move(scheduler));
 #else
-    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers());
+    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers(nbCpus));
 #endif
 
 #ifdef SPECX_COMPILE_WITH_CUDA
@@ -99,7 +100,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks
     for(int i = 0 ; i < nbBlocks ; ++i){
         for(int j = 0 ; j < nbBlocks ; ++j){
             for(int k = 0 ; k < nbBlocks ; ++k){
-                tg.task(SpPriority(1), SpCommutativeWrite(blocksC[i*nbBlocks+j]),
+                tg.task(SpPriority(1), SpWrite(blocksC[i*nbBlocks+j]),
                         SpRead(blocksA[k*nbBlocks+j]), SpRead(blocksB[i*nbBlocks+k]),
                 SpCpu([inBlockDim](SpBlas::Block& blockC, const SpBlas::Block& blockA, const SpBlas::Block& blockB){
                     SpBlas::gemm( SpBlas::Transa::NORMAL, SpBlas::Transa::NORMAL,
@@ -116,6 +117,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks
                                         inBlockDim, inBlockDim, inBlockDim, &alphaBeta, (const double*)paramA.getRawPtr(), inBlockDim,
                                         (const double*)paramB.getRawPtr(), inBlockDim,
                                         &alphaBeta, (double*)paramC.getRawPtr(), inBlockDim ) );
+                    CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
                 })
 #endif
 #ifdef SPECX_COMPILE_WITH_HIP
@@ -127,6 +129,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks
                                         inBlockDim, inBlockDim, inBlockDim, &alphaBeta, (const double*)paramA.getRawPtr(), inBlockDim,
                                         (const double*)paramB.getRawPtr(), inBlockDim,
                                         &alphaBeta, (double*)paramC.getRawPtr(), inBlockDim ) );
+                    HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
                 })
 #endif
                 ).setTaskName(std::string("GEMM -- (")+std::to_string(i)+","+std::to_string(j)+")");
@@ -138,6 +141,11 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks
         tg.waitAllTasks();
         timer.stop();
 
+        if(exportTrace){
+            static int cptTrace = 0;
+            tg.generateTrace("./trace" + std::to_string(cptTrace++) + "-" + std::to_string(idxLoop) + ".svg");
+        }
+
 #if defined(SPECX_COMPILE_WITH_CUDA) || defined(SPECX_COMPILE_WITH_HIP)
         for(int i = 0 ; i < nbBlocks ; ++i){
             for(int j = 0 ; j < nbBlocks ; ++j){
@@ -260,7 +268,10 @@ int main(int argc, char** argv){
     for(auto useMultiprioAndPairs: schedPairConf){
         for(int BlockSize = MinBlockSize ; BlockSize <= MaxBlockSize ; BlockSize *= 2){
             for(int MatrixSize = MinMatrixSize ; MatrixSize <= MaxMatrixSize ; MatrixSize *= 2){
-                for(int idxGpu = 0 ; idxGpu <= nbGpus ; ++idxGpu){
+                for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
+                    const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
+                    const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+
                     const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                     const bool useLocality = std::get<1>(useMultiprioAndPairs);
@@ -293,9 +304,9 @@ int main(int argc, char** argv){
                     auto blocksC = SpBlas::matrixToBlock(matrixC.get(), MatrixSize, BlockSize);
 
                     const auto minMaxAvg = (useLocality ?
                                                gemm<8,true>(NbLoops, blocksC.get(), blocksA.get(), blocksB.get(),
-                                                            MatrixSize, BlockSize, idxGpu, useMultiprio):
+                                                            MatrixSize, BlockSize, nbCpus, nbGpus, useMultiprio):
                                                gemm<8,false>(NbLoops, blocksC.get(), blocksA.get(), blocksB.get(),
-                                                             MatrixSize, BlockSize, idxGpu, useMultiprio));
+                                                             MatrixSize, BlockSize, nbCpus, nbGpus, useMultiprio));
                     allDurations.push_back(minMaxAvg);
                     std::cout << " - Duration = " << minMaxAvg[0] << " " << minMaxAvg[1] << " " << minMaxAvg[2] << std::endl;
                     std::cout << " - Transfers = " << minMaxAvg[3] << " " << minMaxAvg[4] << " " << minMaxAvg[5] << " " << minMaxAvg[6] << " " << minMaxAvg[7] << std::endl;
@@ -325,16 +336,19 @@ int main(int argc, char** argv){
         return 1;
     }
 
-    file << "NbGpu,MatrixSize,BlockSize,Multiprio,PrioPair,FavorLocality,MinDuration,MaxDuration,AvgDuration,TotalTransfer,MaxTransfer,DeviceToHostTransfer,HostToDeviceTransfer,DeviceToDeviceTransfer" << std::endl;
+    file << "NbCpu,NbGpu,MatrixSize,BlockSize,Multiprio,PrioPair,FavorLocality,MinDuration,MaxDuration,AvgDuration,TotalTransfer,MaxTransfer,DeviceToHostTransfer,HostToDeviceTransfer,DeviceToDeviceTransfer" << std::endl;
 
     int idxDuration = 0;
     for(auto useMultiprioAndPairs: schedPairConf){
         for(int BlockSize = MinBlockSize ; BlockSize <= MaxBlockSize ; BlockSize *= 2){
             for(int MatrixSize = MinMatrixSize ; MatrixSize <= MaxMatrixSize ; MatrixSize *= 2){
-                for(int idxGpu = 0 ; idxGpu <= nbGpus ; ++idxGpu){
+                for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
+                    const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
+                    const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+
                     const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                     const bool useLocality = std::get<1>(useMultiprioAndPairs);
-                    file << idxGpu << "," << MatrixSize << "," << BlockSize << ","
+                    file << nbCpus << "," << nbGpus << "," << MatrixSize << "," << BlockSize << ","
                          << (useMultiprio?"TRUE":"FALSE") << ","
                          << "FALSE" << ","
                          << (useLocality?"TRUE":"FALSE") << ","
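Note: gemm.cpp gains an exportTrace switch (off by default) that dumps one SVG trace per graph execution via tg.generateTrace(...). A standalone sketch of the same gating pattern; the stub payload merely stands in for the Specx trace generator:

    #include <fstream>
    #include <string>

    // Same gating as the exportTrace block added in gemm.cpp: a constant flag,
    // a static counter so successive calls write distinct files, and the loop
    // index in the name. Writing a placeholder SVG stands in for
    // tg.generateTrace(...).
    void maybeExportTrace(bool exportTrace, int idxLoop){
        if(exportTrace){
            static int cptTrace = 0;
            const std::string name = "./trace" + std::to_string(cptTrace++) +
                                     "-" + std::to_string(idxLoop) + ".svg";
            std::ofstream(name) << "<svg xmlns='http://www.w3.org/2000/svg'/>\n";
        }
    }

    int main(){
        for(int idxLoop = 0 ; idxLoop < 3 ; ++idxLoop){
            maybeExportTrace(true, idxLoop);
        }
        return 0;
    }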
diff --git a/Benchmark/particles/particles-simu-mpi.cpp b/Benchmark/particles/particles-simu-mpi.cpp
index 8094384d74066246a418cd365f1ff26d2d1fdcfe..a5afc99ed2cc8223941d6210885550677d5f834f 100644
--- a/Benchmark/particles/particles-simu-mpi.cpp
+++ b/Benchmark/particles/particles-simu-mpi.cpp
@@ -531,12 +531,14 @@ void AccuracyTest(){
         , SpCuda([](SpDeviceDataView<ParticlesGroup> paramA) {
             [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
             p2p_inner_gpu<<<10,10,0,SpCudaUtils::GetCurrentStream()>>>(paramA.getRawPtr(), paramA.getRawSize());
+            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
         })
 #endif
 #ifdef SPECX_COMPILE_WITH_HIP
         , SpHip([](SpDeviceDataView<ParticlesGroup> paramA) {
             [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
             hipLaunchKernelGGL(p2p_inner_gpu, dim3(10), dim3(10), 0, SpHipUtils::GetCurrentStream(), paramA.getRawPtr(), paramA.getRawSize());
+            HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
         })
 #endif
     );
@@ -549,12 +551,14 @@ void AccuracyTest(){
         , SpCuda([](SpDeviceDataView<ParticlesGroup> paramA) {
             [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
             p2p_inner_gpu<<<10,10,0,SpCudaUtils::GetCurrentStream()>>>(paramA.getRawPtr(), paramA.getRawSize());
+            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
         })
 #endif
 #ifdef SPECX_COMPILE_WITH_HIP
         , SpHip([](SpDeviceDataView<ParticlesGroup> paramA) {
             [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
             hipLaunchKernelGGL(p2p_inner_gpu, dim3(10), dim3(10), 0, SpHipUtils::GetCurrentStream(), paramA.getRawPtr(), paramA.getRawSize());
+            HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
         })
 #endif
     );
@@ -571,6 +575,7 @@ void AccuracyTest(){
                                paramA.getRawPtr(), paramA.getRawSize());
             p2p_neigh_gpu<<<10,10,0,SpCudaUtils::GetCurrentStream()>>>(paramA.getRawPtr(), paramA.getRawSize(),
                                paramB.getRawPtr(), paramB.getRawSize());
+            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
         })
 #endif
 #ifdef SPECX_COMPILE_WITH_HIP
@@ -581,6 +586,7 @@ void AccuracyTest(){
                                paramA.getRawPtr(), paramA.getRawSize());
             hipLaunchKernelGGL(p2p_neigh_gpu, dim3(10), dim3(10), 0, SpHipUtils::GetCurrentStream(), paramA.getRawPtr(), paramA.getRawSize(),
                                paramB.getRawPtr(), paramB.getRawSize());
+            HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
         })
 #endif
     );
@@ -784,7 +790,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
         scheduler = std::unique_ptr<SpAbstractScheduler>(new SpHeterogeneousPrioScheduler());
     }
     else{
-        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>());
+        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>(nbGpu*SpCudaUtils::GetDefaultNbStreams()));
     }
     SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(), std::move(scheduler));
 #else
@@ -819,6 +825,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
             [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
             p2p_inner_gpu<<<inKernelConfig.nbBlocksInner,inKernelConfig.nbThreadsInner,0,SpCudaUtils::GetCurrentStream()>>>
                 (paramA.getRawPtr(), paramA.getRawSize());
+            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
         })
 #endif
 #ifdef SPECX_COMPILE_WITH_HIP
@@ -826,6 +833,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
             [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
             hipLaunchKernelGGL(p2p_inner_gpu, dim3(inKernelConfig.nbBlocksInner), dim3(inKernelConfig.nbThreadsInner), 0, SpHipUtils::GetCurrentStream(),
                                paramA.getRawPtr(), paramA.getRawSize());
+            HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
         })
 #endif
     );
@@ -847,6 +855,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
                 (paramB.getRawPtr(), paramB.getRawSize(), paramA.getRawPtr(), paramA.getRawSize());
             p2p_neigh_gpu<<<inKernelConfig.nbBlocksOuter,inKernelConfig.nbThreadsOuter,0,SpCudaUtils::GetCurrentStream()>>>
                 (paramA.getRawPtr(), paramA.getRawSize(), paramB.getRawPtr(), paramB.getRawSize());
+            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
         })
 #endif
 #ifdef SPECX_COMPILE_WITH_HIP
@@ -857,6 +866,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
                                paramB.getRawPtr(), paramB.getRawSize(), paramA.getRawPtr(), paramA.getRawSize());
             hipLaunchKernelGGL(p2p_neigh_gpu, dim3(inKernelConfig.nbBlocksOuter), dim3(inKernelConfig.nbThreadsOuter), 0, SpHipUtils::GetCurrentStream(),
                                paramA.getRawPtr(), paramA.getRawSize(), paramB.getRawPtr(), paramB.getRawSize());
+            HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
         })
 #endif
     );
@@ -874,6 +884,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
             [[maybe_unused]] const std::size_t nbParticlesB = paramB.data().getNbParticles();
             p2p_neigh_gpu<<<inKernelConfig.nbBlocksOuter,inKernelConfig.nbThreadsOuter,0,SpCudaUtils::GetCurrentStream()>>>
                 (paramB.getRawPtr(), paramB.getRawSize(), paramA.getRawPtr(), paramA.getRawSize());
+            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
         })
 #endif
 #ifdef SPECX_COMPILE_WITH_HIP
@@ -882,6 +893,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
             [[maybe_unused]] const std::size_t nbParticlesB = paramB.data().getNbParticles();
             hipLaunchKernelGGL(p2p_neigh_gpu, dim3(inKernelConfig.nbBlocksOuter), dim3(inKernelConfig.nbThreadsOuter), 0, SpHipUtils::GetCurrentStream(),
                                paramB.getRawPtr(), paramB.getRawSize(), paramA.getRawPtr(), paramA.getRawSize());
+            HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
         })
 #endif
     );
@@ -897,6 +909,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
             [[maybe_unused]] const std::size_t nbParticlesB = paramB.data().getNbParticles();
             p2p_neigh_gpu<<<inKernelConfig.nbBlocksOuter,inKernelConfig.nbThreadsOuter,0,SpCudaUtils::GetCurrentStream()>>>
                 (paramB.getRawPtr(), paramB.getRawSize(), paramA.getRawPtr(), paramA.getRawSize());
+            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
         })
 #endif
 #ifdef SPECX_COMPILE_WITH_HIP
@@ -905,6 +918,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
             [[maybe_unused]] const std::size_t nbParticlesB = paramB.data().getNbParticles();
             hipLaunchKernelGGL(p2p_neigh_gpu, dim3(inKernelConfig.nbBlocksOuter), dim3(inKernelConfig.nbThreadsOuter), 0, SpHipUtils::GetCurrentStream(),
                                paramB.getRawPtr(), paramB.getRawSize(), paramA.getRawPtr(), paramA.getRawSize());
+            HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
         })
 #endif
     );
diff --git a/Benchmark/particles/particles-simu.cpp b/Benchmark/particles/particles-simu.cpp
index 567490257caf5942b3bc556018fe7a0769d9911d..94adf1d79692458b03f25d82b917d5422da23b6d 100644
--- a/Benchmark/particles/particles-simu.cpp
+++ b/Benchmark/particles/particles-simu.cpp
@@ -465,6 +465,7 @@ void AccuracyTest(){
         , SpCuda([](SpDeviceDataView<ParticlesGroup> paramA) {
             [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
             p2p_inner_gpu<<<10,10,0,SpCudaUtils::GetCurrentStream()>>>(paramA.getRawPtr(), paramA.getRawSize());
+            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
         })
 #endif
 #ifdef SPECX_COMPILE_WITH_HIP
@@ -472,6 +473,7 @@ void AccuracyTest(){
             [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
             hipLaunchKernelGGL(p2p_inner_gpu, 10, 10, 0, SpHipUtils::GetCurrentStream(),
                                paramA.getRawPtr(), paramA.getRawSize());
+            HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
         })
 #endif
     );
@@ -484,6 +486,7 @@ void AccuracyTest(){
         , SpCuda([](SpDeviceDataView<ParticlesGroup> paramA) {
             [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
             p2p_inner_gpu<<<10,10,0,SpCudaUtils::GetCurrentStream()>>>(paramA.getRawPtr(), paramA.getRawSize());
+            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
         })
 #endif
 #ifdef SPECX_COMPILE_WITH_HIP
@@ -491,6 +494,7 @@ void AccuracyTest(){
             [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
             hipLaunchKernelGGL(p2p_inner_gpu, 10, 10, 0, SpHipUtils::GetCurrentStream(),
                                paramA.getRawPtr(), paramA.getRawSize());
+            HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
         })
 #endif
     );
@@ -507,6 +511,7 @@ void AccuracyTest(){
                                paramA.getRawPtr(), paramA.getRawSize());
             p2p_neigh_gpu<<<10,10,0,SpCudaUtils::GetCurrentStream()>>>(paramA.getRawPtr(), paramA.getRawSize(),
                                paramB.getRawPtr(), paramB.getRawSize());
+            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
         })
 #endif
 #ifdef SPECX_COMPILE_WITH_HIP
@@ -519,6 +524,7 @@ void AccuracyTest(){
             hipLaunchKernelGGL(p2p_neigh_gpu, 10, 10, 0, SpHipUtils::GetCurrentStream(),
                                paramA.getRawPtr(), paramA.getRawSize(),
                                paramB.getRawPtr(), paramB.getRawSize());
+            HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
         })
 #endif
     );
@@ -698,7 +704,7 @@ auto GetPriority(const bool usePrioPairs, const int maxInteractions, const int m
 template <int MaxNbDevices, const bool FavorLocality>
 auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPartsPerGroup,
-                const int NbGroups, const int nbGpu, const bool useMultiPrioScheduler,
+                const int NbGroups, const int nbCpu, const int nbGpu, const bool useMultiPrioScheduler,
                 const bool usePrioPairs, const TuneResult& inKernelConfig,
                 const int maxInteractions, const int minInteractions){
 
@@ -719,11 +725,11 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
         scheduler = std::unique_ptr<SpAbstractScheduler>(new SpHeterogeneousPrioScheduler());
     }
     else{
-        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>());
+        scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>(nbGpu*SpCudaUtils::GetDefaultNbStreams()));
     }
-    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(SpUtils::DefaultNumThreads(), nbGpu), std::move(scheduler));
+    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(nbCpu, nbGpu), std::move(scheduler));
 #else
-    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers());
+    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers(nbCpu));
 #endif
 
     std::vector<double> minMaxAvg(3);
@@ -749,6 +755,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
             [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
             p2p_inner_gpu<<<inKernelConfig.nbBlocksInner,inKernelConfig.nbThreadsInner,0,SpCudaUtils::GetCurrentStream()>>>
                 (paramA.getRawPtr(), paramA.getRawSize());
+            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
         })
 #endif
 #ifdef SPECX_COMPILE_WITH_HIP
@@ -756,6 +763,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
             [[maybe_unused]] const std::size_t nbParticles = paramA.data().getNbParticles();
             hipLaunchKernelGGL(p2p_inner_gpu, inKernelConfig.nbBlocksInner, inKernelConfig.nbThreadsInner, 0, SpHipUtils::GetCurrentStream(),
                                paramA.getRawPtr(), paramA.getRawSize());
+            HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
         })
 #endif
     );
@@ -780,6 +788,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
                 (paramB.getRawPtr(), paramB.getRawSize(), paramA.getRawPtr(), paramA.getRawSize());
             p2p_neigh_gpu<<<inKernelConfig.nbBlocksOuter,inKernelConfig.nbThreadsOuter,0,SpCudaUtils::GetCurrentStream()>>>
                 (paramA.getRawPtr(), paramA.getRawSize(), paramB.getRawPtr(), paramB.getRawSize());
+            CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
         })
 #endif
 #ifdef SPECX_COMPILE_WITH_HIP
@@ -790,6 +799,7 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart
                                paramB.getRawPtr(), paramB.getRawSize(), paramA.getRawPtr(), paramA.getRawSize());
             hipLaunchKernelGGL(p2p_neigh_gpu, inKernelConfig.nbBlocksOuter, inKernelConfig.nbThreadsOuter, 0, SpHipUtils::GetCurrentStream(),
                                paramA.getRawPtr(), paramA.getRawSize(), paramB.getRawPtr(), paramB.getRawSize());
+            HIP_ASSERT(hipStreamSynchronize(SpHipUtils::GetCurrentStream()));
         })
 #endif
     );
@@ -907,8 +917,10 @@ void BenchmarkTest(int argc, char** argv, const TuneResult& inKernelConfig){
                                            std::make_tuple(true, false, true)};
 
     for(auto useMultiprioAndPairs: schedPairConf){
-        for(int idxGpu = 0 ; idxGpu <= nbGpus ; ++idxGpu){
+        for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
             for(int idxBlock = MinNbGroups ; idxBlock <= MaxNbGroups ; idxBlock *= 2){
+                const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
+                const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
                 const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                 const bool usePrioPairs = std::get<1>(useMultiprioAndPairs);
                 const bool useLocality = std::get<2>(useMultiprioAndPairs);
@@ -919,10 +931,10 @@ void BenchmarkTest(int argc, char** argv, const TuneResult& inKernelConfig){
                           << " Favor Loc = " << useLocality << std::endl;
                 const auto minMaxAvg = (useLocality ?
                                            BenchCore<8,true>(NbLoops, MinPartsPerGroup,
-                                               MaxPartsPerGroup, idxBlock, idxGpu, useMultiprio, usePrioPairs, inKernelConfig,
+                                               MaxPartsPerGroup, idxBlock, nbCpus, nbGpus, useMultiprio, usePrioPairs, inKernelConfig,
                                                maxInteractions, minInteractions):
                                            BenchCore<8,false>(NbLoops, MinPartsPerGroup,
-                                               MaxPartsPerGroup, idxBlock, idxGpu, useMultiprio, usePrioPairs, inKernelConfig,
+                                               MaxPartsPerGroup, idxBlock, nbCpus, nbGpus, useMultiprio, usePrioPairs, inKernelConfig,
                                                maxInteractions, minInteractions));
                 allDurations.push_back(minMaxAvg);
                 std::cout << " - Min = " << minMaxAvg[0] << " Max = " << minMaxAvg[1] << " Avg = " << minMaxAvg[2] << std::endl;
@@ -937,16 +949,18 @@ void BenchmarkTest(int argc, char** argv, const TuneResult& inKernelConfig){
         return;
     }
 
-    file << "NbGpu,BlockSize,Multiprio,Multiprio,PrioPair,MinDuration,MaxDuration,AvgDuration,TotalTransfer,MaxTransfer,DeviceToHostTransfer,HostToDeviceTransfer,DeviceToDeviceTransfer" << std::endl;
+    file << "NbCpu,NbGpu,BlockSize,Multiprio,Multiprio,PrioPair,MinDuration,MaxDuration,AvgDuration,TotalTransfer,MaxTransfer,DeviceToHostTransfer,HostToDeviceTransfer,DeviceToDeviceTransfer" << std::endl;
 
     int idxDuration = 0;
     for(auto useMultiprioAndPairs: schedPairConf){
-        for(int idxGpu = 0 ; idxGpu <= nbGpus ; ++idxGpu){
+        for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
             for(int idxBlock = MinNbGroups ; idxBlock <= MaxNbGroups ; idxBlock *= 2){
+                const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
+                const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
                 const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                 const bool usePrioPairs = std::get<1>(useMultiprioAndPairs);
                 const bool useLocality = std::get<2>(useMultiprioAndPairs);
-                file << idxGpu << "," << idxBlock << ","
+                file << nbCpus << "," << nbGpus << "," << idxBlock << ","
                      << (useMultiprio?"TRUE":"FALSE") << ","
                      << (usePrioPairs?"TRUE":"FALSE") << ","
                      << (useLocality?"TRUE":"FALSE") << ","
diff --git a/Src/Cuda/SpCudaMemManager.cpp b/Src/Cuda/SpCudaMemManager.cpp
index a9a72be3147a018a7c3c786e4894aa2841a78bbb..0264a950a1498e52315c1531fab7760dd4965997 100644
--- a/Src/Cuda/SpCudaMemManager.cpp
+++ b/Src/Cuda/SpCudaMemManager.cpp
@@ -1,4 +1,3 @@
 #include "SpCudaMemManager.hpp"
 
 std::vector<SpCudaManager::SpCudaMemManager> SpCudaManager::Managers = SpCudaManager::BuildManagers();
-std::mutex SpCudaManager::CudaMutex;
diff --git a/Src/Cuda/SpCudaMemManager.hpp b/Src/Cuda/SpCudaMemManager.hpp
index 22e7153a1e516a300faee7575bde4d8354605f6a..6820ed700965228615a8d1acff609e623693ad5d 100644
--- a/Src/Cuda/SpCudaMemManager.hpp
+++ b/Src/Cuda/SpCudaMemManager.hpp
@@ -33,18 +33,8 @@ class SpCudaManager {
         int useCount = 0;
     };
 
-    static std::mutex CudaMutex;
-
 public:
-    static void Lock(){// Do finer lock TODO
-        CudaMutex.lock();
-    }
-
-    static void Unlock(){
-        CudaMutex.unlock();
-    }
-
     class SpCudaMemManager : public SpAbstractDeviceMemManager {
         const int id;
         std::unordered_map<void*, HandleDescr> handles;
@@ -62,11 +52,22 @@ public:
         size_t hostToDeviceTransfers;
         size_t deviceToDeviceTransfers;
 
+        std::unique_ptr<std::mutex> cudaMutex;
+
     public:
+        void lock(){// Do finer lock TODO
+            cudaMutex->lock();
+        }
+
+        void unlock(){
+            cudaMutex->unlock();
+        }
+
         explicit SpCudaMemManager(const int inId)
             : id(inId), deferCopier(new SpConsumerThread),
               remainingMemory(0), totalAllocatedMemory(0), currentAllocatedMemory(0), maxAllocatedMemory(0),
-              deviceToHostTransfers(0), hostToDeviceTransfers(0), deviceToDeviceTransfers(0){
+              deviceToHostTransfers(0), hostToDeviceTransfers(0), deviceToDeviceTransfers(0),
+              cudaMutex(new std::mutex){
 
             deferCopier->submitJobAndWait([this]{
                 SpCudaUtils::UseDevice(id);
@@ -106,7 +107,7 @@ public:
         SpCudaMemManager(SpCudaMemManager&&) = default;
 
         SpCudaMemManager& operator=(const SpCudaMemManager&) = delete;
-        SpCudaMemManager& operator=(SpCudaMemManager&&) = delete;
+        SpCudaMemManager& operator=(SpCudaMemManager&&) = default;
 
         void incrDeviceDataUseCount(void* key) override {
             assert(handles.find(key) != handles.end());
@@ -123,7 +124,8 @@ public:
             handles[key].useCount -= 1;
         }
 
-        bool hasBeenRemoved(void* key){
+        bool hasBeenRemovedLocked(void* key){
+            const std::lock_guard<std::mutex> lock(*cudaMutex);
             return (handles.find(key) == handles.end());
         }
 
@@ -164,10 +166,14 @@ public:
 
             if(SpCudaUtils::CurrentWorkerIsCuda() && SpCudaUtils::CurrentCudaId() == id){
                 CUDA_ASSERT(cudaMallocAsync(&data.ptr, inByteSize, SpCudaUtils::GetCurrentStream()));
+                // Should not be needed, but remove it and it will fail
+                // CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
             }
             else{
                 deferCopier->submitJobAndWait([&,this]{
                     CUDA_ASSERT(cudaMallocAsync(&data.ptr, inByteSize, extraStream));
+                    // Should not be needed, but remove it and it will fail
+                    // CUDA_ASSERT(cudaStreamSynchronize(extraStream));
                 });
             }
 
@@ -206,11 +212,11 @@ public:
             handles.erase(key);
 
             if(SpCudaUtils::CurrentWorkerIsCuda() && SpCudaUtils::CurrentCudaId() == id){
-                CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
+                // CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
            }
            else{
                deferCopier->submitJobAndWait([&,this]{
-                    CUDA_ASSERT(cudaStreamSynchronize(extraStream));
+                    // CUDA_ASSERT(cudaStreamSynchronize(extraStream));
                });
            }
 
@@ -222,10 +228,14 @@ public:
                    && allBlocks[inPtrDev].size <= inByteSize);
            if(SpCudaUtils::CurrentWorkerIsCuda() && SpCudaUtils::CurrentCudaId() == id){
                CUDA_ASSERT(cudaMemsetAsync(inPtrDev, val, inByteSize, SpCudaUtils::GetCurrentStream()));
+                // Should not be needed, but remove it and it will fail
+                // CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
            }
            else{
                deferCopier->submitJobAndWait([&,this]{
                    CUDA_ASSERT(cudaMemsetAsync(inPtrDev, val, inByteSize, extraStream));
+                    // Should not be needed, but remove it and it will fail
+                    // CUDA_ASSERT(cudaStreamSynchronize(extraStream));
                });
            }
        }
@@ -237,13 +247,18 @@ public:
            if(SpCudaUtils::CurrentWorkerIsCuda() && SpCudaUtils::CurrentCudaId() == id){
                CUDA_ASSERT(cudaMemcpyAsync(inPtrDev, inPtrHost, inByteSize, cudaMemcpyHostToDevice,
                                            SpCudaUtils::GetCurrentStream()));
+                // Should not be needed, but remove it and it will fail
+                // CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
            }
            else{
                deferCopier->submitJobAndWait([&,this]{
                    CUDA_ASSERT(cudaMemcpyAsync(inPtrDev, inPtrHost, inByteSize, cudaMemcpyHostToDevice,
                                                extraStream));
+                    // Should not be needed, but remove it and it will fail
+                    // CUDA_ASSERT(cudaStreamSynchronize(extraStream));
                });
            }
+            hostToDeviceTransfers += inByteSize;
        }
 
@@ -254,14 +269,17 @@ public:
            if(SpCudaUtils::CurrentWorkerIsCuda() && SpCudaUtils::CurrentCudaId() == id){
                CUDA_ASSERT(cudaMemcpyAsync(inPtrHost, inPtrDev, inByteSize, cudaMemcpyDeviceToHost,
                                            SpCudaUtils::GetCurrentStream()));
+                // Should not be needed, but remove it and it will fail
+                // CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
            }
            else{
                deferCopier->submitJobAndWait([&,this]{
                    CUDA_ASSERT(cudaMemcpyAsync(inPtrHost, inPtrDev, inByteSize, cudaMemcpyDeviceToHost,
                                                extraStream));
-                    CUDA_ASSERT(cudaStreamSynchronize(extraStream));
+                    // CUDA_ASSERT(cudaStreamSynchronize(extraStream));
                });
            }
+            deviceToHostTransfers += inByteSize;
        }
 
@@ -276,14 +294,17 @@ public:
            if(SpCudaUtils::CurrentWorkerIsCuda() && SpCudaUtils::CurrentCudaId() == id){
                CUDA_ASSERT(cudaMemcpyPeerAsync(inPtrDevDest, id, inPtrDevSrc, srcId, inByteSize,
                                                SpCudaUtils::GetCurrentStream()));
+                // Should not be needed, but remove it and it will fail
+                // CUDA_ASSERT(cudaStreamSynchronize(SpCudaUtils::GetCurrentStream()));
            }
            else{
                deferCopier->submitJobAndWait([&,this]{
                    CUDA_ASSERT(cudaMemcpyPeerAsync(inPtrDevDest, id, inPtrDevSrc, srcId, inByteSize,
                                                    extraStream));
-                    CUDA_ASSERT(cudaStreamSynchronize(extraStream));
+                    // CUDA_ASSERT(cudaStreamSynchronize(extraStream));
                });
            }
+            deviceToDeviceTransfers += inByteSize;
        }
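Note: the global SpCudaManager::CudaMutex is gone; each SpCudaMemManager now owns its own mutex through std::unique_ptr<std::mutex>. Since std::mutex is neither copyable nor movable, the indirection is what lets operator=(SpCudaMemManager&&) go back to = default. A minimal sketch of that idiom (Manager is a stand-in, not the real class):

    #include <memory>
    #include <mutex>
    #include <vector>

    // Holding the mutex through a unique_ptr keeps the owning class movable:
    // moving transfers ownership of the pointer, not the mutex itself.
    class Manager {
        std::unique_ptr<std::mutex> mtx;
    public:
        Manager() : mtx(new std::mutex) {}
        Manager(Manager&&) = default;
        Manager& operator=(Manager&&) = default;
        void lock()   { mtx->lock(); }
        void unlock() { mtx->unlock(); }
    };

    int main(){
        std::vector<Manager> managers;
        managers.push_back(Manager{});   // requires move support
        managers.front().lock();
        managers.front().unlock();
        return 0;
    }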
diff --git a/Src/Data/SpDataHandle.hpp b/Src/Data/SpDataHandle.hpp
index 218c8aa10641decd6165e48075c9c595eef13a5d..0ec681a557021e52e4cf6af071f8f31ceddb1070 100644
--- a/Src/Data/SpDataHandle.hpp
+++ b/Src/Data/SpDataHandle.hpp
@@ -109,7 +109,9 @@ public:
         assert(cpuDataOk == true);
         for(int idxGpu = 0 ; idxGpu < int(copies.size()) ; ++idxGpu){
             if(copies[idxGpu].ptr){
+                memManagers[idxGpu].lock();
                 deviceDataOp->freeGroup(memManagers[idxGpu], this, ptrToData);
+                memManagers[idxGpu].unlock();
                 copies[idxGpu] = SpDeviceData();
             }
         }
@@ -122,7 +124,9 @@ public:
         cpuDataOk = false;
         for(int idxGpu = 0 ; idxGpu < int(copies.size()) ; ++idxGpu){
             if(idxGpu != gpuId && copies[idxGpu].ptr){
+                memManagers[idxGpu].lock();
                 deviceDataOp->freeGroup(memManagers[idxGpu], this, ptrToData);
+                memManagers[idxGpu].unlock();
                 copies[idxGpu] = SpDeviceData();
             }
         }
@@ -148,7 +152,9 @@ public:
         assert(gpuId < memManagers.size() && gpuId < int(copies.size()));
         assert(copies[gpuId].ptr);
         syncCpuDataIfNeeded(memManagers);
+        //memManagers[idxGpu].lock();
         deviceDataOp->freeGroup(memManagers[gpuId], this, ptrToData);
+        //memManagers[idxGpu].unlock();
         copies[gpuId] = SpDeviceData();
     }
 
@@ -160,11 +166,12 @@ public:
 #elif defined(SPECX_COMPILE_WITH_HIP)
         assert(gpuId < SpConfig::SpMaxNbHips);
 #endif
-        if(copies[gpuId].ptr == nullptr || memManagers[gpuId].hasBeenRemoved(this)){
+        if(copies[gpuId].ptr == nullptr || memManagers[gpuId].hasBeenRemovedLocked(this)){
             copies[gpuId].ptr = nullptr;
             auto idxGpuSrcIter = std::find_if(copies.begin(), copies.end(),
                                               [](auto iter) -> bool { return iter.ptr != nullptr; });
+            memManagers[gpuId].lock();
             if(!deviceDataOp->hasEnoughSpace(memManagers[gpuId], this, ptrToData)){
                 auto candidates = deviceDataOp->candidatesToBeRemoved(memManagers[gpuId], this, ptrToData);
                 for(auto toRemove : candidates){
@@ -175,6 +182,8 @@ public:
                 }
             }
             copies[gpuId] = deviceDataOp->allocate(memManagers[gpuId], this, ptrToData);
+            memManagers[gpuId].unlock();
+
             if(idxGpuSrcIter != copies.end()){
                 const int otherGpu = int(std::distance(copies.begin(), idxGpuSrcIter));
                 if(memManagers[gpuId].isConnectedTo(otherGpu)){
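Note: SpDataHandle now brackets freeGroup and allocate with the owning manager's lock()/unlock() instead of the old global lock. The bracketing in the patch is manual; an exception-safe RAII equivalent with std::lock_guard, using stand-ins for the manager and the deviceDataOp call, might look like this:

    #include <mutex>

    struct Manager {
        std::mutex mtx;
    };

    // Stand-in for deviceDataOp->freeGroup(manager, handle, ptr).
    void freeGroup(Manager&) {}

    void releaseCopy(Manager& manager){
        // Equivalent to the manual manager.lock(); ...; manager.unlock()
        // bracketing in the diff, but the guard releases the mutex on every
        // exit path, including exceptions.
        const std::lock_guard<std::mutex> guard(manager.mtx);
        freeGroup(manager);
    }

    int main(){
        Manager m;
        releaseCopy(m);
        return 0;
    }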
diff --git a/Src/Scheduler/SpMultiPrioScheduler.hpp b/Src/Scheduler/SpMultiPrioScheduler.hpp
index 066e702b8fb1a634a9252768d4d0484108dc3e6a..31bd59ae141883da021abae03b2229d374f198f6 100644
--- a/Src/Scheduler/SpMultiPrioScheduler.hpp
+++ b/Src/Scheduler/SpMultiPrioScheduler.hpp
@@ -42,6 +42,8 @@ class SpMultiPrioScheduler : public SpAbstractScheduler{
     std::priority_queue<TaskWrapper, small_vector<TaskWrapper>, ComparePrio > taskQueuesCPU;
 
     std::atomic<int> nbTasks[int(SpWorkerTypes::Type::NB_WORKER_TYPES)];
+    std::atomic<int> nbCommonTasks;
+    int nbGpuWorkers;
 
     void decrTaskCounter(SpAbstractTask* task){
         const bool hasCpuCallable = task->hasCallableOfType(SpCallableType::CPU);
@@ -60,13 +62,25 @@ class SpMultiPrioScheduler : public SpAbstractScheduler{
             nbTasks[int(SpWorkerTypes::Type::HIP_WORKER)] -= 1;
 #endif
         }
+        if(hasCpuCallable && hasGpuCallable){
+            nbCommonTasks -= 1;
+        }
+    }
+
+    bool CpuCanTakeTask() const {
+#if defined(SPECX_COMPILE_WITH_CUDA) || defined(SPECX_COMPILE_WITH_HIP)
+        return nbCommonTasks > 80 * nbGpuWorkers
+                || nbTasks[int(SpWorkerTypes::Type::CPU_WORKER)] != nbCommonTasks;
+#endif
+        return true;
     }
 
 public:
-    explicit SpMultiPrioScheduler() {
+    explicit SpMultiPrioScheduler(const int inNbGpuWorkers) : nbGpuWorkers(inNbGpuWorkers) {
         for(int idx = 0 ; idx < int(SpWorkerTypes::Type::NB_WORKER_TYPES) ; ++idx){
             nbTasks[idx] = 0;
         }
+        nbCommonTasks = 0;
     }
 
     // No copy or move
@@ -106,6 +120,9 @@ public:
             nbTasks[int(SpWorkerTypes::Type::HIP_WORKER)] += 1;
 #endif
         }
+        if(hasCpuCallable && hasGpuCallable){
+            nbCommonTasks += 1;
+        }
 
         std::shared_ptr<std::atomic<bool>> taken = std::make_shared<std::atomic<bool>>(false);
 
@@ -145,17 +162,19 @@ public:
         assert(deviceId == -1 || wt != SpWorkerTypes::Type::CPU_WORKER);
 
         if(deviceId == -1){
-            mutexReadyTasksCPU.lock();
-            while(taskQueuesCPU.size()){
-                TaskWrapper taskWrapper = taskQueuesCPU.top();
-                taskQueuesCPU.pop();
-                if(taskWrapper.taken->exchange(true) == false){
-                    mutexReadyTasksCPU.unlock();
-                    decrTaskCounter(taskWrapper.task);
-                    return taskWrapper.task;
+            if(CpuCanTakeTask()){
+                mutexReadyTasksCPU.lock();
+                while(taskQueuesCPU.size()){
+                    TaskWrapper taskWrapper = taskQueuesCPU.top();
+                    taskQueuesCPU.pop();
+                    if(taskWrapper.taken->exchange(true) == false){
+                        mutexReadyTasksCPU.unlock();
+                        decrTaskCounter(taskWrapper.task);
+                        return taskWrapper.task;
+                    }
                 }
+                mutexReadyTasksCPU.unlock();
             }
-            mutexReadyTasksCPU.unlock();
         }
         else{
             mutexReadyTasksGPU[deviceId].lock();
diff --git a/Src/Task/SpTask.hpp b/Src/Task/SpTask.hpp
index d82f1002482d020e17b6edcae228f591267639ad..30e54ad3baa83f6195cf1f57b0c7ab4c00eb56c3 100644
--- a/Src/Task/SpTask.hpp
+++ b/Src/Task/SpTask.hpp
@@ -101,7 +101,6 @@ class SpTask : public SpAbstractTaskWithReturn<RetType> {
 
     void preTaskExecution([[maybe_unused]] SpCallableType ct) final {
 #ifdef SPECX_COMPILE_WITH_CUDA
-        SpCudaManager::Lock();
         std::size_t extraHandlesOffset = 0;
 
         SpUtils::foreach_in_tuple(
@@ -143,7 +142,9 @@ class SpTask : public SpAbstractTaskWithReturn<RetType> {
                     else{
                         SpCudaUtils::SyncCurrentStream();
                     }
+                    SpCudaManager::Managers[cudaId].lock();
                     SpCudaManager::Managers[cudaId].incrDeviceDataUseCount(h);
+                    SpCudaManager::Managers[cudaId].unlock();
                     std::get<index>(cudaCallableArgs).reset(dataObj.ptr, dataObj.size);
                     if constexpr(SpDeviceDataUtils::class_has_setDataDescriptor<decltype(std::get<index>(cudaCallableArgs))>::value){
                         std::get<index>(cudaCallableArgs).setDataDescriptor(dataObj.viewPtr);
@@ -160,9 +161,7 @@ class SpTask : public SpAbstractTaskWithReturn<RetType> {
                 }
             }, this->getDataDependencyTupleRef());
 
-        SpCudaManager::Unlock();
 #elif defined(SPECX_COMPILE_WITH_HIP) // SPECX_COMPILE_WITH_CUDA
-        SpHipManager::Lock();
         std::size_t extraHandlesOffset = 0;
 
         SpUtils::foreach_in_tuple(
@@ -221,7 +220,6 @@ class SpTask : public SpAbstractTaskWithReturn<RetType> {
                 }
             }, this->getDataDependencyTupleRef());
 
-        SpHipManager::Unlock();
 #endif // SPECX_COMPILE_WITH_HIP
     }
 
@@ -266,13 +264,11 @@ class SpTask : public SpAbstractTaskWithReturn<RetType> {
             // Syn only if we the task was on GPU
             SpCudaUtils::SyncCurrentStream();
         }
-        SpCudaManager::Lock();
 #elif defined(SPECX_COMPILE_WITH_HIP)
         if(ct == SpCallableType::HIP){
             // Syn only if we the task was on GPU
             SpHipUtils::SyncCurrentStream();
         }
-        SpHipManager::Lock();
 #endif // SPECX_COMPILE_WITH_HIP
 
         std::size_t extraHandlesOffset = 0;
@@ -298,16 +294,16 @@ class SpTask : public SpAbstractTaskWithReturn<RetType> {
 #if defined(SPECX_COMPILE_WITH_CUDA)
                 else if(ct == SpCallableType::CUDA){
                     const int cudaId = SpCudaUtils::CurrentCudaId();
-                    h->lock();
+                    SpCudaManager::Managers[cudaId].lock();
                     SpCudaManager::Managers[cudaId].decrDeviceDataUseCount(h);
-                    h->unlock();
+                    SpCudaManager::Managers[cudaId].unlock();
                 }
 #elif defined(SPECX_COMPILE_WITH_HIP)
                 else if(ct == SpCallableType::HIP){
                     const int hipId = SpHipUtils::CurrentHipId();
-                    h->lock();
+                    SpHipManager::Managers[hipId].lock();
                     SpHipManager::Managers[hipId].decrDeviceDataUseCount(h);
-                    h->unlock();
+                    SpHipManager::Managers[hipId].unlock();
                 }
 #endif
                 else{
@@ -315,12 +311,6 @@ class SpTask : public SpAbstractTaskWithReturn<RetType> {
                 }
             }
         }, this->getDataDependencyTupleRef());
-
-#if defined(SPECX_COMPILE_WITH_CUDA)
-    SpCudaManager::Unlock();
-#elif defined(SPECX_COMPILE_WITH_HIP)
-    SpHipManager::Unlock();
-#endif
 #endif // SPECX_COMPILE_WITH_CUDA SPECX_COMPILE_WITH_HIP
     }
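Note: the new CpuCanTakeTask() gate only lets CPU workers pop tasks that both CPU and GPU can execute while the GPU backlog is deep (more than 80 such tasks per GPU worker), or when some ready tasks are CPU-only anyway; nbCommonTasks and nbGpuWorkers exist to feed that test. A standalone model of the rule — the threshold 80 is taken from the patch, everything else is a stand-in:

    #include <atomic>
    #include <iostream>

    // Models the CpuCanTakeTask() heuristic added to SpMultiPrioScheduler.
    struct ThrottleModel {
        std::atomic<int> nbCpuTasks{0};    // ready tasks with a CPU callable
        std::atomic<int> nbCommonTasks{0}; // ready tasks runnable on CPU and GPU
        int nbGpuWorkers;

        explicit ThrottleModel(int inNbGpuWorkers) : nbGpuWorkers(inNbGpuWorkers) {}

        bool cpuCanTakeTask() const {
            // Let a CPU worker steal a CPU+GPU task only if the GPUs have more
            // than 80 such tasks per worker queued, or if some ready tasks are
            // CPU-only (nbCpuTasks != nbCommonTasks).
            return nbCommonTasks > 80 * nbGpuWorkers
                || nbCpuTasks != nbCommonTasks;
        }
    };

    int main(){
        ThrottleModel model(4); // e.g. 1 GPU x 4 streams
        model.nbCpuTasks = model.nbCommonTasks = 100;
        std::cout << model.cpuCanTakeTask() << "\n"; // 0: keep them for the GPUs
        model.nbCpuTasks = model.nbCommonTasks = 400;
        std::cout << model.cpuCanTakeTask() << "\n"; // 1: backlog deep enough
        return 0;
    }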