diff --git a/Benchmark/axpy/axpy.cpp b/Benchmark/axpy/axpy.cpp
index 448bd33c1cc5116a96f86f6097efe713bc08ee2e..58f1db0e2f2258d1a9adf09d51d12d847a24de43 100644
--- a/Benchmark/axpy/axpy.cpp
+++ b/Benchmark/axpy/axpy.cpp
@@ -253,8 +253,8 @@ int main(int argc, char** argv){
 
     for(auto useMultiprioAndPairs: schedPairConf){
         for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
-            const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
             const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+            const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus);
             const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
             const bool usePrioPairs = std::get<1>(useMultiprioAndPairs);
             const bool useLocality = std::get<2>(useMultiprioAndPairs);
@@ -291,8 +291,8 @@ int main(int argc, char** argv){
     for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
         for(int idxNbBlocks = minnbblocks ; idxNbBlocks <= maxnbblocks ; idxNbBlocks *= 2){
             for(int idxSize = minblocksize ; idxSize <= maxblocksize ; idxSize *= 2){
-                const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
                 const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+                const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus);
                 file << nbCpus << "," << nbGpus << ","
                      << idxNbBlocks << "," << idxSize << ","
                      << (useMultiprio?"TRUE":"FALSE") << ","
diff --git a/Benchmark/cholesky_gemm/CMakeLists.txt b/Benchmark/cholesky_gemm/CMakeLists.txt
index 55f4cc58a498d1dc48b602591d176a5a2150bb6c..2db323656e303f80435578c793a4ca84069365ff 100644
--- a/Benchmark/cholesky_gemm/CMakeLists.txt
+++ b/Benchmark/cholesky_gemm/CMakeLists.txt
@@ -14,12 +14,28 @@ find_package(BLAS)
 find_package(LAPACK)
 
 if(BLAS_FOUND AND LAPACK_FOUND)
-    # If libs is related to MKL
-    if(${BLAS_LIBRARIES} MATCHES "mkl")
-        # We replace "intel_thread" by "sequential"
-        string(REPLACE "intel_thread" "sequential" BLAS_LIBRARIES ${BLAS_LIBRARIES})
-        string(REPLACE "intel_thread" "sequential" LAPACK_LIBRARIES ${LAPACK_LIBRARIES})
-    endif()
+    # If libs are related to MKL
+    if("${BLAS_LIBRARIES}" MATCHES "mkl")
+        # Create new variables to store the modified lists
+        set(MODIFIED_BLAS_LIBRARIES "")
+        set(MODIFIED_LAPACK_LIBRARIES "")
+
+        # Loop over each BLAS library and perform the replacement
+        foreach(lib ${BLAS_LIBRARIES})
+            string(REPLACE "intel_thread" "sequential" modified_lib ${lib})
+            list(APPEND MODIFIED_BLAS_LIBRARIES ${modified_lib})
+        endforeach()
+
+        # Loop over each LAPACK library and perform the replacement
+        foreach(lib ${LAPACK_LIBRARIES})
+            string(REPLACE "intel_thread" "sequential" modified_lib ${lib})
+            list(APPEND MODIFIED_LAPACK_LIBRARIES ${modified_lib})
+        endforeach()
+
+        # Replace original variables with the modified lists
+        set(BLAS_LIBRARIES ${MODIFIED_BLAS_LIBRARIES})
+        set(LAPACK_LIBRARIES ${MODIFIED_LAPACK_LIBRARIES})
+    endif()
 
     if($ENV{VERBOSE})
         MESSAGE(STATUS "Benchmark CHOLESKY_GEMM -- BLAS_LIBRARIES : ${BLAS_LIBRARIES}")
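The recurring change in these benchmark loops reserves one host thread per GPU worker instead of oversubscribing the machine. A minimal sketch of the arithmetic (the helper name is hypothetical, not part of the patch):

```cpp
#include <algorithm>

// Hypothetical helper illustrating the new CPU-worker budget: each GPU
// worker is backed by a dedicated host thread, so it is subtracted from the
// available threads, and std::max keeps at least one CPU worker even when
// there are more GPU workers than hardware threads.
int cpuWorkerBudget(int defaultNumThreads, int nbGpuWorkers){
    return std::max(1, defaultNumThreads - nbGpuWorkers);
}
// e.g. 8 threads, 2 GPUs -> 6 CPU workers; 2 threads, 4 GPUs -> 1 CPU worker.
```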
diff --git a/Benchmark/cholesky_gemm/cholesky-mpi.cpp b/Benchmark/cholesky_gemm/cholesky-mpi.cpp
index 203986780f29affb472033ea5017d621dd41b77c..2236bfa43ab3cd11b18ec70cfb2bace903af3603 100644
--- a/Benchmark/cholesky_gemm/cholesky-mpi.cpp
+++ b/Benchmark/cholesky_gemm/cholesky-mpi.cpp
@@ -71,7 +71,7 @@ auto choleskyFactorization(const int NbLoops, SpBlas::Block blocksInput[], const
     else{
         scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>(nbGpu*SpCudaUtils::GetDefaultNbStreams()));
     }
-    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(SpUtils::DefaultNumThreads(), nbGpu), std::move(scheduler));
+    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(std::max(1 , SpUtils::DefaultNumThreads()-nbGpu), nbGpu), std::move(scheduler));
 #else
     SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers());
 #endif
diff --git a/Benchmark/cholesky_gemm/cholesky.cpp b/Benchmark/cholesky_gemm/cholesky.cpp
index 6e85cb174a53327b43766a09401c95f479b6ca43..e514b44ae641cfe38cb81ef9c4505a4acbefe306 100644
--- a/Benchmark/cholesky_gemm/cholesky.cpp
+++ b/Benchmark/cholesky_gemm/cholesky.cpp
@@ -437,8 +437,8 @@ int main(int argc, char** argv){
     for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
         for(int BlockSize = MinBlockSize ; BlockSize <= MaxBlockSize ; BlockSize *= 2){
             for(int MatrixSize = MinMatrixSize ; MatrixSize <= MaxMatrixSize ; MatrixSize *= 2){
-                const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
                 const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+                const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus);
 
                 const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                 const bool usePrioPairs = std::get<1>(useMultiprioAndPairs);
@@ -509,8 +509,8 @@ int main(int argc, char** argv){
     for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
         for(int BlockSize = MinBlockSize ; BlockSize <= MaxBlockSize ; BlockSize *= 2){
             for(int MatrixSize = MinMatrixSize ; MatrixSize <= MaxMatrixSize ; MatrixSize *= 2){
-                const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
                 const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+                const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus);
 
                 const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                 const bool usePrioPairs = std::get<1>(useMultiprioAndPairs);
diff --git a/Benchmark/cholesky_gemm/gemm-mpi.cpp b/Benchmark/cholesky_gemm/gemm-mpi.cpp
index d2e01e82b04b012688257ed6c6da964095e9dd3e..8632cc3df2c1b9cf2c379d0a3e4e29f7145572b7 100644
--- a/Benchmark/cholesky_gemm/gemm-mpi.cpp
+++ b/Benchmark/cholesky_gemm/gemm-mpi.cpp
@@ -67,7 +67,7 @@ auto gemm(const int NbLoops, SpBlas::Block blocksC[], const SpBlas::Block blocks
     else{
         scheduler = std::unique_ptr<SpAbstractScheduler>(new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>(nbGpu*SpCudaUtils::GetDefaultNbStreams()));
     }
-    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(SpUtils::DefaultNumThreads(), nbGpu), std::move(scheduler));
+    SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(std::max(1, SpUtils::DefaultNumThreads()-nbGpu), nbGpu), std::move(scheduler));
 #else
     SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuWorkers());
 #endif
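Both MPI benchmarks now assemble their compute engine the same way. A condensed sketch of the pattern, using only the Specx types that appear in the files above (`MaxNbDevices` and `FavorLocality` are template parameters of the surrounding functions, so this fragment is illustrative rather than standalone):

```cpp
// Sketch only: engine construction when CUDA support is compiled in.
const int nbGpu = SpCudaUtils::GetNbDevices();
auto scheduler = std::unique_ptr<SpAbstractScheduler>(
    new SpMultiPrioScheduler<MaxNbDevices,FavorLocality>(nbGpu*SpCudaUtils::GetDefaultNbStreams()));
SpComputeEngine ce(SpWorkerTeamBuilder::TeamOfCpuGpuWorkers(
                       std::max(1, SpUtils::DefaultNumThreads()-nbGpu), nbGpu),
                   std::move(scheduler));
```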
diff --git a/Benchmark/cholesky_gemm/gemm.cpp b/Benchmark/cholesky_gemm/gemm.cpp
index 49f7b34db0b54797240c18637808922e690385ab..27e9c07b2a559816f7f3406060b56cefbc213202 100644
--- a/Benchmark/cholesky_gemm/gemm.cpp
+++ b/Benchmark/cholesky_gemm/gemm.cpp
@@ -269,8 +269,8 @@ int main(int argc, char** argv){
     for(int BlockSize = MinBlockSize ; BlockSize <= MaxBlockSize ; BlockSize *= 2){
         for(int MatrixSize = MinMatrixSize ; MatrixSize <= MaxMatrixSize ; MatrixSize *= 2){
             for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
-                const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
                 const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+                const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus);
 
                 const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                 const bool useLocality = std::get<1>(useMultiprioAndPairs);
@@ -342,8 +342,8 @@ int main(int argc, char** argv){
     for(int BlockSize = MinBlockSize ; BlockSize <= MaxBlockSize ; BlockSize *= 2){
         for(int MatrixSize = MinMatrixSize ; MatrixSize <= MaxMatrixSize ; MatrixSize *= 2){
             for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
-                const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
                 const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+                const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus);
 
                 const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                 const bool useLocality = std::get<1>(useMultiprioAndPairs);
diff --git a/Benchmark/particles/particles-simu.cpp b/Benchmark/particles/particles-simu.cpp
index 94adf1d79692458b03f25d82b917d5422da23b6d..dafb4ecaa6ca4280b10219904bcf5415d3acf46a 100644
--- a/Benchmark/particles/particles-simu.cpp
+++ b/Benchmark/particles/particles-simu.cpp
@@ -919,8 +919,8 @@ void BenchmarkTest(int argc, char** argv, const TuneResult& inKernelConfig){
     for(auto useMultiprioAndPairs: schedPairConf){
         for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
             for(int idxBlock = MinNbGroups ; idxBlock <= MaxNbGroups ; idxBlock *= 2){
-                const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
                 const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+                const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus);
                 const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                 const bool usePrioPairs = std::get<1>(useMultiprioAndPairs);
                 const bool useLocality = std::get<2>(useMultiprioAndPairs);
@@ -954,8 +954,8 @@ void BenchmarkTest(int argc, char** argv, const TuneResult& inKernelConfig){
     for(auto useMultiprioAndPairs: schedPairConf){
         for(int idxGpu = -1 ; idxGpu <= nbGpus ; ++idxGpu){
             for(int idxBlock = MinNbGroups ; idxBlock <= MaxNbGroups ; idxBlock *= 2){
-                const int nbCpus = (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads());
                 const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+                const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus);
                 const bool useMultiprio = std::get<0>(useMultiprioAndPairs);
                 const bool usePrioPairs = std::get<1>(useMultiprioAndPairs);
                 const bool useLocality = std::get<2>(useMultiprioAndPairs);
diff --git a/Benchmark/particles_openmp/CMakeLists.txt b/Benchmark/particles_openmp/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c6373f41f304cfbfb36d782b93a7840e73f1987d
--- /dev/null
+++ b/Benchmark/particles_openmp/CMakeLists.txt
@@ -0,0 +1,65 @@
+###########################################################################
+# SPECX - Berenger Bramas MPCDF - 2016
+# Under LGPL Licence; please read the LICENCE file.
+###########################################################################
+project(BENCHMARK_CP_SPECX_OMP CXX)
+
+ADD_DEFINITIONS(${SPECX_CXX_FLAGS})
+
+if($ENV{VERBOSE})
+    MESSAGE(STATUS "Benchmark -- SPECX_CXX_FLAGS : ${SPECX_CXX_FLAGS}")
+endif()
+
+
+find_package(OpenMP)
+
+if(OpenMP_FOUND)
+    list(APPEND BENCHMARK_CP_SPECX_OMP_LIBRARIES OpenMP::OpenMP_CXX)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+
+    # Find all code files
+    file(
+        GLOB_RECURSE
+        source_tests_files
+        ./*.cpp
+    )
+
+
+    # Adding the project sources dir as an include dir
+    INCLUDE_DIRECTORIES(
+         ${SPECX_BINARY_DIR}/Src
+         ${SPECX_SOURCE_DIR}/Src
+    )
+
+
+    # Add execs - 1 cpp = 1 exec
+    foreach(exec ${source_tests_files})
+        if(${SPECX_COMPILE_WITH_MPI} OR NOT ${exec} MATCHES "-mpi\\.")
+            get_filename_component(
+                execname ${exec}
+                NAME_WE
+            )
+
+            if($ENV{VERBOSE})
+                MESSAGE(STATUS "Benchmark -- Add ${execname}")
+            endif()
+
+            if(SPECX_COMPILE_WITH_CUDA)
+                set_source_files_properties(${exec} PROPERTIES LANGUAGE CUDA)
+            endif()
+            add_executable(
+                ${execname}
+                ${exec}
+            )
+
+            target_link_libraries(
+                ${execname}
+                specx
+                ${SPECX_LIBRARIES}
+                ${BENCHMARK_CP_SPECX_OMP_LIBRARIES}
+            )
+        endif()
+    endforeach(exec)
+else(OpenMP_FOUND)
+    MESSAGE(WARNING "OpenMP not found, the particle simulation benchmark is disabled")
+endif(OpenMP_FOUND)
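Since the executables are only built when `find_package(OpenMP)` succeeds, a quick runtime check can confirm the toolchain actually enabled OpenMP. A small standalone sketch (illustrative, not part of the patch):

```cpp
#include <cstdio>
#ifdef _OPENMP
#include <omp.h>
#endif

int main(){
#ifdef _OPENMP
    // _OPENMP expands to the supported spec date when -fopenmp is active.
    std::printf("OpenMP %d, max threads = %d\n", _OPENMP, omp_get_max_threads());
#else
    std::printf("compiled without OpenMP\n");
#endif
    return 0;
}
```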
diff --git a/Benchmark/particles_openmp/particles-simu-omp.cpp b/Benchmark/particles_openmp/particles-simu-omp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..293f4b457da8a7d02f26f5ab5f84a4c2e09753cd
--- /dev/null
+++ b/Benchmark/particles_openmp/particles-simu-omp.cpp
@@ -0,0 +1,683 @@
+///////////////////////////////////////////////////////////////////////////
+// Specx - Berenger Bramas MPCDF - 2017
+// Under LGPL Licence; please read the LICENCE file.
+///////////////////////////////////////////////////////////////////////////
+#include <utility>
+#include <thread>
+#include <chrono>
+#include <iostream>
+
+#include <clsimple.hpp>
+
+#include "Data/SpDataAccessMode.hpp"
+#include "Utils/SpUtils.hpp"
+#include "Task/SpTask.hpp"
+
+#include "Compute/SpComputeEngine.hpp"
+#include "Compute/SpWorkerTeamBuilder.hpp"
+#include "TaskGraph/SpTaskGraph.hpp"
+#include "Config/SpConfig.hpp"
+#include "Utils/SpTimer.hpp"
+
+#include "Scheduler/SpMultiPrioScheduler.hpp"
+
+#include <omp.h>
+
+
+class ParticlesGroup{
+public:
+    enum ValueTypes{
+        PHYSICAL,
+        X,
+        Y,
+        Z,
+        FX,
+        FY,
+        FZ,
+        POTENTIAL,
+        NB_VALUE_TYPES
+    };
+
+private:
+    std::size_t nbParticles;
+    std::vector<double> values[NB_VALUE_TYPES];
+
+public:
+    explicit ParticlesGroup(const std::size_t inNbParticles = 0)
+        : nbParticles(inNbParticles){
+        for(std::size_t idxValueType = 0 ; idxValueType < NB_VALUE_TYPES ; ++idxValueType){
+            values[idxValueType].resize(nbParticles, 0);
+        }
+    }
+
+    ParticlesGroup(const ParticlesGroup&) = default;
+    ParticlesGroup(ParticlesGroup&&) = default;
+    ParticlesGroup& operator=(const ParticlesGroup&) = default;
+    ParticlesGroup& operator=(ParticlesGroup&&) = default;
+
+    void setParticleValues(const std::size_t inIdxParticle,
+                           const std::array<double, NB_VALUE_TYPES>& inValues){
+        assert(inIdxParticle < nbParticles);
+        for(std::size_t idxValueType = 0 ; idxValueType < NB_VALUE_TYPES ; ++idxValueType){
+            values[idxValueType][inIdxParticle] = inValues[idxValueType];
+        }
+    }
+
+    auto getParticle(const std::size_t inIdxParticle) const {
+        std::array<double, NB_VALUE_TYPES> valuesPart;
+        for(std::size_t idxValueType = 0 ; idxValueType < NB_VALUE_TYPES ; ++idxValueType){
+            valuesPart[idxValueType] = values[idxValueType][inIdxParticle];
+        }
+        return valuesPart;
+    }
+
+    auto getNbParticles() const{
+        return nbParticles;
+    }
+
+    void computeSelf(){
+        for(std::size_t idxTarget = 0 ; idxTarget < getNbParticles() ; ++idxTarget){
+            const double tx = double(values[X][idxTarget]);
+            const double ty = double(values[Y][idxTarget]);
+            const double tz = double(values[Z][idxTarget]);
+            const double tv = double(values[PHYSICAL][idxTarget]);
+
+            double tfx = double(0.);
+            double tfy = double(0.);
+            double tfz = double(0.);
+            double tpo = double(0.);
+
+            for(std::size_t idxSource = idxTarget+1 ; idxSource < nbParticles ; ++idxSource){
+                double dx = tx - double(values[X][idxSource]);
+                double dy = ty - double(values[Y][idxSource]);
+                double dz = tz - double(values[Z][idxSource]);
+
+                double inv_square_distance = double(1) / (dx*dx + dy*dy + dz*dz);
+                const double inv_distance = sqrt(inv_square_distance);
+
+                inv_square_distance *= inv_distance;
+                inv_square_distance *= tv * double(values[PHYSICAL][idxSource]);
+
+                dx *= - inv_square_distance;
+                dy *= - inv_square_distance;
+                dz *= - inv_square_distance;
+
+                tfx += dx;
+                tfy += dy;
+                tfz += dz;
+                tpo += inv_distance * double(values[PHYSICAL][idxSource]);
+
+                values[FX][idxSource] -= dx;
+                values[FY][idxSource] -= dy;
+                values[FZ][idxSource] -= dz;
+                values[POTENTIAL][idxSource] += inv_distance * tv;
+            }
+
+            values[FX][idxTarget] += tfx;
+            values[FY][idxTarget] += tfy;
+            values[FZ][idxTarget] += tfz;
+            values[POTENTIAL][idxTarget] += tpo;
+        }
+    }
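+
+    // Note on computeSelf(): the inner loop starts at idxTarget+1 so each
+    // pair is visited once; the contribution accumulated on the target is
+    // applied with the opposite sign to the source (Newton's third law),
+    // halving the work of a naive all-pairs double loop.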
+
+    void compute(ParticlesGroup& inOther){
+        for(std::size_t idxTarget = 0 ; idxTarget < inOther.getNbParticles() ; ++idxTarget){
+            const double tx = double(inOther.values[X][idxTarget]);
+            const double ty = double(inOther.values[Y][idxTarget]);
+            const double tz = double(inOther.values[Z][idxTarget]);
+            const double tv = double(inOther.values[PHYSICAL][idxTarget]);
+
+            double tfx = double(0.);
+            double tfy = double(0.);
+            double tfz = double(0.);
+            double tpo = double(0.);
+
+            for(std::size_t idxSource = 0 ; idxSource < nbParticles ; ++idxSource){
+                double dx = tx - double(values[X][idxSource]);
+                double dy = ty - double(values[Y][idxSource]);
+                double dz = tz - double(values[Z][idxSource]);
+
+                double inv_square_distance = double(1) / (dx*dx + dy*dy + dz*dz);
+                const double inv_distance = sqrt(inv_square_distance);
+
+                inv_square_distance *= inv_distance;
+                inv_square_distance *= tv * double(values[PHYSICAL][idxSource]);
+
+                dx *= - inv_square_distance;
+                dy *= - inv_square_distance;
+                dz *= - inv_square_distance;
+
+                tfx += dx;
+                tfy += dy;
+                tfz += dz;
+                tpo += inv_distance * double(values[PHYSICAL][idxSource]);
+
+                values[FX][idxSource] -= dx;
+                values[FY][idxSource] -= dy;
+                values[FZ][idxSource] -= dz;
+                values[POTENTIAL][idxSource] += inv_distance * tv;
+            }
+
+            inOther.values[FX][idxTarget] += tfx;
+            inOther.values[FY][idxTarget] += tfy;
+            inOther.values[FZ][idxTarget] += tfz;
+            inOther.values[POTENTIAL][idxTarget] += tpo;
+        }
+    }
+
+    /////////////////////////////////////////////////////////////
+
+#ifdef SPECX_COMPILE_WITH_CUDA
+    struct DataDescr {
+        std::size_t nbParticles;
+        double* values;
+
+        double* getRawPtr() const{
+            return values;
+        }
+
+        size_t getRawSize() const{
+            return nbParticles*sizeof(double)*NB_VALUE_TYPES;
+        }
+    };
+
+    auto memmovHostToDevice(cudaStream_t stream){
+        double* doubleDevicePtr;
+        CUDA_ASSERT(cudaMalloc(&doubleDevicePtr, nbParticles*NB_VALUE_TYPES*sizeof(double)));
+        for(std::size_t idxValueType = 0 ; idxValueType < NB_VALUE_TYPES ; ++idxValueType){
+            CUDA_ASSERT(cudaMemcpyAsync(&doubleDevicePtr[idxValueType*nbParticles], values[idxValueType].data(), nbParticles*sizeof(double), cudaMemcpyHostToDevice, stream));
+        }
+        return DataDescr{nbParticles, doubleDevicePtr};
+    }
+
+    void memmovDeviceToHost(DataDescr& inDescr, cudaStream_t stream){
+        double* doubleDevicePtr = inDescr.values;
+        for(std::size_t idxValueType = 0 ; idxValueType < NB_VALUE_TYPES ; ++idxValueType){
+            CUDA_ASSERT(cudaMemcpyAsync(values[idxValueType].data(), &doubleDevicePtr[idxValueType*nbParticles], nbParticles*sizeof(double), cudaMemcpyDeviceToHost, stream));
+        }
+        CUDA_ASSERT(cudaFreeAsync(doubleDevicePtr, stream));
+    }
+#endif
+};
+
+#if defined(SPECX_COMPILE_WITH_CUDA) || defined(SPECX_COMPILE_WITH_HIP)
+template <class T>
+__device__ T CuMin(const T& v1, const T& v2){
+    return v1 < v2 ? v1 : v2;
+}
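+
+// Layout assumed by the kernels below: memmovHostToDevice() produces one
+// flat structure-of-arrays buffer of doubles where value type t occupies
+// the slice [t*nbParticles, (t+1)*nbParticles), so nbParticles can be
+// recovered from the raw size as size/sizeof(double)/NB_VALUE_TYPES.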
+
+__global__ void p2p_inner_gpu(void* data, std::size_t size){
+    const std::size_t nbParticles = size/sizeof(double)/ParticlesGroup::NB_VALUE_TYPES;
+    double* values[ParticlesGroup::NB_VALUE_TYPES];
+    for(std::size_t idxValueType = 0 ; idxValueType < ParticlesGroup::NB_VALUE_TYPES ; ++idxValueType){
+        values[idxValueType] = reinterpret_cast<double*>(data)+idxValueType*nbParticles;
+    }
+
+    constexpr std::size_t SHARED_MEMORY_SIZE = 128;
+    const std::size_t nbThreads = blockDim.x*gridDim.x;
+    const std::size_t uniqueId = threadIdx.x + blockIdx.x*blockDim.x;
+    const std::size_t nbIterations = ((nbParticles+nbThreads-1)/nbThreads)*nbThreads;
+
+    for(std::size_t idxTarget = uniqueId ; idxTarget < nbIterations ; idxTarget += nbThreads){
+        const bool threadCompute = (idxTarget<nbParticles);
+
+        double tx;
+        double ty;
+        double tz;
+        double tv;
+
+        if(threadCompute){
+            tx = double(values[ParticlesGroup::X][idxTarget]);
+            ty = double(values[ParticlesGroup::Y][idxTarget]);
+            tz = double(values[ParticlesGroup::Z][idxTarget]);
+            tv = double(values[ParticlesGroup::PHYSICAL][idxTarget]);
+        }
+
+        double tfx = double(0.);
+        double tfy = double(0.);
+        double tfz = double(0.);
+        double tpo = double(0.);
+
+        for(std::size_t idxCopy = 0 ; idxCopy < nbParticles ; idxCopy += SHARED_MEMORY_SIZE){
+            __shared__ double sourcesX[SHARED_MEMORY_SIZE];
+            __shared__ double sourcesY[SHARED_MEMORY_SIZE];
+            __shared__ double sourcesZ[SHARED_MEMORY_SIZE];
+            __shared__ double sourcesPhys[SHARED_MEMORY_SIZE];
+
+            const std::size_t nbCopies = CuMin(SHARED_MEMORY_SIZE, nbParticles-idxCopy);
+            for(std::size_t idx = threadIdx.x ; idx < nbCopies ; idx += blockDim.x){
+                sourcesX[idx] = values[ParticlesGroup::X][idx+idxCopy];
+                sourcesY[idx] = values[ParticlesGroup::Y][idx+idxCopy];
+                sourcesZ[idx] = values[ParticlesGroup::Z][idx+idxCopy];
+                sourcesPhys[idx] = values[ParticlesGroup::PHYSICAL][idx+idxCopy];
+            }
+
+            __syncthreads();
+
+            if(threadCompute){
+                for(std::size_t otherIndex = 0; otherIndex < nbCopies; ++otherIndex) {
+                    if(idxCopy + otherIndex != idxTarget){
+                        double dx = tx - sourcesX[otherIndex];
+                        double dy = ty - sourcesY[otherIndex];
+                        double dz = tz - sourcesZ[otherIndex];
+
+                        double inv_square_distance = double(1) / (dx*dx + dy*dy + dz*dz);
+                        const double inv_distance = sqrt(inv_square_distance);
+
+                        inv_square_distance *= inv_distance;
+                        inv_square_distance *= tv * sourcesPhys[otherIndex];
+
+                        dx *= - inv_square_distance;
+                        dy *= - inv_square_distance;
+                        dz *= - inv_square_distance;
+
+                        tfx += dx;
+                        tfy += dy;
+                        tfz += dz;
+                        tpo += inv_distance * sourcesPhys[otherIndex];
+                    }
+                }
+            }
+
+            __syncthreads();
+        }
+
+        if( threadCompute ){
+            values[ParticlesGroup::FX][idxTarget] += tfx;
+            values[ParticlesGroup::FY][idxTarget] += tfy;
+            values[ParticlesGroup::FZ][idxTarget] += tfz;
+            values[ParticlesGroup::POTENTIAL][idxTarget] += tpo;
+
+        }
+
+        __syncthreads();
+    }
+}
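+
+// Both kernels iterate to a bound rounded up to a multiple of the thread
+// count so that every thread reaches the __syncthreads() barriers of the
+// shared-memory tile loop; threads past the particle count keep
+// threadCompute false and only help stage data into shared memory.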
+
+__global__ void p2p_neigh_gpu(const void* dataSrc, std::size_t sizeSrc,
+                              void* dataTgt, std::size_t sizeTgt){
+    const std::size_t nbParticlesTgt = sizeTgt/sizeof(double)/ParticlesGroup::NB_VALUE_TYPES;
+    double* valuesTgt[ParticlesGroup::NB_VALUE_TYPES];
+    for(std::size_t idxValueType = 0 ; idxValueType < ParticlesGroup::NB_VALUE_TYPES ; ++idxValueType){
+        valuesTgt[idxValueType] = reinterpret_cast<double*>(dataTgt)+idxValueType*nbParticlesTgt;
+    }
+
+    const std::size_t nbParticlesSrc = sizeSrc/sizeof(double)/ParticlesGroup::NB_VALUE_TYPES;
+    const double* valuesSrc[ParticlesGroup::NB_VALUE_TYPES];
+    for(std::size_t idxValueType = 0 ; idxValueType < ParticlesGroup::NB_VALUE_TYPES ; ++idxValueType){
+        valuesSrc[idxValueType] = reinterpret_cast<const double*>(dataSrc)+idxValueType*nbParticlesSrc;
+    }
+
+    constexpr std::size_t SHARED_MEMORY_SIZE = 128;
+    const std::size_t nbThreads = blockDim.x*gridDim.x;
+    const std::size_t uniqueId = threadIdx.x + blockIdx.x*blockDim.x;
+    const std::size_t nbIterations = ((nbParticlesTgt+nbThreads-1)/nbThreads)*nbThreads;
+
+    for(std::size_t idxTarget = uniqueId ; idxTarget < nbIterations ; idxTarget += nbThreads){
+        const bool threadCompute = (idxTarget<nbParticlesTgt);
+
+        double tx;
+        double ty;
+        double tz;
+        double tv;
+
+        if(threadCompute){
+            tx = double(valuesTgt[ParticlesGroup::X][idxTarget]);
+            ty = double(valuesTgt[ParticlesGroup::Y][idxTarget]);
+            tz = double(valuesTgt[ParticlesGroup::Z][idxTarget]);
+            tv = double(valuesTgt[ParticlesGroup::PHYSICAL][idxTarget]);
+        }
+
+        double tfx = double(0.);
+        double tfy = double(0.);
+        double tfz = double(0.);
+        double tpo = double(0.);
+
+        for(std::size_t idxCopy = 0 ; idxCopy < nbParticlesSrc ; idxCopy += SHARED_MEMORY_SIZE){
+            __shared__ double sourcesX[SHARED_MEMORY_SIZE];
+            __shared__ double sourcesY[SHARED_MEMORY_SIZE];
+            __shared__ double sourcesZ[SHARED_MEMORY_SIZE];
+            __shared__ double sourcesPhys[SHARED_MEMORY_SIZE];
+
+            const std::size_t nbCopies = CuMin(SHARED_MEMORY_SIZE, nbParticlesSrc-idxCopy);
+            for(std::size_t idx = threadIdx.x ; idx < nbCopies ; idx += blockDim.x){
+                sourcesX[idx] = valuesSrc[ParticlesGroup::X][idx+idxCopy];
+                sourcesY[idx] = valuesSrc[ParticlesGroup::Y][idx+idxCopy];
+                sourcesZ[idx] = valuesSrc[ParticlesGroup::Z][idx+idxCopy];
+                sourcesPhys[idx] = valuesSrc[ParticlesGroup::PHYSICAL][idx+idxCopy];
+            }
+
+            __syncthreads();
+
+            if(threadCompute){
+                for(std::size_t otherIndex = 0; otherIndex < nbCopies; ++otherIndex) {
+                    double dx = tx - sourcesX[otherIndex];
+                    double dy = ty - sourcesY[otherIndex];
+                    double dz = tz - sourcesZ[otherIndex];
+
+                    double inv_square_distance = double(1) / (dx*dx + dy*dy + dz*dz);
+                    const double inv_distance = sqrt(inv_square_distance);
+
+                    inv_square_distance *= inv_distance;
+                    inv_square_distance *= tv * sourcesPhys[otherIndex];
+
+                    dx *= - inv_square_distance;
+                    dy *= - inv_square_distance;
+                    dz *= - inv_square_distance;
+
+                    tfx += dx;
+                    tfy += dy;
+                    tfz += dz;
+                    tpo += inv_distance * sourcesPhys[otherIndex];
+                }
+            }
+
+            __syncthreads();
+        }
+
+        if( threadCompute ){
+            valuesTgt[ParticlesGroup::FX][idxTarget] += tfx;
+            valuesTgt[ParticlesGroup::FY][idxTarget] += tfy;
+            valuesTgt[ParticlesGroup::FZ][idxTarget] += tfz;
+            valuesTgt[ParticlesGroup::POTENTIAL][idxTarget] += tpo;
+        }
+
+        __syncthreads();
+    }
+}
+
+#endif
+
+
+#include <random>
+
+template <class NumType = double>
+void FillRandomValues(ParticlesGroup& inGroup, const int inSeed,
+                      const NumType inPhysicalValue = 0.1,
+                      const NumType inMinPos = 0, const NumType inMaxPos = 1){
+    std::mt19937 gen(inSeed);
+    std::uniform_real_distribution<> distrib(inMinPos, inMaxPos);
+
+    std::array<double, ParticlesGroup::NB_VALUE_TYPES> vecPart;
+    std::fill(vecPart.begin(), vecPart.end(), 0);
+    vecPart[ParticlesGroup::PHYSICAL] = inPhysicalValue;
+
+    for(std::size_t idxPart = 0 ; idxPart < inGroup.getNbParticles() ; ++idxPart){
+        vecPart[ParticlesGroup::X] = distrib(gen);
+        vecPart[ParticlesGroup::Y] = distrib(gen);
+        vecPart[ParticlesGroup::Z] = distrib(gen);
+        inGroup.setParticleValues(idxPart, vecPart);
+    }
+}
+
+template <class ValueType = double>
+ValueType CheckAccuracy(const ParticlesGroup& inGroup1, const ParticlesGroup& inGroup2){
+    if(inGroup1.getNbParticles() != inGroup2.getNbParticles()){
+        return std::numeric_limits<ValueType>::infinity();
+    }
+
+    ValueType maxDiff = 0;
+    for(std::size_t idx = 0 ; idx < inGroup1.getNbParticles() ; ++idx){
+        const auto p1 = inGroup1.getParticle(idx);
+        const auto p2 = inGroup2.getParticle(idx);
+
+        for(std::size_t idxVal = 0 ; idxVal < ParticlesGroup::NB_VALUE_TYPES ; ++idxVal){
+            maxDiff = std::max(maxDiff, std::abs((p1[idxVal] - p2[idxVal])/(p1[idxVal] == 0? 1 : p1[idxVal])));
+        }
+    }
+    return maxDiff;
+}
+
+
+struct TuneResult{
+    int nbThreadsInner = 0;
+    int nbBlocksInner = 0;
+    int nbThreadsOuter = 0;
+    int nbBlocksOuter = 0;
+};
+
+
+template <int MaxNbDevices, const bool FavorLocality>
+auto GetPriority(const bool usePrioPairs, const int maxInteractions, const int minInteractions, const int currentInteractions){
+    if(usePrioPairs){
+        const int ratio = 5 * double(currentInteractions-minInteractions)/double(maxInteractions-minInteractions);
+        const int invRatio = 5 - ratio;
+        return SpPriority(SpMultiPrioScheduler<MaxNbDevices,FavorLocality>::GeneratePriorityWorkerPair(invRatio, ratio));
+    }
+    return SpPriority(0);
+}
+
+
+bool isCpuWorker;
+#pragma omp threadprivate(isCpuWorker)
+
+
+#ifdef SPECX_COMPILE_WITH_CUDA
+// Cuda stream
+cudaStream_t threadStream;
+#pragma omp threadprivate(threadStream)
+#endif
+
+template <int MaxNbDevices, const bool FavorLocality>
+auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPartsPerGroup,
+                const int NbGroups, const int nbCpu, const int nbGpu, const TuneResult& inKernelConfig,
+                const int maxInteractions, const int minInteractions){
+
+    std::vector<ParticlesGroup> particleGroups(NbGroups);
+
+    {
+        std::random_device rd;
+        std::uniform_int_distribution<int> dist(MinPartsPerGroup, MaxPartsPerGroup);
+        for(auto& group : particleGroups){
+            const int nbParts = dist(rd);
+            group = ParticlesGroup(nbParts);
+        }
+    }
+
+    std::vector<double> minMaxAvg(3);
+    minMaxAvg[0] = std::numeric_limits<double>::max();
+    minMaxAvg[1] = std::numeric_limits<double>::lowest();
+    minMaxAvg[2] = 0;
+
+    for(int idxLoop = 0 ; idxLoop < NbLoops ; ++idxLoop){
+        #pragma omp parallel
+        {
+            isCpuWorker = true;
+            #ifdef SPECX_COMPILE_WITH_CUDA
+            if(nbCpu <= omp_get_thread_num() ){
+                isCpuWorker = false;
+                const int deviceId = (omp_get_thread_num() - nbCpu)%SpCudaUtils::GetNbDevices();
+                CUDA_ASSERT(cudaSetDevice(deviceId));
+                CUDA_ASSERT(cudaStreamCreate(&threadStream));
+            }
+            #endif
+
+            #pragma omp master
+            {
+                SpTimer timer;
+                #pragma omp taskgroup
+                {
+
+                    for(int idxGroup1 = 0 ; idxGroup1 < int(particleGroups.size()) ; ++idxGroup1){
+                        auto& group1 = particleGroups[idxGroup1];
+                        const int priority = GetPriority<MaxNbDevices,FavorLocality>(false, maxInteractions, minInteractions, group1.getNbParticles()*group1.getNbParticles()).getPriority();
+                        #pragma omp task depend(mutexinout:group1) shared(group1) priority(priority) default(none)
+                        {
+                            if(isCpuWorker){
+                                group1.computeSelf();
+                            }
+                            #ifdef SPECX_COMPILE_WITH_CUDA
+                            else{
+                                auto descr = group1.memmovHostToDevice(threadStream);
+                                p2p_inner_gpu<<<inKernelConfig.nbBlocksInner,inKernelConfig.nbThreadsInner,0,threadStream>>>
+                                    (descr.getRawPtr(), descr.getRawSize());
+                                group1.memmovDeviceToHost(descr, threadStream);
+                                CUDA_ASSERT(cudaStreamSynchronize(threadStream));
+                            }
+                            #endif
+                        }
+                    }
+
+                    for(int idxGroup1 = 0 ; idxGroup1 < int(particleGroups.size()) ; ++idxGroup1){
+                        auto& group1 = particleGroups[idxGroup1];
+
+                        const int idxGroup2 = (idxGroup1+1)%particleGroups.size();
+                        auto& group2 = particleGroups[idxGroup2];
+
+                        const int priority = GetPriority<MaxNbDevices,FavorLocality>(false, maxInteractions, minInteractions, group1.getNbParticles()*group2.getNbParticles()).getPriority();
+
+                        #pragma omp task depend(mutexinout:group1, group2) shared(group1, group2) priority(priority) default(none)
+                        {
+                            if(isCpuWorker){
+                                group1.compute(group2);
+                            }
+                            #ifdef SPECX_COMPILE_WITH_CUDA
+                            else{
+                                auto descr1 = group1.memmovHostToDevice(threadStream);
+                                auto descr2 = group2.memmovHostToDevice(threadStream);
+                                p2p_neigh_gpu<<<inKernelConfig.nbBlocksOuter,inKernelConfig.nbThreadsOuter,0,threadStream>>>
+                                    (descr2.getRawPtr(), descr2.getRawSize(), descr1.getRawPtr(), descr1.getRawSize());
+                                p2p_neigh_gpu<<<inKernelConfig.nbBlocksOuter,inKernelConfig.nbThreadsOuter,0,threadStream>>>
+                                    (descr1.getRawPtr(), descr1.getRawSize(), descr2.getRawPtr(), descr2.getRawSize());
+                                group1.memmovDeviceToHost(descr1, threadStream);
+                                group2.memmovDeviceToHost(descr2, threadStream);
+                                CUDA_ASSERT(cudaStreamSynchronize(threadStream));
+                            }
+                            #endif
+                        }
+                    }
+                }
+
+                timer.stop();
+
+                minMaxAvg[0] = std::min(minMaxAvg[0], timer.getElapsed());
+                minMaxAvg[1] = std::max(minMaxAvg[1], timer.getElapsed());
+                minMaxAvg[2] += timer.getElapsed();
+            }
+
+            #pragma omp barrier
+
+            #ifdef SPECX_COMPILE_WITH_CUDA
+            if(isCpuWorker == false){
+                CUDA_ASSERT(cudaStreamDestroy(threadStream));
+            }
+            #endif
+        }
+    }
+
+    minMaxAvg[2] /= NbLoops;
+
+    return minMaxAvg;
+}
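+
+// Scheduling notes on BenchCore: depend(mutexinout: ...) makes tasks that
+// touch the same group mutually exclusive without forcing an order;
+// GetPriority() is called with usePrioPairs == false, so all tasks get
+// priority 0; threads with isCpuWorker == false take the CUDA branch and
+// drive their own thread-local stream.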
+
+void BenchmarkTest(int argc, char** argv, const TuneResult& inKernelConfig){
+    CLsimple args("Particles", argc, argv);
+
+    args.addParameterNoArg({"help"}, "help");
+
+    int NbLoops = 100;
+    args.addParameter<int>({"lp" ,"nbloops"}, "NbLoops", NbLoops, 100);
+
+    int MinPartsPerGroup;
+    args.addParameter<int>({"minp"}, "MinPartsPerGroup", MinPartsPerGroup, 100);
+
+    int MaxPartsPerGroup;
+    args.addParameter<int>({"maxp"}, "MaxPartsPerGroup", MaxPartsPerGroup, 300);
+
+    int MinNbGroups;
+    args.addParameter<int>({"minnbgroups"}, "MinNbGroups", MinNbGroups, 10);
+
+    int MaxNbGroups;
+    args.addParameter<int>({"maxnbgroups"}, "MaxNbGroups", MaxNbGroups, 10);
+
+    std::string outputDir = "./";
+    args.addParameter<std::string>({"od"}, "outputdir", outputDir, outputDir);
+
+    args.parse();
+
+    if(!args.isValid() || args.hasKey("help")){
+        // Print the help
+        args.printHelp(std::cout);
+        return;
+    }
+
+    const int maxInteractions = MaxPartsPerGroup*MaxPartsPerGroup;
+    const int minInteractions = MinPartsPerGroup*MinPartsPerGroup;
+
+    assert(MinPartsPerGroup <= MaxPartsPerGroup);
+    assert(MinNbGroups <= MaxNbGroups);
+
+#ifdef SPECX_COMPILE_WITH_CUDA
+    SpCudaUtils::PrintInfo();
+    const int nbMaxGpus = SpCudaUtils::GetNbDevices();
+#else
+    const int nbMaxGpus = 0;
+#endif
+
+    std::vector<std::vector<double>> allDurations;
+
+    for(int idxGpu = -1 ; idxGpu <= nbMaxGpus ; ++idxGpu){
+        for(int idxBlock = MinNbGroups ; idxBlock <= MaxNbGroups ; idxBlock *= 2){
+            const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+            const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus);
+
+            std::cout << "NbGpu = " << idxGpu << " BlockSize = " << idxBlock << std::endl;
+            const auto minMaxAvg = BenchCore<8,true>(NbLoops, MinPartsPerGroup,
+                                                     MaxPartsPerGroup, idxBlock, nbCpus, nbGpus, inKernelConfig,
+                                                     maxInteractions, minInteractions);
+            allDurations.push_back(minMaxAvg);
+            std::cout << " - Min = " << minMaxAvg[0] << " Max = " << minMaxAvg[1] << " Avg = " << minMaxAvg[2] << std::endl;
+        }
+    }
+
+    std::ofstream file(outputDir + "/particle-simu-omp.csv");
+    if(!file.is_open()){
+        std::cerr << "Cannot open file " << outputDir + "/particle-simu-omp.csv" << std::endl;
+        return;
+    }
+
+    file << "NbCpu,NbGpu,BlockSize,MinDuration,MaxDuration,AvgDuration" << std::endl;
+    int idxDuration = 0;
+    for(int idxGpu = -1 ; idxGpu <= nbMaxGpus ; ++idxGpu){
+        for(int idxBlock = MinNbGroups ; idxBlock <= MaxNbGroups ; idxBlock *= 2){
+            const int nbGpus = (idxGpu == -1 ? 0 : idxGpu);
+            const int nbCpus = std::max(1 , (idxGpu == -1 ? 1 : SpUtils::DefaultNumThreads())-nbGpus);
+
+            file << nbCpus << "," << nbGpus << "," << idxBlock << ","
+                 << allDurations[idxDuration][0] << ","
+                 << allDurations[idxDuration][1] << ","
+                 << allDurations[idxDuration][2] << std::endl;
+            idxDuration += 1;
+        }
+    }
+}
+
+
+int main(int argc, char** argv){
+
+    TuneResult tuneConfig{256, 128, 256, 128};
+    BenchmarkTest(argc, argv, tuneConfig);
+
+    return 0;
+}
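The tasking skeleton of BenchCore can be reduced to a few lines. A self-contained sketch, assuming an OpenMP 5.0 runtime for `mutexinout` and `priority` (stand-in arithmetic replaces the particle kernels):

```cpp
#include <omp.h>

// Tasks touching the same element are mutually exclusive but unordered,
// mirroring how BenchCore schedules the group/group interactions.
void runPairs(double* groups, int n){
    #pragma omp parallel
    #pragma omp master
    #pragma omp taskgroup
    {
        for(int i = 0; i < n; ++i){
            const int j = (i + 1) % n;
            #pragma omp task depend(mutexinout: groups[i], groups[j]) \
                             shared(groups) firstprivate(i, j) default(none) priority(1)
            {
                groups[i] += groups[j];   // stand-in for group1.compute(group2)
                groups[j] += groups[i];
            }
        }
    }
}
```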
diff --git a/Src/Compute/SpWorkerTeamBuilder.hpp b/Src/Compute/SpWorkerTeamBuilder.hpp
index 9c02b7d60401a189cf06bd7ed60fd5417159de1f..833e618675b2ab97a714e61d91e677b2a0dc3d67 100644
--- a/Src/Compute/SpWorkerTeamBuilder.hpp
+++ b/Src/Compute/SpWorkerTeamBuilder.hpp
@@ -42,9 +42,11 @@ static small_vector<std::unique_ptr<SpWorker>> TeamOfCudaWorkers(const int nbWor
     return res;
 }
 
-static small_vector<std::unique_ptr<SpWorker>> TeamOfCpuCudaWorkers(const int nbCpuWorkers = SpUtils::DefaultNumThreads(),
+static small_vector<std::unique_ptr<SpWorker>> TeamOfCpuCudaWorkers(const int nbCpuWorkersInit = -1,
                                                                     int nbCudaWorkers = SpCudaUtils::GetNbDevices(),
                                                                     const int nbWorkerPerCudas = SpCudaUtils::GetDefaultNbStreams()) {
+    const int nbCpuWorkers = (nbCpuWorkersInit != -1 ? nbCpuWorkersInit : std::max(1, SpUtils::DefaultNumThreads()-nbCudaWorkers));
+
     if(SpCudaUtils::GetNbDevices() < nbCudaWorkers){
         std::cout << "[SPECX] The number of devices asked ("
                   << nbCudaWorkers << ") is above the real number of devices ("
@@ -99,9 +101,10 @@ static small_vector<std::unique_ptr<SpWorker>> TeamOfHipWorkers(const int nbWork
     return res;
 }
 
-static small_vector<std::unique_ptr<SpWorker>> TeamOfCpuHipWorkers(const int nbCpuWorkers = SpUtils::DefaultNumThreads(),
+static small_vector<std::unique_ptr<SpWorker>> TeamOfCpuHipWorkers(const int nbCpuWorkersInit = -1,
                                                                    const int nbWorkerPerHips = SpHipUtils::GetDefaultNbStreams(),
                                                                    int nbHipWorkers = SpHipUtils::GetNbDevices()) {
+    const int nbCpuWorkers = (nbCpuWorkersInit != -1 ? nbCpuWorkersInit : std::max(1, SpUtils::DefaultNumThreads()-nbHipWorkers));
     if(SpHipUtils::GetNbDevices() < nbHipWorkers){
         std::cout << "[SPECX] The number of devices asked ("
                   << nbHipWorkers << ") is above the real number of devices ("
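With the sentinel default in place, callers can either take the automatic split or pin the counts explicitly. Usage sketch (illustrative, inside a translation unit that includes SpWorkerTeamBuilder.hpp with CUDA support enabled):

```cpp
// Default: CPU workers = max(1, DefaultNumThreads() - nbCudaWorkers).
auto autoTeam = SpWorkerTeamBuilder::TeamOfCpuCudaWorkers();
// Explicit: exactly 4 CPU workers alongside 2 CUDA workers.
auto pinnedTeam = SpWorkerTeamBuilder::TeamOfCpuCudaWorkers(4, 2);
```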