From fdef13f87f00d6c4729f691f7bc7c03a8864a947 Mon Sep 17 00:00:00 2001 From: Berenger Bramas <Berenger.Bramas@inria.fr> Date: Sat, 15 Mar 2025 17:39:48 +0100 Subject: [PATCH] wip --- Benchmark/particles_openmp/CMakeLists.txt | 2 +- Benchmark/particles_starpu/CMakeLists.txt | 8 +- .../particles-simu-starpu.cpp | 167 +++++++++--------- 3 files changed, 89 insertions(+), 88 deletions(-) diff --git a/Benchmark/particles_openmp/CMakeLists.txt b/Benchmark/particles_openmp/CMakeLists.txt index c6373f4..41bbe8a 100644 --- a/Benchmark/particles_openmp/CMakeLists.txt +++ b/Benchmark/particles_openmp/CMakeLists.txt @@ -61,5 +61,5 @@ if(OpenMP_FOUND) endif() endforeach(exec) else(OpenMP_FOUND) - MESSAGE_STATUS(WARNING "Openmp not found, disable particle simulation") + MESSAGE(WARNING "Openmp not found, disable particle simulation") endif(OpenMP_FOUND) diff --git a/Benchmark/particles_starpu/CMakeLists.txt b/Benchmark/particles_starpu/CMakeLists.txt index b082b5b..d840db7 100644 --- a/Benchmark/particles_starpu/CMakeLists.txt +++ b/Benchmark/particles_starpu/CMakeLists.txt @@ -20,7 +20,7 @@ if(DEFINED ENV{STARPU_INC_DIR} AND DEFINED ENV{STARPU_LIB_DIR}) link_directories(${STARPU_LIB_DIR}) # Ajout des bibliothèques StarPU - list(APPEND BENCHMARK_CP_SPECX_STARPU_LIBRARIES starpu) + list(APPEND BENCHMARK_CP_SPECX_STARPU_LIBRARIES starpu-1.4) # Find all code files file( @@ -65,6 +65,6 @@ if(DEFINED ENV{STARPU_INC_DIR} AND DEFINED ENV{STARPU_LIB_DIR}) ) endif() endforeach(exec) -else(OpenMP_FOUND) - MESSAGE_STATUS(WARNING "Openmp not found, disable particle simulation") -endif(OpenMP_FOUND) +else(DEFINED ENV{STARPU_INC_DIR} AND DEFINED ENV{STARPU_LIB_DIR}) + MESSAGE(WARNING "ENV{STARPU_INC_DIR} AND ENV{STARPU_LIB_DIR} undefined") +endif(DEFINED ENV{STARPU_INC_DIR} AND DEFINED ENV{STARPU_LIB_DIR}) diff --git a/Benchmark/particles_starpu/particles-simu-starpu.cpp b/Benchmark/particles_starpu/particles-simu-starpu.cpp index 941975e..28e5645 100644 --- a/Benchmark/particles_starpu/particles-simu-starpu.cpp +++ b/Benchmark/particles_starpu/particles-simu-starpu.cpp @@ -23,7 +23,7 @@ #include <starpu.h> #ifdef STARPU_CUDA -#include <starpu/cuda.h> +#include <starpu_cuda.h> #endif @@ -91,12 +91,12 @@ public: }; -void computeSelf(double* values[NB_VALUE_TYPES], const std::size_t nbParticles){ - for(std::size_t idxTarget = 0 ; idxTarget < getNbParticles() ; ++idxTarget){ - const double tx = double(values[X][idxTarget]); - const double ty = double(values[Y][idxTarget]); - const double tz = double(values[Z][idxTarget]); - const double tv = double(values[PHYSICAL][idxTarget]); +void computeSelf(double* values[ParticlesGroup::NB_VALUE_TYPES], const std::size_t nbParticles){ + for(std::size_t idxTarget = 0 ; idxTarget < nbParticles ; ++idxTarget){ + const double tx = double(values[ParticlesGroup::X][idxTarget]); + const double ty = double(values[ParticlesGroup::Y][idxTarget]); + const double tz = double(values[ParticlesGroup::Z][idxTarget]); + const double tv = double(values[ParticlesGroup::PHYSICAL][idxTarget]); double tfx = double(0.); double tfy = double(0.); @@ -104,15 +104,15 @@ void computeSelf(double* values[NB_VALUE_TYPES], const std::size_t nbParticles){ double tpo = double(0.); for(std::size_t idxSource = idxTarget+1 ; idxSource < nbParticles ; ++idxSource){ - double dx = tx - double(values[X][idxSource]); - double dy = ty - double(values[Y][idxSource]); - double dz = tz - double(values[Z][idxSource]); + double dx = tx - double(values[ParticlesGroup::X][idxSource]); + double dy = ty - double(values[ParticlesGroup::Y][idxSource]); + double dz = tz - double(values[ParticlesGroup::Z][idxSource]); double inv_square_distance = double(1) / (dx*dx + dy*dy + dz*dz); const double inv_distance = sqrt(inv_square_distance); inv_square_distance *= inv_distance; - inv_square_distance *= tv * double(values[PHYSICAL][idxSource]); + inv_square_distance *= tv * double(values[ParticlesGroup::PHYSICAL][idxSource]); dx *= - inv_square_distance; dy *= - inv_square_distance; @@ -121,28 +121,28 @@ void computeSelf(double* values[NB_VALUE_TYPES], const std::size_t nbParticles){ tfx += dx; tfy += dy; tfz += dz; - tpo += inv_distance * double(values[PHYSICAL][idxSource]); + tpo += inv_distance * double(values[ParticlesGroup::PHYSICAL][idxSource]); - values[FX][idxSource] -= dx; - values[FY][idxSource] -= dy; - values[FZ][idxSource] -= dz; - values[POTENTIAL][idxSource] += inv_distance * tv; + values[ParticlesGroup::FX][idxSource] -= dx; + values[ParticlesGroup::FY][idxSource] -= dy; + values[ParticlesGroup::FZ][idxSource] -= dz; + values[ParticlesGroup::POTENTIAL][idxSource] += inv_distance * tv; } - values[FX][idxTarget] += tfx; - values[FY][idxTarget] += tfy; - values[FZ][idxTarget] += tfz; - values[POTENTIAL][idxTarget] += tpo; + values[ParticlesGroup::FX][idxTarget] += tfx; + values[ParticlesGroup::FY][idxTarget] += tfy; + values[ParticlesGroup::FZ][idxTarget] += tfz; + values[ParticlesGroup::POTENTIAL][idxTarget] += tpo; } } -void compute(double* values[NB_VALUE_TYPES], const std::size_t nbParticles - double* inOther_values[NB_VALUE_TYPES], const std::size_t inOther_nbParticles){ +void compute(double* values[ParticlesGroup::NB_VALUE_TYPES], const std::size_t nbParticles, + double* inOther_values[ParticlesGroup::NB_VALUE_TYPES], const std::size_t inOther_nbParticles){ for(std::size_t idxTarget = 0 ; idxTarget < inOther_nbParticles ; ++idxTarget){ - const double tx = double(inOther_values[X][idxTarget]); - const double ty = double(inOther_values[Y][idxTarget]); - const double tz = double(inOther_values[Z][idxTarget]); - const double tv = double(inOther_values[PHYSICAL][idxTarget]); + const double tx = double(inOther_values[ParticlesGroup::X][idxTarget]); + const double ty = double(inOther_values[ParticlesGroup::Y][idxTarget]); + const double tz = double(inOther_values[ParticlesGroup::Z][idxTarget]); + const double tv = double(inOther_values[ParticlesGroup::PHYSICAL][idxTarget]); double tfx = double(0.); double tfy = double(0.); @@ -150,15 +150,15 @@ void compute(double* values[NB_VALUE_TYPES], const std::size_t nbParticles double tpo = double(0.); for(std::size_t idxSource = 0 ; idxSource < nbParticles ; ++idxSource){ - double dx = tx - double(values[X][idxSource]); - double dy = ty - double(values[Y][idxSource]); - double dz = tz - double(values[Z][idxSource]); + double dx = tx - double(values[ParticlesGroup::X][idxSource]); + double dy = ty - double(values[ParticlesGroup::Y][idxSource]); + double dz = tz - double(values[ParticlesGroup::Z][idxSource]); double inv_square_distance = double(1) / (dx*dx + dy*dy + dz*dz); const double inv_distance = sqrt(inv_square_distance); inv_square_distance *= inv_distance; - inv_square_distance *= tv * double(values[PHYSICAL][idxSource]); + inv_square_distance *= tv * double(values[ParticlesGroup::PHYSICAL][idxSource]); dx *= - inv_square_distance; dy *= - inv_square_distance; @@ -167,18 +167,18 @@ void compute(double* values[NB_VALUE_TYPES], const std::size_t nbParticles tfx += dx; tfy += dy; tfz += dz; - tpo += inv_distance * double(values[PHYSICAL][idxSource]); + tpo += inv_distance * double(values[ParticlesGroup::PHYSICAL][idxSource]); - values[FX][idxSource] -= dx; - values[FY][idxSource] -= dy; - values[FZ][idxSource] -= dz; - values[POTENTIAL][idxSource] += inv_distance * tv; + values[ParticlesGroup::FX][idxSource] -= dx; + values[ParticlesGroup::FY][idxSource] -= dy; + values[ParticlesGroup::FZ][idxSource] -= dz; + values[ParticlesGroup::POTENTIAL][idxSource] += inv_distance * tv; } - inOther_values[FX][idxTarget] += tfx; - inOther_values[FY][idxTarget] += tfy; - inOther_values[FZ][idxTarget] += tfz; - inOther_values[POTENTIAL][idxTarget] += tpo; + inOther_values[ParticlesGroup::FX][idxTarget] += tfx; + inOther_values[ParticlesGroup::FY][idxTarget] += tfy; + inOther_values[ParticlesGroup::FZ][idxTarget] += tfz; + inOther_values[ParticlesGroup::POTENTIAL][idxTarget] += tpo; } } @@ -371,12 +371,14 @@ __global__ void p2p_neigh_gpu(const void* dataSrc, std::size_t sizeSrc, #endif +const int NbThreadBlocks = 256; +const int NbThreadsPerBlock = 256; static void p2p_inner_cpu_starpu_starpu(void *buffers[], void *cl_arg) { double* values[ParticlesGroup::NB_VALUE_TYPES]; - values[0] = reinterpret_cast<double*>(STARPU_BLOCK_GET_PTR(buffers[0])); - const std::size_t nbParticles = STARPU_BLOCK_INTERFACE_ID(buffers[0]); + values[0] = reinterpret_cast<double*>(STARPU_VECTOR_GET_PTR(buffers[0])); + const std::size_t nbParticles = STARPU_VECTOR_GET_NX(buffers[0]); for(std::size_t idxValueType = 1 ; idxValueType < ParticlesGroup::NB_VALUE_TYPES ; ++idxValueType){ values[idxValueType] = values[idxValueType-1] + nbParticles; @@ -388,25 +390,49 @@ static void p2p_inner_cpu_starpu_starpu(void *buffers[], void *cl_arg) #ifdef SPECX_COMPILE_WITH_CUDA static void p2p_inner_gpu_starpu(void *buffers[], void *cl_arg) { - double* values = reinterpret_cast<double*>(STARPU_BLOCK_GET_PTR(buffers[0])); - const std::size_t nbParticles = STARPU_BLOCK_INTERFACE_ID(buffers[0]); - p2p_inner_gpu(values, nbParticles*sizeof(double)*ParticlesGroup::NB_VALUE_TYPES); + double* values = reinterpret_cast<double*>(STARPU_VECTOR_GET_PTR(buffers[0])); + const std::size_t nbParticles = STARPU_VECTOR_GET_NX(buffers[0]); + p2p_inner_gpu<<<NbThreadBlocks, NbThreadsPerBlock, 0, starpu_cuda_get_local_stream()>>>(values, nbParticles*sizeof(double)*ParticlesGroup::NB_VALUE_TYPES); + cudaStreamSynchronize(starpu_cuda_get_local_stream()); } #endif static void p2p_neigh_cpu_starpu(void *buffers[], void *cl_arg) { + double* values[ParticlesGroup::NB_VALUE_TYPES]; + values[0] = reinterpret_cast<double*>(STARPU_VECTOR_GET_PTR(buffers[0])); + const std::size_t nbParticles = STARPU_VECTOR_GET_NX(buffers[0]); + + for(std::size_t idxValueType = 1 ; idxValueType < ParticlesGroup::NB_VALUE_TYPES ; ++idxValueType){ + values[idxValueType] = values[idxValueType-1] + nbParticles; + } + + double* other_values[ParticlesGroup::NB_VALUE_TYPES]; + other_values[0] = reinterpret_cast<double*>(STARPU_VECTOR_GET_PTR(buffers[1])); + const std::size_t other_nbParticles = STARPU_VECTOR_GET_NX(buffers[1]); + + for(std::size_t idxValueType = 1 ; idxValueType < ParticlesGroup::NB_VALUE_TYPES ; ++idxValueType){ + other_values[idxValueType] = other_values[idxValueType-1] + other_nbParticles; + } + + compute(values, nbParticles, other_values, other_nbParticles); + compute(other_values, other_nbParticles, values, nbParticles); } #ifdef SPECX_COMPILE_WITH_CUDA static void p2p_neigh_gpu_starpu(void *buffers[], void *cl_arg) { - double* values = reinterpret_cast<double*>(STARPU_BLOCK_GET_PTR(buffers[0])); - const std::size_t nbParticles = STARPU_BLOCK_INTERFACE_ID(buffers[0]); + double* values = reinterpret_cast<double*>(STARPU_VECTOR_GET_PTR(buffers[0])); + const std::size_t nbParticles = STARPU_VECTOR_GET_NX(buffers[0]); - double* other_values = reinterpret_cast<double*>(STARPU_BLOCK_GET_PTR(buffers[1])); - const std::size_t other_nbParticles = STARPU_BLOCK_INTERFACE_ID(buffers[1]); + double* other_values = reinterpret_cast<double*>(STARPU_VECTOR_GET_PTR(buffers[1])); + const std::size_t other_nbParticles = STARPU_VECTOR_GET_NX(buffers[1]); + p2p_neigh_gpu<<<NbThreadBlocks, NbThreadsPerBlock, 0, starpu_cuda_get_local_stream()>>>(values, nbParticles*sizeof(double)*ParticlesGroup::NB_VALUE_TYPES, + other_values, other_nbParticles*sizeof(double)*ParticlesGroup::NB_VALUE_TYPES); + p2p_neigh_gpu<<<NbThreadBlocks, NbThreadsPerBlock, 0, starpu_cuda_get_local_stream()>>>(other_values, other_nbParticles*sizeof(double)*ParticlesGroup::NB_VALUE_TYPES, + values, nbParticles*sizeof(double)*ParticlesGroup::NB_VALUE_TYPES); + cudaStreamSynchronize(starpu_cuda_get_local_stream()); } #endif @@ -528,26 +554,19 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart for(int idxLoop = 0 ; idxLoop < NbLoops ; ++idxLoop){ starpu_init(NULL); + std::vector<starpu_data_handle_t> handles; + for(auto& group : particleGroups){ + starpu_data_handle_t handle; + starpu_vector_data_register(&handle, 0, reinterpret_cast<uintptr_t>(group.getRawPtr()), group.getRawSize(), 1); + handles.push_back(handle); + } + SpTimer timer; for(int idxGroup1 = 0 ; idxGroup1 < int(particleGroups.size()) ; ++idxGroup1){ auto& group1 = particleGroups[idxGroup1]; const int priority = GetPriority<MaxNbDevices,FavorLocality>(false, maxInteractions, minInteractions, group1.getNbParticles()*group1.getNbParticles()).getPriority(); - #pragma omp task depend(mutexinout:group1) firstprivate(group1) priority(priority) default(none) - { - if(isCpuWorker){ - group1.computeSelf(); - } - #ifdef SPECX_COMPILE_WITH_CUDA - else{ - auto descr = group1.memmovHostToDevice(threadStream); - p2p_inner_gpu<<<inKernelConfig.nbBlocksInner,inKernelConfig.nbThreadsInner,0,threadStream>>> - (descr.getRawPtr(), descr.getRawSize()); - group1.memmovDeviceToHost(descr, threadStream); - CUDA_ASSERT(cudaStreamSynchronize(threadStream)); - } - #endif - } + starpu_insert_task(&cl_one_data, STARPU_PRIORITY, priority, STARPU_RW, handles[idxGroup1], 0); } for(int idxGroup1 = 0 ; idxGroup1 < int(particleGroups.size()) ; ++idxGroup1){ @@ -558,29 +577,11 @@ auto BenchCore( const int NbLoops, const int MinPartsPerGroup, const int MaxPart const int priority = GetPriority<MaxNbDevices,FavorLocality>(false, maxInteractions, minInteractions, group1.getNbParticles()*group2.getNbParticles()).getPriority(); - #pragma omp task depend(mutexinout:group1, group2) firstprivate(group1, group2) priority(priority) default(none) - { - if(isCpuWorker){ - group1.compute(group2); - } - #ifdef SPECX_COMPILE_WITH_CUDA - else{ - auto descr1 = group1.memmovHostToDevice(threadStream); - auto descr2 = group2.memmovHostToDevice(threadStream); - p2p_neigh_gpu<<<inKernelConfig.nbBlocksOuter,inKernelConfig.nbThreadsOuter,0,threadStream>>> - (descr2.getRawPtr(), descr2.getRawSize(), descr1.getRawPtr(), descr1.getRawSize()); - p2p_neigh_gpu<<<inKernelConfig.nbBlocksOuter,inKernelConfig.nbThreadsOuter,0,threadStream>>> - (descr1.getRawPtr(), descr1.getRawSize(), descr2.getRawPtr(), descr2.getRawSize()); - group1.memmovDeviceToHost(descr1, threadStream); - group2.memmovDeviceToHost(descr2, threadStream); - CUDA_ASSERT(cudaStreamSynchronize(threadStream)); - } - #endif - } + starpu_insert_task(&cl_two_data, STARPU_PRIORITY, priority, STARPU_RW, handles[idxGroup1], STARPU_RW, handles[idxGroup2], 0); } // Wait for all tasks to be done - starpu_wait_for_all(); + starpu_task_wait_for_all(); timer.stop(); starpu_shutdown(); -- GitLab