Commit 98d317c0 authored by Quentin Khan

Merge adaptive FMM development

parents 3079d015 043c96cc
......@@ -395,7 +395,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/
# Default is DOUBLE and without THREADS|OMP
find_package(FFTW COMPONENTS SIMPLE) # not REQUIRED
if (FFTW_LIBRARY_DIRS_DEP)
set(FFT_LIBRARIES "-L${FFTW_LIBRARY_DIRS_DEP};" CACHE STRING "Set your MKL flags")
set(FFT_LIBRARIES "-L${FFTW_LIBRARY_DIRS_DEP};" CACHE STRING "Set your FFTW path")
endif()
if (FFTW_LIBRARIES_DEP)
foreach (fft_lib ${FFTW_LIBRARIES_DEP})
......@@ -455,8 +455,8 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/
message( FATAL_ERROR "nvcc is needed with CUDA." )
endif()
if(NOT DEFINED CUSTOM_CUDA_FLAGS)
set( CUSTOM_CUDA_FLAGS "-std=c++11;-arch=sm_20;-ptxas-options=-v;-use_fast_math" CACHE
STRING "Set your CUDA flags, for example : -arch=sm_20;-ptxas-options=-v;-use_fast_math")
set( CUSTOM_CUDA_FLAGS "-std=c++11;-arch=sm_20;--ptxas-options=-v;-use_fast_math" CACHE
STRING "Set your CUDA flags, for example : -arch=sm_20;--ptxas-options=-v;-use_fast_math")
endif()
# This is needed to remove backslash after space in ADD_CUSTOM_COMMAND
separate_arguments(CUSTOM_CUDA_FLAGS)
......
......@@ -325,6 +325,51 @@ public:
buildHandles();
}
#ifdef STARPU_USE_CPU
void forEachCpuWorker(std::function<void(void)> inCallback){
    // Wake the paused StarPU workers, run the callback once on every
    // CPU worker, then put the workers back to sleep.
    starpu_resume();
    FStarPUUtils::ExecOnWorkers(STARPU_CPU, inCallback);
    starpu_pause();
}
void forEachCpuWorker(std::function<void(KernelClass*)> inCallback){
    // Resume the workers and hand each CPU worker its own per-worker
    // kernel instance (as stored in cpuWrapper), then pause again.
    starpu_resume();
    FStarPUUtils::ExecOnWorkers(STARPU_CPU, [this, &inCallback](){
        const int workerId = starpu_worker_get_id();
        inCallback(cpuWrapper.getKernel(workerId));
    });
    starpu_pause();
}
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
void forEachCudaWorker(std::function<void(void)> inCallback){
    // Wake the paused StarPU workers, run the callback once on every
    // CUDA worker, then put the workers back to sleep.
    starpu_resume();
    FStarPUUtils::ExecOnWorkers(STARPU_CUDA, inCallback);
    starpu_pause();
}
void forEachCudaWorker(std::function<void(void*)> inCallback){
    // Resume the workers and hand each CUDA worker its own per-worker
    // kernel instance (as stored in cudaWrapper), then pause again.
    starpu_resume();
    FStarPUUtils::ExecOnWorkers(STARPU_CUDA, [this, &inCallback](){
        const int workerId = starpu_worker_get_id();
        inCallback(cudaWrapper.getKernel(workerId));
    });
    starpu_pause();
}
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
void forEachOpenCLWorker(std::function<void(void)> inCallback){
    // Wake the paused StarPU workers, run the callback once on every
    // OpenCL worker, then put the workers back to sleep.
    starpu_resume();
    FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, inCallback);
    starpu_pause();
}
void forEachOpenCLWorker(std::function<void(void*)> inCallback){
    // Resume the workers and hand each OpenCL worker its own per-worker
    // kernel instance (as stored in openclWrapper), then pause again.
    starpu_resume();
    FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [this, &inCallback](){
        const int workerId = starpu_worker_get_id();
        inCallback(openclWrapper.getKernel(workerId));
    });
    starpu_pause();
}
#endif
protected:
/**
* Runs the complete algorithm.
......@@ -1351,6 +1396,9 @@ protected:
}
}
#endif
};
#endif // FGROUPTASKSTARPUALGORITHM_HPP
This diff is collapsed.
......@@ -46,7 +46,8 @@ void FCuda__transferInoutPassCallback(
unsigned char* externalCellsPtr, std::size_t externalCellsSize,
unsigned char* externalCellsDownPtr,
int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions, CudaKernelClass* kernel, cudaStream_t currentStream,
int nbOutsideInteractions,
const int* safeInteractions, int nbSafeInteractions, CudaKernelClass* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
......
......@@ -10,5 +10,19 @@
#include <cuda.h>
#include <cstdio>
// Abort the process on any CUDA runtime error, printing the numeric code,
// the human-readable description, and the call site.
// NOTE(review): `exit(code)` truncates the exit status to 8 bits on POSIX,
// so large cudaError_t values may alias — kept for backward compatibility.
static void FCudaCheckCore(cudaError_t code, const char *file, int line) {
    if (code != cudaSuccess) {
        fprintf(stderr,"Cuda Error %d : %s %s %d\n", code, cudaGetErrorString(code), file, line);
        exit(code);
    }
}
// Wrap every CUDA runtime call: FCudaCheck( cudaMalloc(...) );
// The do/while(0) form makes each macro a single statement, so it stays
// correct inside a brace-less if/else (the previous plain `{ ... }` form
// broke `if(x) FCudaCheck(y); else ...`).
#define FCudaCheck( test ) do { FCudaCheckCore((test), __FILE__, __LINE__); } while(0)
// Check the sticky error left by a previous asynchronous launch.
#define FCudaCheckAfterCall() do { FCudaCheckCore((cudaGetLastError()), __FILE__, __LINE__); } while(0)
// Lightweight assert that only logs the failing line; it does NOT abort.
// The bare `if` of the previous version was a dangling-else hazard.
#define FCudaAssertLF(ARGS) do { if(!(ARGS)){ \
    printf("Error line %d\n", __LINE__); \
    } } while(0)
#endif // FCUDAGLOBAL_HPP
......@@ -66,6 +66,14 @@ public:
memset(kernels, 0, sizeof(KernelClass*)*STARPU_MAXCPUS);
}
// Per-worker CPU kernel instance; nullptr until initKernel() has been
// called for this workerId.
KernelClass* getKernel(const int workerId){
return kernels[workerId];
}
// Const overload of getKernel(); same per-worker lookup, read-only access.
const KernelClass* getKernel(const int workerId) const {
return kernels[workerId];
}
void initKernel(const int workerId, KernelClass* originalKernel){
FAssertLF(kernels[workerId] == nullptr);
kernels[workerId] = new KernelClass(*originalKernel);
......
......@@ -29,6 +29,8 @@
#include "../Cuda/FCudaDeviceWrapper.hpp"
#include "../Uniform/FUnifCudaCellPOD.hpp" // TODO remove
#include "FStarPUUtils.hpp"
template <class KernelClass, class SymboleCellClass, class PoleCellClass, class LocalCellClass,
......@@ -54,6 +56,14 @@ public:
memset(kernels, 0, sizeof(CudaKernelClass*)*STARPU_MAXCUDADEVS);
}
// Per-worker CUDA kernel instance; nullptr until initKernel() has built it
// (via FCuda__BuildCudaKernel) for this workerId.
CudaKernelClass* getKernel(const int workerId){
return kernels[workerId];
}
// Const overload of getKernel(); same per-worker lookup, read-only access.
const CudaKernelClass* getKernel(const int workerId) const {
return kernels[workerId];
}
void initKernel(const int workerId, KernelClass* originalKernel){
FAssertLF(kernels[workerId] == nullptr);
kernels[workerId] = FCuda__BuildCudaKernel<CudaKernelClass>(originalKernel);
......@@ -75,7 +85,7 @@ public:
int intervalSize;
starpu_codelet_unpack_args(cl_arg, &worker, &intervalSize, &intervalSize);
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CUDA_IDX)->kernels[starpu_worker_get_id()];
FCuda__bottomPassCallback< SymboleCellClass, PoleCellClass, LocalCellClass,
CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>(
......@@ -99,7 +109,7 @@ public:
int intervalSize = 0;
starpu_codelet_unpack_args(cl_arg, &worker, &nbSubCellGroups, &idxLevel, &intervalSize);
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CUDA_IDX)->kernels[starpu_worker_get_id()];
FCuda__upwardPassCallback< SymboleCellClass, PoleCellClass, LocalCellClass,
CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>(
......@@ -124,7 +134,7 @@ public:
int intervalSize = 0;
starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions, &intervalSize);
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CUDA_IDX)->kernels[starpu_worker_get_id()];
FCuda__transferInoutPassCallbackMpi< SymboleCellClass, PoleCellClass, LocalCellClass,
CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>(
......@@ -149,7 +159,7 @@ public:
int intervalSize = 0;
starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &intervalSize);
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CUDA_IDX)->kernels[starpu_worker_get_id()];
FCuda__transferInPassCallback< SymboleCellClass, PoleCellClass, LocalCellClass,
CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>(
......@@ -168,8 +178,34 @@ public:
int intervalSize = 0;
int mode = 0;
starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions, &intervalSize, &mode);
const int nbInteractions = int(outsideInteractions->size());
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CUDA_IDX)->kernels[starpu_worker_get_id()];
// outsideInteractions is sorted following the outIndex
// Compute the cell interval
const OutOfBlockInteraction* interactions;
std::unique_ptr<int[]> safeInteractions(new int[nbInteractions+1]);
int nbSafeInteractions = 0;
std::unique_ptr<OutOfBlockInteraction[]> insideInteractions;
if(mode == 0){
interactions = outsideInteractions->data();
nbSafeInteractions = GetClusterOfInteractionsOutside(safeInteractions.get(), outsideInteractions->data(), nbInteractions);
}
else{
insideInteractions.reset(new OutOfBlockInteraction[nbInteractions]);
memcpy(insideInteractions.get(), outsideInteractions->data(), nbInteractions*sizeof(OutOfBlockInteraction));
FQuickSort<OutOfBlockInteraction>::QsSequential(insideInteractions.get(), nbInteractions,
[](const OutOfBlockInteraction& inter1, const OutOfBlockInteraction& inter2){
// Could be insideIndex since the block are in morton order
return inter1.insideIdxInBlock <= inter2.insideIdxInBlock;
});
interactions = insideInteractions.get();
nbSafeInteractions = GetClusterOfInteractionsInside(safeInteractions.get(), insideInteractions.get(), nbInteractions);
}
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
FCuda__transferInoutPassCallback< SymboleCellClass, PoleCellClass, LocalCellClass,
CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>(
......@@ -179,7 +215,9 @@ public:
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[2]),
STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]),
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[3]),
idxLevel, mode, outsideInteractions->data(), int(outsideInteractions->size()), kernel,
idxLevel, mode, interactions, nbInteractions,
safeInteractions.get(), nbSafeInteractions,
kernel,
starpu_cuda_get_local_stream(),
FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel));
}
......@@ -194,7 +232,7 @@ public:
int intervalSize = 0;
starpu_codelet_unpack_args(cl_arg, &worker, &nbSubCellGroups, &idxLevel, &intervalSize);
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CUDA_IDX)->kernels[starpu_worker_get_id()];
FCuda__downardPassCallback< SymboleCellClass, PoleCellClass, LocalCellClass,
CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>(
......@@ -220,7 +258,7 @@ public:
starpu_codelet_unpack_args(cl_arg, &worker, &outsideInteractions, &intervalSize);
const int nbInteractions = int(outsideInteractions->size());
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CUDA_IDX)->kernels[starpu_worker_get_id()];
std::unique_ptr<int[]> safeOuterInteractions(new int[nbInteractions+1]);
const int counterOuterCell = GetClusterOfInteractionsOutside(safeOuterInteractions.get(), outsideInteractions->data(), nbInteractions);
......@@ -234,7 +272,7 @@ public:
STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]),
outsideInteractions->data(), nbInteractions,
safeOuterInteractions.get(), counterOuterCell,
worker->get<ThisClass>(FSTARPU_CPU_IDX)->treeHeight ,kernel, starpu_cuda_get_local_stream(),
worker->get<ThisClass>(FSTARPU_CUDA_IDX)->treeHeight ,kernel, starpu_cuda_get_local_stream(),
FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel));
}
#endif
......@@ -246,14 +284,14 @@ public:
FStarPUPtrInterface* worker = nullptr;
int intervalSize = 0;
starpu_codelet_unpack_args(cl_arg, &worker, &intervalSize);
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CUDA_IDX)->kernels[starpu_worker_get_id()];
FCuda__directInPassCallback< SymboleCellClass, PoleCellClass, LocalCellClass,
CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>(
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
worker->get<ThisClass>(FSTARPU_CPU_IDX)->treeHeight, kernel, starpu_cuda_get_local_stream(),
worker->get<ThisClass>(FSTARPU_CUDA_IDX)->treeHeight, kernel, starpu_cuda_get_local_stream(),
FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel));
}
......@@ -310,7 +348,7 @@ public:
starpu_codelet_unpack_args(cl_arg, &worker, &outsideInteractions, &intervalSize);
const int nbInteractions = int(outsideInteractions->size());
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CUDA_IDX)->kernels[starpu_worker_get_id()];
// outsideInteractions is sorted following the outIndex
// Compute the cell interval
......@@ -341,7 +379,7 @@ public:
safeOuterInteractions.get(), counterOuterCell,
insideInteractions.get(),
safeInnterInteractions.get(), counterInnerCell,
worker->get<ThisClass>(FSTARPU_CPU_IDX)->treeHeight,
worker->get<ThisClass>(FSTARPU_CUDA_IDX)->treeHeight,
kernel, starpu_cuda_get_local_stream(),
FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel));
}
......@@ -356,7 +394,7 @@ public:
int intervalSize;
starpu_codelet_unpack_args(cl_arg, &worker, &intervalSize);
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CUDA_IDX)->kernels[starpu_worker_get_id()];
FCuda__mergePassCallback< SymboleCellClass, PoleCellClass, LocalCellClass,
CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>(
......
......@@ -24,6 +24,7 @@ class FStarPUFmmPriorities{
int insertionPositionM2L;
int insertionPositionM2LExtern;
int insertionPositionM2LLastLevel;
int insertionPositionM2LExternLastLevel;
int insertionPositionL2L;
int insertionPositionL2P;
int insertionPositionP2P;
......@@ -90,13 +91,16 @@ public:
insertionPositionM2LLastLevel = incPrio++;
FLOG( FLog::Controller << "\t M2L last " << insertionPositionM2LLastLevel << "\n" );
insertionPositionM2LExternLastLevel = incPrio++;
FLOG( FLog::Controller << "\t M2L extern last " << insertionPositionM2LExternLastLevel << "\n" );
insertionPositionL2P = incPrio++;
FLOG( FLog::Controller << "\t L2P " << insertionPositionL2P << "\n" );
insertionPositionP2PExtern = incPrio++;
FLOG( FLog::Controller << "\t P2P Outer " << insertionPositionP2PExtern << "\n" );
assert(incPrio == 8 + (treeHeight-3) + (treeHeight-3) + (treeHeight-3));
assert(incPrio == 9 + (treeHeight-3) + (treeHeight-3) + (treeHeight-3));
}
else{
int incPrio = 0;
......@@ -110,6 +114,7 @@ public:
insertionPositionM2L = -1;
insertionPositionM2LExtern = -1;
insertionPositionM2LLastLevel = -1;
insertionPositionM2LExternLastLevel = -1;
insertionPositionL2L = -1;
......@@ -158,7 +163,8 @@ public:
FLOG( FLog::Controller << "\t CPU prio M2L " << cpuCountPrio << " bucket " << prioM2LAtLevel << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = prioM2LAtLevel;
heteroprio->buckets[prioM2LAtLevel].valide_archs |= STARPU_CPU;
}
if(capacities->supportM2LExtern(FSTARPU_CPU_IDX)){
const int prioM2LAtLevelExtern = getInsertionPosM2LExtern(idxLevel);
FLOG( FLog::Controller << "\t CPU prio M2L extern " << cpuCountPrio << " bucket " << prioM2LAtLevelExtern << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = prioM2LAtLevelExtern;
......@@ -186,6 +192,12 @@ public:
heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = prioM2LAtLevel;
heteroprio->buckets[prioM2LAtLevel].valide_archs |= STARPU_CPU;
}
if( !workOnlyOnLeaves && capacities->supportM2LExtern(FSTARPU_CPU_IDX)){
const int prioM2LAtLevel = getInsertionPosM2LExtern(treeHeight-1);
FLOG( FLog::Controller << "\t CPU prio M2L " << cpuCountPrio << " bucket " << prioM2LAtLevel << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = prioM2LAtLevel;
heteroprio->buckets[prioM2LAtLevel].valide_archs |= STARPU_CPU;
}
if( !workOnlyOnLeaves && capacities->supportL2P(FSTARPU_CPU_IDX)){
FLOG( FLog::Controller << "\t CPU prio L2P " << cpuCountPrio << " bucket " << insertionPositionL2P << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = insertionPositionL2P;
......@@ -291,13 +303,28 @@ public:
if(!workOnlyOnLeaves && capacities->supportM2L(FSTARPU_CUDA_IDX)){
for(int idxLevel = 2 ; idxLevel < treeHeight ; ++idxLevel){
const int prioM2LAtLevel = getInsertionPosM2L(idxLevel);
FLOG( FLog::Controller << "\t CUDA prio M2L ex " << cudaCountPrio << " bucket " << prioM2LAtLevel << "\n" );
FLOG( FLog::Controller << "\t CUDA prio M2L " << cudaCountPrio << " bucket " << prioM2LAtLevel << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CUDA_IDX][cudaCountPrio++] = prioM2LAtLevel;
heteroprio->buckets[prioM2LAtLevel].valide_archs |= STARPU_CUDA;
heteroprio->buckets[prioM2LAtLevel].factor_base_arch_index = FSTARPU_CUDA_IDX;
#ifdef STARPU_USE_CPU
if(capacities->supportM2L(FSTARPU_CUDA_IDX)){
heteroprio->buckets[prioM2LAtLevel].slow_factors_per_index[FSTARPU_CPU_IDX] = 10.0f;
heteroprio->buckets[prioM2LAtLevel].slow_factors_per_index[FSTARPU_CPU_IDX] = 15.0f;
}
#endif
}
}
if(!workOnlyOnLeaves && capacities->supportM2LExtern(FSTARPU_CUDA_IDX)){
for(int idxLevel = 2 ; idxLevel < treeHeight ; ++idxLevel){
const int prioM2LExternAtLevel = getInsertionPosM2LExtern(idxLevel);
FLOG( FLog::Controller << "\t CUDA prio M2L ex " << cudaCountPrio << " bucket " << prioM2LExternAtLevel << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CUDA_IDX][cudaCountPrio++] = prioM2LExternAtLevel;
heteroprio->buckets[prioM2LExternAtLevel].valide_archs |= STARPU_CUDA;
heteroprio->buckets[prioM2LExternAtLevel].factor_base_arch_index = FSTARPU_CUDA_IDX;
#ifdef STARPU_USE_CPU
if(capacities->supportM2L(FSTARPU_CUDA_IDX)){
heteroprio->buckets[prioM2LExternAtLevel].slow_factors_per_index[FSTARPU_CPU_IDX] = 5.0f;
}
#endif
}
......@@ -360,7 +387,7 @@ public:
return (inLevel==treeHeight-1? insertionPositionM2LLastLevel : insertionPositionM2L + (inLevel - 2)*3);
}
int getInsertionPosM2LExtern(const int inLevel) const {
return (inLevel==treeHeight-1? insertionPositionM2LLastLevel : insertionPositionM2LExtern + (inLevel - 2)*3);
return (inLevel==treeHeight-1? insertionPositionM2LExternLastLevel : insertionPositionM2LExtern + (inLevel - 2)*3);
}
int getInsertionPosL2L(const int inLevel) const {
return insertionPositionL2L + (inLevel - 2)*3;
......
......@@ -118,6 +118,54 @@ public:
}
};
// Capacity descriptor enabling the P2P and M2L operator families
// (plain, extern and MPI variants) on both CPU and CUDA workers.
template <class BaseClass>
class FStarPUCudaP2PM2LCapacities : public BaseClass, public FStarPUAbstractCapacities {
    // Shared predicate for every operator this class enables on the GPU.
    static bool OnCpuOrCuda(const FStarPUTypes inPu) {
        return (inPu == FSTARPU_CUDA_IDX) || (inPu == FSTARPU_CPU_IDX);
    }
    // Fallback for all operators not overridden below.
    // NOTE(review): reports CPU only — confirm that is the intended default.
    bool check(const FStarPUTypes inPu) const override{
        return inPu == FSTARPU_CPU_IDX;
    }
public:
    using BaseClass::BaseClass;
    bool supportP2P(const FStarPUTypes inPu) const override {
        return OnCpuOrCuda(inPu);
    }
    bool supportP2PExtern(const FStarPUTypes inPu) const override {
        return OnCpuOrCuda(inPu);
    }
    bool supportP2PMpi(const FStarPUTypes inPu) const override {
        return OnCpuOrCuda(inPu);
    }
    bool supportM2L(const FStarPUTypes inPu) const override {
        return OnCpuOrCuda(inPu);
    }
    bool supportM2LExtern(const FStarPUTypes inPu) const override {
        return OnCpuOrCuda(inPu);
    }
    bool supportM2LMpi(const FStarPUTypes inPu) const override {
        return OnCpuOrCuda(inPu);
    }
};
// Capacity descriptor enabling only the M2L operator family
// (plain, extern and MPI variants) on both CPU and CUDA workers.
template <class BaseClass>
class FStarPUCudaM2LCapacities : public BaseClass, public FStarPUAbstractCapacities {
    // Shared predicate for every operator this class enables on the GPU.
    static bool OnCpuOrCuda(const FStarPUTypes inPu) {
        return (inPu == FSTARPU_CUDA_IDX) || (inPu == FSTARPU_CPU_IDX);
    }
    // Fallback for all operators not overridden below.
    // NOTE(review): reports CPU only — confirm that is the intended default.
    bool check(const FStarPUTypes inPu) const override{
        return inPu == FSTARPU_CPU_IDX;
    }
public:
    using BaseClass::BaseClass;
    bool supportM2L(const FStarPUTypes inPu) const override {
        return OnCpuOrCuda(inPu);
    }
    bool supportM2LExtern(const FStarPUTypes inPu) const override {
        return OnCpuOrCuda(inPu);
    }
    bool supportM2LMpi(const FStarPUTypes inPu) const override {
        return OnCpuOrCuda(inPu);
    }
};
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
......
......@@ -50,6 +50,14 @@ public:
memset(kernels, 0, sizeof(OpenCLKernelClass*)*STARPU_MAXOPENCLDEVS);
}
// Per-worker OpenCL kernel instance; nullptr until initKernel() has been
// called for this workerId.
OpenCLKernelClass* getKernel(const int workerId){
return kernels[workerId];
}
// Const overload of getKernel(); same per-worker lookup, read-only access.
const OpenCLKernelClass* getKernel(const int workerId) const {
return kernels[workerId];
}
void initKernel(const int workerId, KernelClass* originalKernel){
FAssertLF(kernels[workerId] == nullptr);
kernels[workerId] = new OpenCLKernelClass(treeHeight);
......
......@@ -5,7 +5,7 @@
#include "../Cuda/FCudaGroupAttachedLeaf.hpp"
#include "../Cuda/FCudaCompositeCell.hpp"
#include "FUnifCudaSharedData.hpp"
#include "FUnifCudaCellPOD.hpp"
#define Min(x,y) ((x)<(y)?(x):(y))
......@@ -20,6 +20,71 @@ class FUnifCuda {
protected:
public:
typedef FCudaGroupAttachedLeaf<FReal,1,4,FReal> ContainerClass;
typedef FCudaCompositeCell<FBasicCellPOD,FCudaUnifCellPODPole<FReal,ORDER>,FCudaUnifCellPODLocal<FReal,ORDER> > CellClass;
static const int NB_THREAD_GROUPS = 30; // 2 x 15
static const int THREAD_GROUP_SIZE = 256;
static const int SHARED_MEMORY_SIZE = 512;// 49152
FUnifCudaSharedData<FReal,ORDER> data;
// P2M is intentionally empty on the device — presumably handled by the
// CPU-side kernels in this hybrid setup; TODO confirm.
__device__ void P2M(CellClass /*pole*/, const ContainerClass* const /*particles*/) {
}
// M2M is intentionally empty on the device — presumably handled by the
// CPU-side kernels in this hybrid setup; TODO confirm.
__device__ void M2M(CellClass /*pole*/, const CellClass /*child*/[8], const int /*level*/) {
}
// Level-dependent scaling applied to the M2L operators: 2 / CellWidth
// (see M2L below, which derives CellWidth from the tree level).
__device__ FReal getScaleFactor(const FReal CellWidth) const
{
return FReal(2.) / CellWidth;
}
// Fused complex multiply-accumulate: *res += other * another,
// with complex[0] = real part and complex[1] = imaginary part.
__device__ void addMul(FCudaUnifComplex<FReal>* __restrict__ res,
                       const FCudaUnifComplex<FReal>& other, const FCudaUnifComplex<FReal>& another) const {
    const FReal productRe = (other.complex[0] * another.complex[0]) - (other.complex[1] * another.complex[1]);
    const FReal productIm = (other.complex[0] * another.complex[1]) + (other.complex[1] * another.complex[0]);
    res->complex[0] += productRe;
    res->complex[1] += productIm;
}
// Entrywise complex product accumulated into FX:
//   FX[j] += (scale * data.FC[idx*opt_rc + j]) * FY[j]   for all frequencies j.
// The loop is strided over the threads of the block (j starts at threadIdx.x
// and advances by blockDim.x), so the whole block cooperates on one spectrum.
// `idx` selects which precomputed transfer matrix inside data.FC to apply;
// the unnamed second parameter (tree level) is currently unused here.
__device__ void applyFC(const unsigned int idx, const unsigned int, const FReal scale,
const FCudaUnifComplex<FReal> *const __restrict__ FY,
FCudaUnifComplex<FReal> *const __restrict__ FX) const
{
// Perform entrywise product manually
for (unsigned int j = threadIdx.x ; j < data.opt_rc; j += blockDim.x){
FCudaUnifComplex<FReal> FC_scale;
//FComplex<FReal>(scale*FC[idx*opt_rc + j].getReal(), scale*FC[idx*opt_rc + j].getImag()),
FC_scale.complex[0] = scale*data.FC[idx*data.opt_rc + j].complex[0];
FC_scale.complex[1] = scale*data.FC[idx*data.opt_rc + j].complex[1];
addMul(&FX[j], FC_scale, FY[j]);
}
}
// M2L: accumulate the transformed multipole expansion of every existing
// distant neighbour into the target cell's transformed local expansion.
// neighborPositions[i] gives the relative-position index used to select the
// precomputed FC operator; the scale factor depends only on the tree level
// (cell width = BoxWidth / 2^TreeLevel).
__device__ void M2L(CellClass pole, const CellClass* distantNeighbors,
const int* neighborPositions,
const int inSize, const int TreeLevel) {
const FReal CellWidth(data.BoxWidth / FReal(1 << TreeLevel));
const FReal scale(getScaleFactor(CellWidth));
FCudaUnifComplex<FReal>*const TransformedLocalExpansion = pole.down->transformed_local_exp;
for(int idxExistingNeigh = 0 ; idxExistingNeigh < inSize ; ++idxExistingNeigh){
const int idxNeigh = neighborPositions[idxExistingNeigh];
applyFC(idxNeigh, TreeLevel, scale,
distantNeighbors[idxExistingNeigh].up->transformed_multipole_exp,
TransformedLocalExpansion);
}
}
// L2L is intentionally empty on the device — presumably handled by the
// CPU-side kernels in this hybrid setup; TODO confirm.
__device__ void L2L(const CellClass /*local*/, CellClass /*child*/[8], const int /*level*/) {
}
// L2P is intentionally empty on the device — presumably handled by the
// CPU-side kernels in this hybrid setup; TODO confirm.
__device__ void L2P(const CellClass /*local*/, ContainerClass*const /*particles*/){
}
__device__ void DirectComputation(const FReal& targetX, const FReal& targetY, const FReal& targetZ,const FReal& targetPhys,
FReal& forceX, FReal& forceY,FReal& forceZ, FReal& potential,
const FReal& sourcesX, const FReal& sourcesY, const FReal& sourcesZ, const FReal& sourcesPhys) const {
......@@ -43,38 +108,6 @@ public:
potential += inv_distance * sourcesPhys;
}
// Double-precision square root helper (dispatches to the double overload).
static double DSqrt(const double val){
return sqrt(val);
}
// Single-precision square root helper (uses sqrtf to avoid a double round-trip).
static float FSqrt(const float val){
return sqrtf(val);
}
typedef FCudaGroupAttachedLeaf<FReal,1,4,FReal> ContainerClass;
typedef FCudaCompositeCell<FBasicCellPOD,FCudaUnifCellPODPole<FReal,ORDER>,FCudaUnifCellPODLocal<FReal,ORDER> > CellClass;
static const int NB_THREAD_GROUPS = 30; // 2 x 15
static const int THREAD_GROUP_SIZE = 256;
static const int SHARED_MEMORY_SIZE = 512;// 49152
__device__ void P2M(CellClass /*pole*/, const ContainerClass* const /*particles*/) {
}
__device__ void M2M(CellClass /*pole*/, const CellClass /*child*/[8], const int /*level*/) {
}
__device__ void M2L(CellClass /*pole*/, const CellClass* /*distantNeighbors*/,
const int* /*neighPositions*/,
const int /*size*/, const int /*level*/) {
}
__device__ void L2L(const CellClass /*local*/, CellClass /*child*/[8], const int /*level*/) {
}
__device__ void L2P(const CellClass /*local*/, ContainerClass*const /*particles*/){
}
__device__ void P2P(const int3& pos,
ContainerClass* const targets, const ContainerClass* const sources,
ContainerClass* const directNeighborsParticles,
......@@ -323,11 +356,15 @@ public:
}
// Allocate the kernel object directly in device memory; its contents
// (BoxWidth, the FC matrices) are filled later by FUnifCudaFillObject.
// Fix: a stray `return nullptr;` left over from the merge made the
// allocation below unreachable, so callers always received nullptr.
__host__ static FUnifCuda* InitKernelKernel(void*){
    FUnifCuda* cudaKernel = nullptr;
    FCudaCheck( cudaMalloc(&cudaKernel,sizeof(FUnifCuda)) );
    // %p requires a void* argument to be well-defined.
    printf("InitKernelKernel cudaKernel %p\n", (void*)cudaKernel);
    // Return pointer to application
    return cudaKernel;
}
// Free the device-side kernel object allocated by InitKernelKernel.
// Fix: the merge left both the old ("nothing to do") signature and the new
// one in place, yielding a malformed duplicate definition; keep the version
// that actually releases the device memory.
__host__ static void ReleaseKernel(FUnifCuda* cudaKernel){
    FCudaCheck(cudaFree(cudaKernel));
}
__host__ static dim3 GetGridSize(const int /*intervalSize*/){
......@@ -339,5 +376,14 @@ public:
}
};
// Copy the host-side shared data (box width + precomputed FC matrices) into
// the `data` member of the device-resident FUnifCuda object.
// `cudaKernel` is a DEVICE pointer (allocated by cudaMalloc in
// InitKernelKernel); taking &(...)->data only computes an address offset on
// the host — it never dereferences device memory — so this is valid.
template <class FReal, int ORDER>
void FUnifCudaFillObject(void* cudaKernel, const FUnifCudaSharedData<FReal,ORDER>& hostData){
FUnifCudaSharedData<FReal,ORDER>* cudaData = &((FUnifCuda<FReal,ORDER>*)cudaKernel)->data;
FCudaCheck( cudaMemcpy( cudaData, &hostData, sizeof(FUnifCudaSharedData<FReal,ORDER>),
cudaMemcpyHostToDevice ) );
}
#endif // FUNIFCUDA_HPP
......@@ -10,7 +10,7 @@ template <int ORDER> struct CudaTensorTraits
// POD complex number used on the device: complex[0] = real part,
// complex[1] = imaginary part (see addMul/applyFC usage).
// Fix: the merge left the old member name `data[2]` in place next to its
// renamed replacement `complex[2]`; all users access `.complex`, and the
// stale member silently doubled the struct size.
template <class FReal>
struct FCudaUnifComplex {
    FReal complex[2];
};
template <class FReal, int ORDER, int NRHS = 1, int NLHS = 1, int NVALS = 1>
......
#ifndef FUNIFCUDASHAREDDATA_HPP
#define FUNIFCUDASHAREDDATA_HPP
#include "../Cuda/FCudaGlobal.hpp"
#include "../../Utils/FGlobal.hpp"
#include "../StarPUUtils/FStarPUDefaultAlign.hpp"
#include "FUnifCudaCellPOD.hpp"
// Constant data shared by the CUDA M2L kernels: the simulation box width and
// the precomputed Fourier-domain transfer matrices FC. Filled on the device
// once via FUnifCudaFillObject.
template <class FReal, int ORDER>
struct alignas(FStarPUDefaultAlign::StructAlign) FUnifCudaSharedData {
enum {
// (2*ORDER-1)^3 entries of the padded interpolation grid.
rc = (2*ORDER-1)*(2*ORDER-1)*(2*ORDER-1),
// Half spectrum + 1 — presumably the real-to-complex FFT size; TODO confirm.
opt_rc = rc/2+1,
// 343 = 7^3 — presumably one slot per relative cell position; TODO confirm.
ninteractions = 343,
sizeFc = opt_rc * ninteractions
};
FReal BoxWidth;
FCudaUnifComplex<FReal> FC[sizeFc];
};
template <class FReal, int ORDER>
void FUnifCudaFillObject(void* cudaKernel, const FUnifCudaSharedData<FReal,ORDER>& hostData);
#endif // FUNIFCUDASHAREDDATA_HPP
......@@ -311,6 +311,9 @@ public:
}
// Read-only access to one precomputed transfer element: row i (presumably the
// interaction/position index — TODO confirm), frequency j within that row.
const FComplex<FReal>& getFc(const int i, const int j) const{
return FC[i*opt_rc + j];
}
};
......@@ -488,6 +491,9 @@ public:
Dft.applyDFT(Py,FY);
}
// Read-only access to one precomputed transfer element: row i (presumably the
// interaction/position index — TODO confirm), frequency j within that row.
const FComplex<FReal>& getFc(const int i, const int j) const{
return FC[i*opt_rc + j];
}
};
......
......@@ -42,6 +42,7 @@
#include "../../Src/GroupTree/Cuda/FCudaGroupOfParticles.hpp"
#include "../../Src/GroupTree/Cuda/FCudaGroupOfCells.hpp"
#include "../../Src/GroupTree/Uniform/FUnifCudaSharedData.hpp"
#include "../../Src/Utils/FParameterNames.hpp"
......@@ -50,7 +51,7 @@
template <class FReal, int ORDER>
class FUnifCuda;
//#define RANDOM_PARTICLES
#define RANDOM_PARTICLES