Commit 98d317c0 authored by Quentin Khan
Browse files

Merge adaptive FMM development

parents 3079d015 043c96cc
......@@ -395,7 +395,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/
# Default is DOUBLE and without THREADS|OMP
find_package(FFTW COMPONENTS SIMPLE) # not REQUIRED
if (FFTW_LIBRARY_DIRS_DEP)
set(FFT_LIBRARIES "-L${FFTW_LIBRARY_DIRS_DEP};" CACHE STRING "Set your MKL flags")
set(FFT_LIBRARIES "-L${FFTW_LIBRARY_DIRS_DEP};" CACHE STRING "Set your FFTW path")
endif()
if (FFTW_LIBRARIES_DEP)
foreach (fft_lib ${FFTW_LIBRARIES_DEP})
......@@ -455,8 +455,8 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/
message( FATAL_ERROR "nvcc is needed with CUDA." )
endif()
if(NOT DEFINED CUSTOM_CUDA_FLAGS)
set( CUSTOM_CUDA_FLAGS "-std=c++11;-arch=sm_20;-ptxas-options=-v;-use_fast_math" CACHE
STRING "Set your CUDA flags, for example : -arch=sm_20;-ptxas-options=-v;-use_fast_math")
set( CUSTOM_CUDA_FLAGS "-std=c++11;-arch=sm_20;--ptxas-options=-v;-use_fast_math" CACHE
STRING "Set your CUDA flags, for example : -arch=sm_20;--ptxas-options=-v;-use_fast_math")
endif()
# This is needed to remove backslash after space in ADD_CUSTOM_COMMAND
separate_arguments(CUSTOM_CUDA_FLAGS)
......
......@@ -325,6 +325,51 @@ public:
buildHandles();
}
#ifdef STARPU_USE_CPU
/**
 * Hands a callable to FStarPUUtils::ExecOnWorkers for the STARPU_CPU worker type.
 * starpu_resume() is called before the dispatch and starpu_pause() after it.
 * NOTE(review): presumably func is executed once on each CPU worker - confirm in FStarPUUtils::ExecOnWorkers.
 * @param func callable taking no argument
 */
void forEachCpuWorker(std::function<void(void)> func){
// NOTE(review): assumes the runtime is paused on entry and has to be resumed for the dispatch - TODO confirm
starpu_resume();
FStarPUUtils::ExecOnWorkers(STARPU_CPU, func);
starpu_pause();
}
/**
 * Hands a callable to FStarPUUtils::ExecOnWorkers for the STARPU_CPU worker type,
 * giving it the kernel that cpuWrapper stores for the executing worker.
 * starpu_resume() is called before the dispatch and starpu_pause() after it.
 * NOTE(review): presumably the callable runs once on each CPU worker - confirm in FStarPUUtils::ExecOnWorkers.
 * @param func callable receiving the KernelClass pointer of the current worker
 */
void forEachCpuWorker(std::function<void(KernelClass*)> func){
    starpu_resume();
    // The worker id is queried inside the callable, i.e. wherever ExecOnWorkers runs it
    auto applyOnLocalKernel = [&](){
        const int currentWorkerId = starpu_worker_get_id();
        func(cpuWrapper.getKernel(currentWorkerId));
    };
    FStarPUUtils::ExecOnWorkers(STARPU_CPU, applyOnLocalKernel);
    starpu_pause();
}
#endif
#ifdef SCALFMM_ENABLE_CUDA_KERNEL
/**
 * Hands a callable to FStarPUUtils::ExecOnWorkers for the STARPU_CUDA worker type.
 * starpu_resume() is called before the dispatch and starpu_pause() after it.
 * NOTE(review): presumably func is executed once on each CUDA worker - confirm in FStarPUUtils::ExecOnWorkers.
 * @param func callable taking no argument
 */
void forEachCudaWorker(std::function<void(void)> func){
// NOTE(review): assumes the runtime is paused on entry and has to be resumed for the dispatch - TODO confirm
starpu_resume();
FStarPUUtils::ExecOnWorkers(STARPU_CUDA, func);
starpu_pause();
}
/**
 * Hands a callable to FStarPUUtils::ExecOnWorkers for the STARPU_CUDA worker type,
 * giving it the kernel pointer that cudaWrapper stores for the executing worker.
 * The pointer is passed as void*, so the callable has to know the concrete kernel type.
 * starpu_resume() is called before the dispatch and starpu_pause() after it.
 * NOTE(review): presumably the callable runs once on each CUDA worker - confirm in FStarPUUtils::ExecOnWorkers.
 * @param func callable receiving the (type-erased) kernel pointer of the current worker
 */
void forEachCudaWorker(std::function<void(void*)> func){
    starpu_resume();
    // The worker id is queried inside the callable, i.e. wherever ExecOnWorkers runs it
    auto applyOnLocalKernel = [&](){
        const int currentWorkerId = starpu_worker_get_id();
        func(cudaWrapper.getKernel(currentWorkerId));
    };
    FStarPUUtils::ExecOnWorkers(STARPU_CUDA, applyOnLocalKernel);
    starpu_pause();
}
#endif
#ifdef SCALFMM_ENABLE_OPENCL_KERNEL
/**
 * Hands a callable to FStarPUUtils::ExecOnWorkers for the STARPU_OPENCL worker type.
 * starpu_resume() is called before the dispatch and starpu_pause() after it.
 * NOTE(review): presumably func is executed once on each OpenCL worker - confirm in FStarPUUtils::ExecOnWorkers.
 * @param func callable taking no argument
 */
void forEachOpenCLWorker(std::function<void(void)> func){
// NOTE(review): assumes the runtime is paused on entry and has to be resumed for the dispatch - TODO confirm
starpu_resume();
FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, func);
starpu_pause();
}
/**
 * Hands a callable to FStarPUUtils::ExecOnWorkers for the STARPU_OPENCL worker type,
 * giving it the kernel pointer that openclWrapper stores for the executing worker.
 * The pointer is passed as void*, so the callable has to know the concrete kernel type.
 * starpu_resume() is called before the dispatch and starpu_pause() after it.
 * NOTE(review): presumably the callable runs once on each OpenCL worker - confirm in FStarPUUtils::ExecOnWorkers.
 * @param func callable receiving the (type-erased) kernel pointer of the current worker
 */
void forEachOpenCLWorker(std::function<void(void*)> func){
    starpu_resume();
    // The worker id is queried inside the callable, i.e. wherever ExecOnWorkers runs it
    auto applyOnLocalKernel = [&](){
        const int currentWorkerId = starpu_worker_get_id();
        func(openclWrapper.getKernel(currentWorkerId));
    };
    FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, applyOnLocalKernel);
    starpu_pause();
}
#endif
protected:
/**
* Runs the complete algorithm.
......@@ -1351,6 +1396,9 @@ protected:
}
}
#endif
};
#endif // FGROUPTASKSTARPUALGORITHM_HPP
......@@ -5,18 +5,6 @@
#include "FCudaStructParams.hpp"
/**
 * Stops the program when a CUDA runtime status reports a failure.
 * For any status other than cudaSuccess, the numeric code, the text given by
 * cudaGetErrorString() and the call-site file/line are written to stderr,
 * then exit() is called with the CUDA status as exit code.
 * @param code status to inspect
 * @param file file name printed in the message (the FCudaCheck macros pass __FILE__)
 * @param line line number printed in the message (the FCudaCheck macros pass __LINE__)
 */
static void FCudaCheckCore(cudaError_t code, const char *file, int line) {
    // Nothing to report on success
    if (code == cudaSuccess) {
        return;
    }
    fprintf(stderr,"Cuda Error %d : %s %s %d\n", code, cudaGetErrorString(code), file, line);
    exit(code);
}
// Evaluates `test` (a cudaError_t expression) and forwards it with the call-site file/line
// to FCudaCheckCore, which prints a message and exits on failure.
// NOTE(review): expands to a brace block, not a single statement; the ';' written after it
// becomes an extra empty statement, which matters in an unbraced if/else.
#define FCudaCheck( test ) { FCudaCheckCore((test), __FILE__, __LINE__); }
// Same check applied to cudaGetLastError(); used in this code right after kernel launches.
#define FCudaCheckAfterCall() { FCudaCheckCore((cudaGetLastError()), __FILE__, __LINE__); }
// Soft assertion: prints the current line number when ARGS is false and lets execution continue.
// NOTE(review): expands to a bare `if`, so an `else` placed after it would bind to this `if`.
#define FCudaAssertLF(ARGS) if(!(ARGS)){\
printf("Error line %d\n", __LINE__);\
}
// Mirrors an index inside a 27-entry range: index i maps to 26 - i.
// NOTE(review): 27 is presumably the 3x3x3 neighbour count - confirm.
#define FMGetOppositeNeighIndex(index) (27-(index)-1)
// Mirrors an index inside a 343-entry range: index i maps to 342 - i.
// NOTE(review): 343 is presumably the 7x7x7 interaction-position count - confirm.
#define FMGetOppositeInterIndex(index) (343-(index)-1)
......@@ -187,18 +175,15 @@ template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
__global__ void FCuda__transferInPassPerform(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
int idxLevel, CudaKernelClass* kernel){
if(blockIdx.x != 0){
return;
}
CellContainerClass currentCells(currentCellsPtr, currentCellsSize, currentCellsUpPtr, currentCellsDownPtr);
const MortonIndex blockStartIdx = currentCells.getStartingIndex();
const MortonIndex blockEndIdx = currentCells.getEndingIndex();
for(int cellIdx = 0 ; cellIdx < currentCells.getNumberOfCellsInBlock() ; ++cellIdx){
for(int cellIdx = blockIdx.x ; cellIdx < currentCells.getNumberOfCellsInBlock() ; cellIdx += gridDim.x){
typename CellContainerClass::CompleteCellClass cell = currentCells.getDownCell(cellIdx);
FCudaAssertLF(cell.symb->mortonIndex == currentCells.getCellMortonIndex(cellIdx));
MortonIndex interactionsIndexes[189];
int interactionsPosition[189];
const int3 coord = (FCudaTreeCoordinate::ConvertCoordinate(cell.symb->coordinates));
......@@ -248,33 +233,35 @@ __global__ void FCuda__transferInoutPassPerform(unsigned char* currentCellsPtr,
unsigned char* externalCellsPtr, std::size_t externalCellsSize,
unsigned char* externalCellsUpPtr,
int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions, CudaKernelClass* kernel){
if(blockIdx.x != 0){
return;
}
int nbOutsideInteractions,
const int* safeInteractions, int nbSafeInteractions, CudaKernelClass* kernel){
CellContainerClass currentCells(currentCellsPtr, currentCellsSize, nullptr, currentCellsDownPtr);
CellContainerClass cellsOther(externalCellsPtr, externalCellsSize, externalCellsUpPtr, nullptr);
if(mode == 1){
for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){
typename CellContainerClass::CompleteCellClass interCell = cellsOther.getUpCell(outsideInteractions[outInterIdx].outsideIdxInBlock);
FCudaAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex);
typename CellContainerClass::CompleteCellClass cell = currentCells.getDownCell(outsideInteractions[outInterIdx].insideIdxInBlock);
FCudaAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex);
kernel->M2L( cell , &interCell, &outsideInteractions[outInterIdx].relativeOutPosition, 1, idxLevel);
for(int cellIdx = blockIdx.x ; cellIdx < nbSafeInteractions ; cellIdx += gridDim.x){
for(int outInterIdx = safeInteractions[cellIdx] ; outInterIdx < safeInteractions[cellIdx+1] ; ++outInterIdx){
typename CellContainerClass::CompleteCellClass interCell = cellsOther.getUpCell(outsideInteractions[outInterIdx].outsideIdxInBlock);
FCudaAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex);
typename CellContainerClass::CompleteCellClass cell = currentCells.getDownCell(outsideInteractions[outInterIdx].insideIdxInBlock);
FCudaAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex);
kernel->M2L( cell , &interCell, &outsideInteractions[outInterIdx].relativeOutPosition, 1, idxLevel);
}
}
}
else{
for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){
typename CellContainerClass::CompleteCellClass cell = cellsOther.getUpCell(outsideInteractions[outInterIdx].insideIdxInBlock);
FCudaAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex);
typename CellContainerClass::CompleteCellClass interCell = currentCells.getDownCell(outsideInteractions[outInterIdx].outsideIdxInBlock);
FCudaAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex);
const int otherPosition = FMGetOppositeInterIndex(outsideInteractions[outInterIdx].relativeOutPosition);
kernel->M2L( interCell , &cell, &otherPosition, 1, idxLevel);
for(int cellIdx = blockIdx.x ; cellIdx < nbSafeInteractions ; cellIdx += gridDim.x){
for(int outInterIdx = safeInteractions[cellIdx] ; outInterIdx < safeInteractions[cellIdx+1] ; ++outInterIdx){
typename CellContainerClass::CompleteCellClass cell = cellsOther.getUpCell(outsideInteractions[outInterIdx].insideIdxInBlock);
FCudaAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex);
typename CellContainerClass::CompleteCellClass interCell = currentCells.getDownCell(outsideInteractions[outInterIdx].outsideIdxInBlock);
FCudaAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex);
const int otherPosition = FMGetOppositeInterIndex(outsideInteractions[outInterIdx].relativeOutPosition);
kernel->M2L( interCell , &cell, &otherPosition, 1, idxLevel);
}
}
}
}
......@@ -286,14 +273,20 @@ __host__ void FCuda__transferInoutPassCallback(unsigned char* currentCellsPtr, s
unsigned char* currentCellsDownPtr,
unsigned char* externalCellsPtr, std::size_t externalCellsSize,
unsigned char* externalCellsUpPtr,
int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions, CudaKernelClass* kernel, cudaStream_t currentStream,
int idxLevel, int mode,
const OutOfBlockInteraction* outsideInteractions, int nbOutsideInteractions,
const int* safeInteractions, int nbSafeInteractions,
CudaKernelClass* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize){
OutOfBlockInteraction* cuOutsideInteractions;
FCudaCheck( cudaMalloc(&cuOutsideInteractions,nbOutsideInteractions*sizeof(OutOfBlockInteraction)) );
FCudaCheck( cudaMemcpy( cuOutsideInteractions, outsideInteractions, nbOutsideInteractions*sizeof(OutOfBlockInteraction),
cudaMemcpyHostToDevice ) );
int* cuSafeInteractions;
FCudaCheck( cudaMalloc(&cuSafeInteractions,(nbSafeInteractions+1)*sizeof(int)) );
FCudaCheck( cudaMemcpy( cuSafeInteractions, safeInteractions, (nbSafeInteractions+1)*sizeof(int),
cudaMemcpyHostToDevice ) );
FCuda__transferInoutPassPerform
<SymboleCellClass, PoleCellClass, LocalCellClass,
......@@ -302,12 +295,15 @@ __host__ void FCuda__transferInoutPassCallback(unsigned char* currentCellsPtr, s
currentCellsDownPtr,
externalCellsPtr, externalCellsSize,
externalCellsUpPtr,
idxLevel, mode, cuOutsideInteractions,
nbOutsideInteractions, kernel);
idxLevel, mode,
cuOutsideInteractions, nbOutsideInteractions,
cuSafeInteractions, nbSafeInteractions,
kernel);
FCudaCheckAfterCall();
FCudaCheck(cudaStreamSynchronize(currentStream));
FCudaCheck(cudaFree(cuOutsideInteractions));
FCudaCheck(cudaFree(cuSafeInteractions));
}
......@@ -692,7 +688,8 @@ unsigned char* currentCellsDownPtr,
unsigned char* externalCellsPtr, std::size_t externalCellsSize,
unsigned char* externalCellsUpPtr,
int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream,
int nbOutsideInteractions,
const int* safeInteractions, int nbSafeInteractions, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
......@@ -782,7 +779,8 @@ unsigned char* currentCellsDownPtr,
unsigned char* externalCellsPtr, std::size_t externalCellsSize,
unsigned char* externalCellsUpPtr,
int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions, FTestCudaKernels<float>* kernel, cudaStream_t currentStream,
int nbOutsideInteractions,
const int* safeInteractions, int nbSafeInteractions, FTestCudaKernels<float>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__downardPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>,
......@@ -869,7 +867,8 @@ unsigned char* currentCellsDownPtr,
unsigned char* externalCellsPtr, std::size_t externalCellsSize,
unsigned char* externalCellsUpPtr,
int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions, FTestCudaKernels<double>* kernel, cudaStream_t currentStream,
int nbOutsideInteractions,
const int* safeInteractions, int nbSafeInteractions, FTestCudaKernels<double>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__downardPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>,
......@@ -959,7 +958,8 @@ unsigned char* currentCellsDownPtr,
unsigned char* externalCellsPtr, std::size_t externalCellsSize,
unsigned char* externalCellsUpPtr,
int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions, FCudaP2P<float>* kernel, cudaStream_t currentStream,
int nbOutsideInteractions,
const int* safeInteractions, int nbSafeInteractions, FCudaP2P<float>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
......@@ -1046,7 +1046,8 @@ unsigned char* currentCellsDownPtr,
unsigned char* externalCellsPtr, std::size_t externalCellsSize,
unsigned char* externalCellsUpPtr,
int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions, FCudaP2P<double>* kernel, cudaStream_t currentStream,
int nbOutsideInteractions,
const int* safeInteractions, int nbSafeInteractions, FCudaP2P<double>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
......@@ -1137,7 +1138,8 @@ unsigned char* currentCellsDownPtr,
unsigned char* externalCellsPtr, std::size_t externalCellsSize,
unsigned char* externalCellsUpPtr,
int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions, FUnifCuda<float,5>* kernel, cudaStream_t currentStream,
int nbOutsideInteractions,
const int* safeInteractions, int nbSafeInteractions, FUnifCuda<float,5>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__downardPassCallback<FBasicCellPOD, FCudaUnifCellPODPole<float,5>,FCudaUnifCellPODLocal<float,5>, FCudaGroupOfCells<FBasicCellPOD, FCudaUnifCellPODPole<float,5>,FCudaUnifCellPODLocal<float,5>>,
......@@ -1185,6 +1187,7 @@ template void FCuda__ReleaseCudaKernel<FUnifCuda<float,5>>(FUnifCuda<float,5>* c
template dim3 FCuda__GetGridSize< FUnifCuda<float,5> >(FUnifCuda<float,5>* kernel, int intervalSize);
template dim3 FCuda__GetBlockSize< FUnifCuda<float,5> >(FUnifCuda<float,5>* cukernel);
template void FUnifCudaFillObject(void* cudaKernel, const FUnifCudaSharedData<double,5>& hostData);
......@@ -1224,7 +1227,8 @@ unsigned char* currentCellsDownPtr,
unsigned char* externalCellsPtr, std::size_t externalCellsSize,
unsigned char* externalCellsUpPtr,
int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions, FUnifCuda<double,5>* kernel, cudaStream_t currentStream,
int nbOutsideInteractions,
const int* safeInteractions, int nbSafeInteractions, FUnifCuda<double,5>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__downardPassCallback<FBasicCellPOD, FCudaUnifCellPODPole<double,5>,FCudaUnifCellPODLocal<double,5>, FCudaGroupOfCells<FBasicCellPOD, FCudaUnifCellPODPole<double,5>,FCudaUnifCellPODLocal<double,5>>,
......@@ -1271,3 +1275,5 @@ template void FCuda__ReleaseCudaKernel<FUnifCuda<double,5>>(FUnifCuda<double,5>*
template dim3 FCuda__GetGridSize< FUnifCuda<double,5> >(FUnifCuda<double,5>* kernel, int intervalSize);
template dim3 FCuda__GetBlockSize< FUnifCuda<double,5> >(FUnifCuda<double,5>* cukernel);
template void FUnifCudaFillObject(void* cudaKernel, const FUnifCudaSharedData<float,5>& hostData);
......@@ -46,7 +46,8 @@ void FCuda__transferInoutPassCallback(
unsigned char* externalCellsPtr, std::size_t externalCellsSize,
unsigned char* externalCellsDownPtr,
int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions, CudaKernelClass* kernel, cudaStream_t currentStream,
int nbOutsideInteractions,
const int* safeInteractions, int nbSafeInteractions, CudaKernelClass* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
......
......@@ -10,5 +10,19 @@
#include <cuda.h>
#include <cstdio>
/**
 * Stops the program when a CUDA runtime status reports a failure.
 * For any status other than cudaSuccess, the numeric code, the text given by
 * cudaGetErrorString() and the call-site file/line are written to stderr,
 * then exit() is called with the CUDA status as exit code.
 * @param code status to inspect
 * @param file file name printed in the message (the FCudaCheck macros pass __FILE__)
 * @param line line number printed in the message (the FCudaCheck macros pass __LINE__)
 */
static void FCudaCheckCore(cudaError_t code, const char *file, int line) {
    // Nothing to report on success
    if (code == cudaSuccess) {
        return;
    }
    fprintf(stderr,"Cuda Error %d : %s %s %d\n", code, cudaGetErrorString(code), file, line);
    exit(code);
}
// Evaluates `test` (a cudaError_t expression) and forwards it with the call-site file/line
// to FCudaCheckCore, which prints a message and exits on failure.
// NOTE(review): expands to a brace block, not a single statement; the ';' written after it
// becomes an extra empty statement, which matters in an unbraced if/else.
#define FCudaCheck( test ) { FCudaCheckCore((test), __FILE__, __LINE__); }
// Same check applied to cudaGetLastError().
#define FCudaCheckAfterCall() { FCudaCheckCore((cudaGetLastError()), __FILE__, __LINE__); }
// Soft assertion: prints the current line number when ARGS is false and lets execution continue.
// NOTE(review): expands to a bare `if`, so an `else` placed after it would bind to this `if`.
#define FCudaAssertLF(ARGS) if(!(ARGS)){\
printf("Error line %d\n", __LINE__);\
}
#endif // FCUDAGLOBAL_HPP
......@@ -66,6 +66,14 @@ public:
memset(kernels, 0, sizeof(KernelClass*)*STARPU_MAXCPUS);
}
/**
 * Gives access to the kernel stored for a worker.
 * The index is used as is; no range check is performed.
 * NOTE(review): slots look zero-filled by the memset above, so presumably null until initKernel ran - confirm.
 * @param workerId slot in the kernels table
 * @return the pointer stored in that slot
 */
KernelClass* getKernel(const int workerId){
    KernelClass* const kernelOfWorker = kernels[workerId];
    return kernelOfWorker;
}
/**
 * Read-only access to the kernel stored for a worker.
 * The index is used as is; no range check is performed.
 * @param workerId slot in the kernels table
 * @return the pointer stored in that slot, as pointer-to-const
 */
const KernelClass* getKernel(const int workerId) const {
    const KernelClass* const kernelOfWorker = kernels[workerId];
    return kernelOfWorker;
}
void initKernel(const int workerId, KernelClass* originalKernel){
FAssertLF(kernels[workerId] == nullptr);
kernels[workerId] = new KernelClass(*originalKernel);
......
......@@ -29,6 +29,8 @@
#include "../Cuda/FCudaDeviceWrapper.hpp"
#include "../Uniform/FUnifCudaCellPOD.hpp" // TODO remove
#include "FStarPUUtils.hpp"
template <class KernelClass, class SymboleCellClass, class PoleCellClass, class LocalCellClass,
......@@ -54,6 +56,14 @@ public:
memset(kernels, 0, sizeof(CudaKernelClass*)*STARPU_MAXCUDADEVS);
}
/**
 * Gives access to the CUDA kernel object stored for a worker.
 * The index is used as is; no range check is performed.
 * NOTE(review): slots look zero-filled by the memset above, so presumably null until initKernel ran - confirm.
 * @param workerId slot in the kernels table
 * @return the pointer stored in that slot
 */
CudaKernelClass* getKernel(const int workerId){
    CudaKernelClass* const kernelOfWorker = kernels[workerId];
    return kernelOfWorker;
}
/**
 * Read-only access to the CUDA kernel object stored for a worker.
 * The index is used as is; no range check is performed.
 * @param workerId slot in the kernels table
 * @return the pointer stored in that slot, as pointer-to-const
 */
const CudaKernelClass* getKernel(const int workerId) const {
    const CudaKernelClass* const kernelOfWorker = kernels[workerId];
    return kernelOfWorker;
}
void initKernel(const int workerId, KernelClass* originalKernel){
FAssertLF(kernels[workerId] == nullptr);
kernels[workerId] = FCuda__BuildCudaKernel<CudaKernelClass>(originalKernel);
......@@ -75,7 +85,7 @@ public:
int intervalSize;
starpu_codelet_unpack_args(cl_arg, &worker, &intervalSize, &intervalSize);
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CUDA_IDX)->kernels[starpu_worker_get_id()];
FCuda__bottomPassCallback< SymboleCellClass, PoleCellClass, LocalCellClass,
CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>(
......@@ -99,7 +109,7 @@ public:
int intervalSize = 0;
starpu_codelet_unpack_args(cl_arg, &worker, &nbSubCellGroups, &idxLevel, &intervalSize);
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CUDA_IDX)->kernels[starpu_worker_get_id()];
FCuda__upwardPassCallback< SymboleCellClass, PoleCellClass, LocalCellClass,
CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>(
......@@ -124,7 +134,7 @@ public:
int intervalSize = 0;
starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions, &intervalSize);
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CUDA_IDX)->kernels[starpu_worker_get_id()];
FCuda__transferInoutPassCallbackMpi< SymboleCellClass, PoleCellClass, LocalCellClass,
CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>(
......@@ -149,7 +159,7 @@ public:
int intervalSize = 0;
starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &intervalSize);
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CUDA_IDX)->kernels[starpu_worker_get_id()];
FCuda__transferInPassCallback< SymboleCellClass, PoleCellClass, LocalCellClass,
CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>(
......@@ -168,8 +178,34 @@ public:
int intervalSize = 0;
int mode = 0;
starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions, &intervalSize, &mode);
const int nbInteractions = int(outsideInteractions->size());
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CUDA_IDX)->kernels[starpu_worker_get_id()];
// outsideInteractions is sorted following the outIndex
// Compute the cell interval
const OutOfBlockInteraction* interactions;
std::unique_ptr<int[]> safeInteractions(new int[nbInteractions+1]);
int nbSafeInteractions = 0;
std::unique_ptr<OutOfBlockInteraction[]> insideInteractions;
if(mode == 0){
interactions = outsideInteractions->data();
nbSafeInteractions = GetClusterOfInteractionsOutside(safeInteractions.get(), outsideInteractions->data(), nbInteractions);
}
else{
insideInteractions.reset(new OutOfBlockInteraction[nbInteractions]);
memcpy(insideInteractions.get(), outsideInteractions->data(), nbInteractions*sizeof(OutOfBlockInteraction));
FQuickSort<OutOfBlockInteraction>::QsSequential(insideInteractions.get(), nbInteractions,
[](const OutOfBlockInteraction& inter1, const OutOfBlockInteraction& inter2){
// Could be insideIndex since the block are in morton order
return inter1.insideIdxInBlock <= inter2.insideIdxInBlock;
});
interactions = insideInteractions.get();
nbSafeInteractions = GetClusterOfInteractionsInside(safeInteractions.get(), insideInteractions.get(), nbInteractions);
}
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
FCuda__transferInoutPassCallback< SymboleCellClass, PoleCellClass, LocalCellClass,
CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>(
......@@ -179,7 +215,9 @@ public:
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[2]),
STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]),
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[3]),
idxLevel, mode, outsideInteractions->data(), int(outsideInteractions->size()), kernel,
idxLevel, mode, interactions, nbInteractions,
safeInteractions.get(), nbSafeInteractions,
kernel,
starpu_cuda_get_local_stream(),
FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel));
}
......@@ -194,7 +232,7 @@ public:
int intervalSize = 0;
starpu_codelet_unpack_args(cl_arg, &worker, &nbSubCellGroups, &idxLevel, &intervalSize);
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CUDA_IDX)->kernels[starpu_worker_get_id()];
FCuda__downardPassCallback< SymboleCellClass, PoleCellClass, LocalCellClass,
CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>(
......@@ -220,7 +258,7 @@ public:
starpu_codelet_unpack_args(cl_arg, &worker, &outsideInteractions, &intervalSize);
const int nbInteractions = int(outsideInteractions->size());
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CUDA_IDX)->kernels[starpu_worker_get_id()];
std::unique_ptr<int[]> safeOuterInteractions(new int[nbInteractions+1]);
const int counterOuterCell = GetClusterOfInteractionsOutside(safeOuterInteractions.get(), outsideInteractions->data(), nbInteractions);
......@@ -234,7 +272,7 @@ public:
STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]),
outsideInteractions->data(), nbInteractions,
safeOuterInteractions.get(), counterOuterCell,
worker->get<ThisClass>(FSTARPU_CPU_IDX)->treeHeight ,kernel, starpu_cuda_get_local_stream(),
worker->get<ThisClass>(FSTARPU_CUDA_IDX)->treeHeight ,kernel, starpu_cuda_get_local_stream(),
FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel));
}
#endif
......@@ -246,14 +284,14 @@ public:
FStarPUPtrInterface* worker = nullptr;
int intervalSize = 0;
starpu_codelet_unpack_args(cl_arg, &worker, &intervalSize);
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CUDA_IDX)->kernels[starpu_worker_get_id()];
FCuda__directInPassCallback< SymboleCellClass, PoleCellClass, LocalCellClass,
CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>(
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
worker->get<ThisClass>(FSTARPU_CPU_IDX)->treeHeight, kernel, starpu_cuda_get_local_stream(),
worker->get<ThisClass>(FSTARPU_CUDA_IDX)->treeHeight, kernel, starpu_cuda_get_local_stream(),
FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel));
}
......@@ -310,7 +348,7 @@ public:
starpu_codelet_unpack_args(cl_arg, &worker, &outsideInteractions, &intervalSize);
const int nbInteractions = int(outsideInteractions->size());
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CUDA_IDX)->kernels[starpu_worker_get_id()];
// outsideInteractions is sorted following the outIndex
// Compute the cell interval
......@@ -341,7 +379,7 @@ public:
safeOuterInteractions.get(), counterOuterCell,
insideInteractions.get(),
safeInnterInteractions.get(), counterInnerCell,
worker->get<ThisClass>(FSTARPU_CPU_IDX)->treeHeight,
worker->get<ThisClass>(FSTARPU_CUDA_IDX)->treeHeight,
kernel, starpu_cuda_get_local_stream(),
FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel));
}
......@@ -356,7 +394,7 @@ public:
int intervalSize;
starpu_codelet_unpack_args(cl_arg, &worker, &intervalSize);
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CUDA_IDX)->kernels[starpu_worker_get_id()];
FCuda__mergePassCallback< SymboleCellClass, PoleCellClass, LocalCellClass,
CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>(
......
......@@ -24,6 +24,7 @@ class FStarPUFmmPriorities{
int insertionPositionM2L;
int insertionPositionM2LExtern;
int insertionPositionM2LLastLevel;
int insertionPositionM2LExternLastLevel;
int insertionPositionL2L;
int insertionPositionL2P;
int insertionPositionP2P;
......@@ -90,13 +91,16 @@ public:
insertionPositionM2LLastLevel = incPrio++;
FLOG( FLog::Controller << "\t M2L last " << insertionPositionM2LLastLevel << "\n" );
insertionPositionM2LExternLastLevel = incPrio++;
FLOG( FLog::Controller << "\t M2L extern last " << insertionPositionM2LExternLastLevel << "\n" );
insertionPositionL2P = incPrio++;
FLOG( FLog::Controller << "\t L2P " << insertionPositionL2P << "\n" );
insertionPositionP2PExtern = incPrio++;
FLOG( FLog::Controller << "\t P2P Outer " << insertionPositionP2PExtern << "\n" );
assert(incPrio == 8 + (treeHeight-3) + (treeHeight-3) + (treeHeight-3));
assert(incPrio == 9 + (treeHeight-3) + (treeHeight-3) + (treeHeight-3));
}
else{
int incPrio = 0;
......@@ -110,6 +114,7 @@ public:
insertionPositionM2L = -1;
insertionPositionM2LExtern = -1;
insertionPositionM2LLastLevel = -1;
insertionPositionM2LExternLastLevel = -1;
insertionPositionL2L = -1;
......@@ -158,7 +163,8 @@ public:
FLOG( FLog::Controller << "\t CPU prio M2L " << cpuCountPrio << " bucket " << prioM2LAtLevel << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = prioM2LAtLevel;
heteroprio->buckets[prioM2LAtLevel].valide_archs |= STARPU_CPU;
}
if(capacities->supportM2LExtern(FSTARPU_CPU_IDX)){
const int prioM2LAtLevelExtern = getInsertionPosM2LExtern(idxLevel);
FLOG( FLog::Controller << "\t CPU prio M2L extern " << cpuCountPrio << " bucket " << prioM2LAtLevelExtern << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = prioM2LAtLevelExtern;
......@@ -186,6 +192,12 @@ public:
heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = prioM2LAtLevel;
heteroprio->buckets[prioM2LAtLevel].valide_archs |= STARPU_CPU;
}
if( !workOnlyOnLeaves && capacities->supportM2LExtern(FSTARPU_CPU_IDX)){
const int prioM2LAtLevel = getInsertionPosM2LExtern(treeHeight-1);
FLOG( FLog::Controller << "\t CPU prio M2L " << cpuCountPrio << " bucket " << prioM2LAtLevel << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = prioM2LAtLevel;
heteroprio->buckets[prioM2LAtLevel].valide_archs |= STARPU_CPU;
}
if( !workOnlyOnLeaves && capacities->supportL2P(FSTARPU_CPU_IDX)){
FLOG( FLog::Controller << "\t CPU prio L2P " << cpuCountPrio << " bucket " << insertionPositionL2P << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = insertionPositionL2P;
......@@ -291,13 +303,28 @@ public:
if(!workOnlyOnLeaves && capacities->supportM2L(FSTARPU_CUDA_IDX)){
for(int idxLevel = 2 ; idxLevel < treeHeight ; ++idxLevel){
const int prioM2LAtLevel = getInsertionPosM2L(idxLevel);
FLOG( FLog::Controller << "\t CUDA prio M2L ex " << cudaCountPrio << " bucket " << prioM2LAtLevel << "\n" );
FLOG( FLog::Controller << "\t CUDA prio M2L " << cudaCountPrio << " bucket " << prioM2LAtLevel << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CUDA_IDX][cudaCountPrio++] = prioM2LAtLevel;
heteroprio->buckets[prioM2LAtLevel].valide_archs |= STARPU_CUDA;
heteroprio->buckets