Une nouvelle version du portail de gestion des comptes externes sera mise en production lundi 09 août. Elle permettra d'allonger la validité d'un compte externe jusqu'à 3 ans. Pour plus de détails sur cette version consulter : https://doc-si.inria.fr/x/FCeS

Commit b844af75 authored by BRAMAS Berenger's avatar BRAMAS Berenger
Browse files

make the P2P cuda working for several thread group

parent f0494086
This diff is collapsed.
......@@ -63,7 +63,8 @@ void FCuda__directInoutPassCallbackMpi(
unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
unsigned char* externalContainersPtr, std::size_t externalContainersSize,
const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions, const int treeHeight, CudaKernelClass* kernel, cudaStream_t currentStream,
int nbOutsideInteractions, const int safeOuterInteractions[], const int counterOuterCell,
const int treeHeight, CudaKernelClass* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
#endif
template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
......@@ -78,8 +79,11 @@ template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
void FCuda__directInoutPassCallback(
unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
unsigned char* externalContainersPtr, std::size_t externalContainersSize, unsigned char* externalContainersDownPtr,
const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions, const int treeHeight, CudaKernelClass* kernel, cudaStream_t currentStream,
const OutOfBlockInteraction* outsideInteractions, int nbOutsideInteractions,
const int safeOuterInteractions[], const int counterOuterCell,
const OutOfBlockInteraction* insideInteractions,
const int safeInnterInteractions[], const int counterInnerCell,
const int treeHeight, CudaKernelClass* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
......
......@@ -52,7 +52,7 @@ public:
typedef FCudaGroupAttachedLeaf<FReal,1,4,FReal> ContainerClass;
typedef FCudaCompositeCell<FCudaEmptyCellSymb,int,int> CellClass;
static const int SHARE_SIZE = 1;//128;
static const int THREAD_GROUP_SIZE = 1;//128;
__device__ void P2M(CellClass /*pole*/, const ContainerClass* const /*particles*/) {
}
......@@ -90,13 +90,13 @@ public:
targetZ = (threadCompute? targets->getPositions()[2][idxPart] : 0);
targetPhys = (threadCompute? targets->getAttribute(0)[idxPart] : 0);
for(int idxCopy = 0 ; idxCopy < targets->getNbParticles() ; idxCopy += SHARE_SIZE){
__shared__ FReal sourcesX[SHARE_SIZE];
__shared__ FReal sourcesY[SHARE_SIZE];
__shared__ FReal sourcesZ[SHARE_SIZE];
__shared__ FReal sourcesPhys[SHARE_SIZE];
for(int idxCopy = 0 ; idxCopy < targets->getNbParticles() ; idxCopy += THREAD_GROUP_SIZE){
__shared__ FReal sourcesX[THREAD_GROUP_SIZE];
__shared__ FReal sourcesY[THREAD_GROUP_SIZE];
__shared__ FReal sourcesZ[THREAD_GROUP_SIZE];
__shared__ FReal sourcesPhys[THREAD_GROUP_SIZE];
const int nbCopies = Min(SHARE_SIZE, targets->getNbParticles()-idxCopy);
const int nbCopies = Min(THREAD_GROUP_SIZE, targets->getNbParticles()-idxCopy);
if(threadIdx.x < nbCopies){
sourcesX[threadIdx.x] = targets->getPositions()[0][threadIdx.x+idxCopy];
sourcesY[threadIdx.x] = targets->getPositions()[1][threadIdx.x+idxCopy];
......@@ -187,13 +187,13 @@ public:
targetZ = (threadCompute? targets->getPositions()[2][idxPart] : 0);
targetPhys = (threadCompute? targets->getAttribute(0)[idxPart] : 0);
for(int idxCopy = 0 ; idxCopy < directNeighborsParticles[idxNeigh].getNbParticles() ; idxCopy += SHARE_SIZE){
__shared__ FReal sourcesX[SHARE_SIZE];
__shared__ FReal sourcesY[SHARE_SIZE];
__shared__ FReal sourcesZ[SHARE_SIZE];
__shared__ FReal sourcesPhys[SHARE_SIZE];
for(int idxCopy = 0 ; idxCopy < directNeighborsParticles[idxNeigh].getNbParticles() ; idxCopy += THREAD_GROUP_SIZE){
__shared__ FReal sourcesX[THREAD_GROUP_SIZE];
__shared__ FReal sourcesY[THREAD_GROUP_SIZE];
__shared__ FReal sourcesZ[THREAD_GROUP_SIZE];
__shared__ FReal sourcesPhys[THREAD_GROUP_SIZE];
const int nbCopies = Min(SHARE_SIZE, directNeighborsParticles[idxNeigh].getNbParticles()-idxCopy);
const int nbCopies = Min(THREAD_GROUP_SIZE, directNeighborsParticles[idxNeigh].getNbParticles()-idxCopy);
if(threadIdx.x < nbCopies){
sourcesX[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[0][threadIdx.x+idxCopy];
sourcesY[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[1][threadIdx.x+idxCopy];
......@@ -259,13 +259,13 @@ public:
targetZ = (threadCompute? targets->getPositions()[2][idxPart] : 0);
targetPhys = (threadCompute? targets->getAttribute(0)[idxPart] : 0);
for(int idxCopy = 0 ; idxCopy < directNeighborsParticles[idxNeigh].getNbParticles() ; idxCopy += SHARE_SIZE){
__shared__ FReal sourcesX[SHARE_SIZE];
__shared__ FReal sourcesY[SHARE_SIZE];
__shared__ FReal sourcesZ[SHARE_SIZE];
__shared__ FReal sourcesPhys[SHARE_SIZE];
for(int idxCopy = 0 ; idxCopy < directNeighborsParticles[idxNeigh].getNbParticles() ; idxCopy += THREAD_GROUP_SIZE){
__shared__ FReal sourcesX[THREAD_GROUP_SIZE];
__shared__ FReal sourcesY[THREAD_GROUP_SIZE];
__shared__ FReal sourcesZ[THREAD_GROUP_SIZE];
__shared__ FReal sourcesPhys[THREAD_GROUP_SIZE];
const int nbCopies = Min(SHARE_SIZE, directNeighborsParticles[idxNeigh].getNbParticles()-idxCopy);
const int nbCopies = Min(THREAD_GROUP_SIZE, directNeighborsParticles[idxNeigh].getNbParticles()-idxCopy);
if(threadIdx.x < nbCopies){
sourcesX[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[0][threadIdx.x+idxCopy];
sourcesY[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[1][threadIdx.x+idxCopy];
......@@ -321,12 +321,12 @@ public:
// nothing to do
}
__host__ static dim3 GetGridSize(const int intervalSize){
return 1; //intervalSize;
__host__ static dim3 GetGridSize(const int /*intervalSize*/){
return 24;
}
__host__ static dim3 GetBlocksSize(){
return SHARE_SIZE;
return THREAD_GROUP_SIZE;
}
};
......
......@@ -218,9 +218,13 @@ public:
const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr;
int intervalSize = 0;
starpu_codelet_unpack_args(cl_arg, &worker, &outsideInteractions, &intervalSize);
const int nbInteractions = int(outsideInteractions->size());
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
std::unique_ptr<int[]> safeOuterInteractions(new int[nbInteractions+1]);
const int counterOuterCell = GetClusterOfInteractionsOutside(safeOuterInteractions.get(), outsideInteractions->data(), nbInteractions);
FCuda__directInoutPassCallbackMpi< SymboleCellClass, PoleCellClass, LocalCellClass,
CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>(
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
......@@ -228,7 +232,8 @@ public:
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[2]),
STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]),
outsideInteractions->data(), outsideInteractions->size(),
outsideInteractions->data(), nbInteractions,
safeOuterInteractions.get(), counterOuterCell,
worker->get<ThisClass>(FSTARPU_CPU_IDX)->treeHeight ,kernel, starpu_cuda_get_local_stream(),
FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel));
}
......@@ -252,14 +257,76 @@ public:
FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel));
}
static int GetClusterOfInteractionsInside(int safeOuterInteractions[],
const OutOfBlockInteraction outsideInteractions[],
const int nbInteractions){
safeOuterInteractions[0] = 0;
safeOuterInteractions[1] = 0;
int counterInnerCell = 1;
for(int idxInter = 0 ; idxInter < int(nbInteractions) ; ++idxInter){
if(outsideInteractions[safeOuterInteractions[counterInnerCell]].insideIdxInBlock
!= outsideInteractions[idxInter].insideIdxInBlock){
FAssertLF(outsideInteractions[safeOuterInteractions[counterInnerCell]].insideIdxInBlock
< outsideInteractions[idxInter].insideIdxInBlock);
counterInnerCell += 1;
safeOuterInteractions[counterInnerCell] = safeOuterInteractions[counterInnerCell-1];
}
else{
safeOuterInteractions[counterInnerCell] += 1;
}
}
FAssertLF(safeOuterInteractions[counterInnerCell] == nbInteractions);
return counterInnerCell;
}
static int GetClusterOfInteractionsOutside(int safeOuterInteractions[],
const OutOfBlockInteraction outsideInteractions[],
const int nbInteractions){
safeOuterInteractions[0] = 0;
safeOuterInteractions[1] = 0;
int counterInnerCell = 1;
for(int idxInter = 0 ; idxInter < int(nbInteractions) ; ++idxInter){
if(outsideInteractions[safeOuterInteractions[counterInnerCell]].outsideIdxInBlock
!= outsideInteractions[idxInter].outsideIdxInBlock){
FAssertLF(outsideInteractions[safeOuterInteractions[counterInnerCell]].outsideIdxInBlock
< outsideInteractions[idxInter].outsideIdxInBlock);
counterInnerCell += 1;
safeOuterInteractions[counterInnerCell] = safeOuterInteractions[counterInnerCell-1];
}
else{
safeOuterInteractions[counterInnerCell] += 1;
}
}
FAssertLF(safeOuterInteractions[counterInnerCell] == nbInteractions);
return counterInnerCell;
}
static void directInoutPassCallback(void *buffers[], void *cl_arg){
FStarPUPtrInterface* worker = nullptr;
const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr;
int intervalSize = 0;
starpu_codelet_unpack_args(cl_arg, &worker, &outsideInteractions, &intervalSize);
const int nbInteractions = int(outsideInteractions->size());
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
// outsideInteractions is sorted following the outIndex
// Compute the cell interval
std::unique_ptr<int[]> safeOuterInteractions(new int[nbInteractions+1]);
const int counterOuterCell = GetClusterOfInteractionsOutside(safeOuterInteractions.get(), outsideInteractions->data(), nbInteractions);
std::unique_ptr<OutOfBlockInteraction[]> insideInteractions(new OutOfBlockInteraction[nbInteractions]);
memcpy(insideInteractions.get(), outsideInteractions->data(), nbInteractions*sizeof(OutOfBlockInteraction));
FQuickSort<OutOfBlockInteraction>::QsSequential(insideInteractions.get(), nbInteractions,
[](const OutOfBlockInteraction& inter1, const OutOfBlockInteraction& inter2){
// Could be insideIndex since the block are in morton order
return inter1.insideIdxInBlock <= inter2.insideIdxInBlock;
});
std::unique_ptr<int[]> safeInnterInteractions(new int[nbInteractions+1]);
const int counterInnerCell = GetClusterOfInteractionsInside(safeInnterInteractions.get(), insideInteractions.get(), nbInteractions);
FCuda__directInoutPassCallback< SymboleCellClass, PoleCellClass, LocalCellClass,
CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>(
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
......@@ -268,7 +335,11 @@ public:
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[2]),
STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]),
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[3]),
outsideInteractions->data(), int(outsideInteractions->size()), worker->get<ThisClass>(FSTARPU_CPU_IDX)->treeHeight,
outsideInteractions->data(), nbInteractions,
safeOuterInteractions.get(), counterOuterCell,
insideInteractions.get(),
safeInnterInteractions.get(), counterInnerCell,
worker->get<ThisClass>(FSTARPU_CPU_IDX)->treeHeight,
kernel, starpu_cuda_get_local_stream(),
FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel));
}
......
......@@ -39,6 +39,7 @@
#include "../../Src/GroupTree/Cuda/FCudaGroupOfParticles.hpp"
#include "../../Src/GroupTree/Cuda/FCudaGroupOfCells.hpp"
#include "../../Src/GroupTree/StarPUUtils/FStarPUKernelCapacities.hpp"
#include "../../Src/Utils/FParameterNames.hpp"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment