Commit 73cb0ace authored by BRAMAS Berenger's avatar BRAMAS Berenger

update cuda part and sync data with devices (cuda and opencl) it looks like it...

update cuda part and sync data with devices (cuda and opencl) it looks like it works for the test kernel
parent 98cea999
......@@ -68,8 +68,6 @@ protected:
BlockHeader* blockHeader;
//< Pointer to leaves information
LeafHeader* leafHeader;
//< The total number of particles in the group
const FSize nbParticlesInGroup;
//< Pointers to particle position x, y, z
FReal* particlePosition[3];
......@@ -92,7 +90,7 @@ public:
FGroupOfParticles(unsigned char* inBuffer, const size_t inAllocatedMemoryInByte,
unsigned char* inAttributes)
: allocatedMemoryInByte(inAllocatedMemoryInByte), memoryBuffer(inBuffer),
blockHeader(nullptr), leafHeader(nullptr), nbParticlesInGroup(0),
blockHeader(nullptr), leafHeader(nullptr),
attributesBuffer(nullptr), deleteBuffer(false){
// Move the pointers to the correct position
blockHeader = reinterpret_cast<BlockHeader*>(inBuffer);
......@@ -127,12 +125,12 @@ public:
* @param inNumberOfLeaves total number of leaves in the interval (should be <= inEndingIndex-inEndingIndex)
*/
FGroupOfParticles(const MortonIndex inStartingIndex, const MortonIndex inEndingIndex, const int inNumberOfLeaves, const FSize inNbParticles)
: allocatedMemoryInByte(0), memoryBuffer(nullptr), blockHeader(nullptr), leafHeader(nullptr), nbParticlesInGroup(inNbParticles),
: allocatedMemoryInByte(0), memoryBuffer(nullptr), blockHeader(nullptr), leafHeader(nullptr),
deleteBuffer(true){
memset(particlePosition, 0, sizeof(particlePosition));
memset(particleAttributes, 0, sizeof(particleAttributes));
const FSize nbParticlesAllocatedInGroup = RoundToUpperParticles(nbParticlesInGroup+(MemoryAlignementParticles-1)*inNumberOfLeaves);
const FSize nbParticlesAllocatedInGroup = RoundToUpperParticles(inNbParticles+(MemoryAlignementParticles-1)*inNumberOfLeaves);
// Find the number of leaf to allocate in the blocks
FAssertLF((inEndingIndex-inStartingIndex) >= MortonIndex(inNumberOfLeaves));
......@@ -161,6 +159,7 @@ public:
blockHeader->endingIndex = inEndingIndex;
blockHeader->numberOfLeavesInBlock = inNumberOfLeaves;
blockHeader->nbParticlesAllocatedInGroup = nbParticlesAllocatedInGroup;
blockHeader->nbParticlesInGroup = inNbParticles;
// Init particle pointers
blockHeader->positionsLeadingDim = (sizeof(FReal) * nbParticlesAllocatedInGroup);
......@@ -247,7 +246,7 @@ public:
/** Get the total number of particles in the group */
FSize getNbParticlesInGroup() const {
return nbParticlesInGroup;
return blockHeader->nbParticlesInGroup;
}
/** The size of the interval endingIndex-startingIndex (set from the constructor) */
......
......@@ -250,6 +250,27 @@ public:
#endif
}
void syncData(){
for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){
for(int idxHandle = 0 ; idxHandle < int(cellHandles[idxLevel].size()) ; ++idxHandle){
starpu_data_acquire(cellHandles[idxLevel][idxHandle].symb, STARPU_R);
starpu_data_release(cellHandles[idxLevel][idxHandle].symb);
starpu_data_acquire(cellHandles[idxLevel][idxHandle].up, STARPU_R);
starpu_data_release(cellHandles[idxLevel][idxHandle].up);
starpu_data_acquire(cellHandles[idxLevel][idxHandle].down, STARPU_R);
starpu_data_release(cellHandles[idxLevel][idxHandle].down);
}
}
{
for(int idxHandle = 0 ; idxHandle < int(particleHandles.size()) ; ++idxHandle){
starpu_data_acquire(particleHandles[idxHandle].symb, STARPU_R);
starpu_data_release(particleHandles[idxHandle].symb);
starpu_data_acquire(particleHandles[idxHandle].down, STARPU_R);
starpu_data_release(particleHandles[idxHandle].down);
}
}
}
~FGroupTaskStarPUAlgorithm(){
starpu_resume();
......@@ -338,6 +359,11 @@ protected:
FLOG( FLog::Controller << "\t\t Submitting the tasks took " << timerSoumission.tacAndElapsed() << "s\n" );
starpu_task_wait_for_all();
FLOG( FTic timerSync; );
syncData();
FLOG( FLog::Controller << "\t\t Moving data to the host took " << timerSync.tacAndElapsed() << "s\n" );
starpu_pause();
#ifdef STARPU_USE_CPU
......
......@@ -265,6 +265,27 @@ public:
#endif
}
void syncData(){
for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){
for(int idxHandle = 0 ; idxHandle < int(cellHandles[idxLevel].size()) ; ++idxHandle){
starpu_data_acquire(cellHandles[idxLevel][idxHandle].symb, STARPU_R);
starpu_data_release(cellHandles[idxLevel][idxHandle].symb);
starpu_data_acquire(cellHandles[idxLevel][idxHandle].up, STARPU_R);
starpu_data_release(cellHandles[idxLevel][idxHandle].up);
starpu_data_acquire(cellHandles[idxLevel][idxHandle].down, STARPU_R);
starpu_data_release(cellHandles[idxLevel][idxHandle].down);
}
}
{
for(int idxHandle = 0 ; idxHandle < int(particleHandles.size()) ; ++idxHandle){
starpu_data_acquire(particleHandles[idxHandle].symb, STARPU_R);
starpu_data_release(particleHandles[idxHandle].symb);
starpu_data_acquire(particleHandles[idxHandle].down, STARPU_R);
starpu_data_release(particleHandles[idxHandle].down);
}
}
}
~FGroupTaskStarPUMpiAlgorithm(){
starpu_resume();
......@@ -324,6 +345,7 @@ public:
}
protected:
/**
* Runs the complete algorithm.
*/
......@@ -362,6 +384,11 @@ protected:
#endif
starpu_task_wait_for_all();
FLOG( FTic timerSync; );
syncData();
FLOG( FLog::Controller << "\t\t Moving data to the host took " << timerSync.tacAndElapsed() << "s\n" );
starpu_pause();
#ifdef STARPU_USE_CPU
......
This diff is collapsed.
......@@ -17,9 +17,8 @@ template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
void FCuda__upwardPassCallback(
unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize,
FCudaParams<unsigned char*,9> subCellGroupsUpPtr,
int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream,
unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr,
int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
#ifdef SCALFMM_USE_MPI
template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
......@@ -43,10 +42,10 @@ template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
void FCuda__transferInoutPassCallback(
unsigned char* currentCellsPtr, std::size_t currentCellsSize,
unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
unsigned char* currentCellsUpPtr,
unsigned char* externalCellsPtr, std::size_t externalCellsSize,
unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr,
int idxLevel, const OutOfBlockInteraction* outsideInteractions,
unsigned char* externalCellsDownPtr,
int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions, CudaKernelClass* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
......@@ -54,9 +53,8 @@ template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
void FCuda__downardPassCallback(
unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize,
FCudaParams<unsigned char*,9> subCellGroupsDownPtr,
int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream,
unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr,
int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
#ifdef SCALFMM_USE_MPI
template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
......
......@@ -59,8 +59,6 @@ protected:
BlockHeader* blockHeader;
//< Pointer to leaves information
LeafHeader* leafHeader;
//< The total number of particles in the group
const FSize nbParticlesInGroup;
//< Pointers to particle position x, y, z
FReal* particlePosition[3];
......@@ -78,11 +76,12 @@ public:
__device__ FCudaGroupOfParticles(unsigned char* inBuffer, const size_t inAllocatedMemoryInByte,
unsigned char* inAttributes)
: allocatedMemoryInByte(inAllocatedMemoryInByte), memoryBuffer(inBuffer),
blockHeader(nullptr), leafHeader(nullptr), nbParticlesInGroup(0),
blockHeader(nullptr), leafHeader(nullptr),
attributesBuffer(nullptr){
// Move the pointers to the correct position
blockHeader = reinterpret_cast<BlockHeader*>(memoryBuffer);
leafHeader = reinterpret_cast<LeafHeader*>(memoryBuffer+sizeof(BlockHeader)+(blockHeader->numberOfLeavesInBlock*sizeof(int)));
blockHeader = reinterpret_cast<BlockHeader*>(inBuffer);
inBuffer += sizeof(BlockHeader);
leafHeader = reinterpret_cast<LeafHeader*>(inBuffer);
// Init particle pointers
// Assert blockHeader->positionsLeadingDim == (sizeof(FReal) * blockHeader->nbParticlesAllocatedInGroup);
......@@ -122,7 +121,7 @@ public:
/** Get the total number of particles in the group */
__device__ FSize getNbParticlesInGroup() const {
return nbParticlesInGroup;
return blockHeader->nbParticlesInGroup;
}
/** The size of the interval endingIndex-startingIndex (set from the constructor) */
......
......@@ -96,21 +96,9 @@ public:
FStarPUPtrInterface* worker = nullptr;
int nbSubCellGroups = 0;
int idxLevel = 0;
int intervalSize;
int intervalSize = 0;
starpu_codelet_unpack_args(cl_arg, &worker, &nbSubCellGroups, &idxLevel, &intervalSize);
FCudaParams<unsigned char*,9> subCellGroupsPtr;
memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsPtr));
FCudaParams<std::size_t,9> subCellGroupsSize;
memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsSize));
FCudaParams<unsigned char*,9> subCellGroupsUpPtr;
memset(&subCellGroupsUpPtr, 0, sizeof(subCellGroupsUpPtr));
for(int idxSubGroup = 0; idxSubGroup < nbSubCellGroups ; ++idxSubGroup){
subCellGroupsPtr.values[idxSubGroup] = ((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[(idxSubGroup*2)+2]));
subCellGroupsSize.values[idxSubGroup] = STARPU_VARIABLE_GET_ELEMSIZE(buffers[(idxSubGroup*2)+2]);
subCellGroupsUpPtr.values[idxSubGroup] = (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[(idxSubGroup*2)+3]);
}
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
FCuda__upwardPassCallback< SymboleCellClass, PoleCellClass, LocalCellClass,
......@@ -118,8 +106,10 @@ public:
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
subCellGroupsPtr,subCellGroupsSize,subCellGroupsUpPtr,
nbSubCellGroups, idxLevel, kernel, starpu_cuda_get_local_stream(),
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[2]),
STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]),
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[3]),
idxLevel, kernel, starpu_cuda_get_local_stream(),
FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel));
}
......@@ -130,8 +120,8 @@ public:
static void transferInoutPassCallbackMpi(void *buffers[], void *cl_arg){
FStarPUPtrInterface* worker = nullptr;
int idxLevel = 0;
const std::vector<OutOfBlockInteraction>* outsideInteractions;
int intervalSize;
const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr;
int intervalSize = 0;
starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions, &intervalSize);
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
......@@ -156,7 +146,7 @@ public:
static void transferInPassCallback(void *buffers[], void *cl_arg){
FStarPUPtrInterface* worker = nullptr;
int idxLevel = 0;
int intervalSize;
int intervalSize = 0;
starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &intervalSize);
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
......@@ -174,9 +164,10 @@ public:
static void transferInoutPassCallback(void *buffers[], void *cl_arg){
FStarPUPtrInterface* worker = nullptr;
int idxLevel = 0;
const std::vector<OutOfBlockInteraction>* outsideInteractions;
int intervalSize;
starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions, &intervalSize);
const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr;
int intervalSize = 0;
int mode = 0;
starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions, &intervalSize, &mode);
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
......@@ -186,11 +177,9 @@ public:
STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[2]),
STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]),
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[3]),
STARPU_VARIABLE_GET_ELEMSIZE(buffers[3]),
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[4]),
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[5]),
idxLevel, outsideInteractions->data(), int(outsideInteractions->size()), kernel,
idxLevel, mode, outsideInteractions->data(), int(outsideInteractions->size()), kernel,
starpu_cuda_get_local_stream(),
FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel));
}
......@@ -202,21 +191,9 @@ public:
FStarPUPtrInterface* worker = nullptr;
int nbSubCellGroups = 0;
int idxLevel = 0;
int intervalSize;
int intervalSize = 0;
starpu_codelet_unpack_args(cl_arg, &worker, &nbSubCellGroups, &idxLevel, &intervalSize);
FCudaParams<unsigned char*,9> subCellGroupsPtr;
memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsPtr));
FCudaParams<std::size_t,9> subCellGroupsSize;
memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsSize));
FCudaParams<unsigned char*,9> subCellGroupsDownPtr;
memset(&subCellGroupsDownPtr, 0, sizeof(subCellGroupsDownPtr));
for(int idxSubGroup = 0; idxSubGroup < nbSubCellGroups ; ++idxSubGroup){
subCellGroupsPtr.values[idxSubGroup] = ((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[(idxSubGroup*2)+2]));
subCellGroupsSize.values[idxSubGroup] = (STARPU_VARIABLE_GET_ELEMSIZE(buffers[(idxSubGroup*2)+2]));
subCellGroupsDownPtr.values[idxSubGroup] = ((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[(idxSubGroup*2)+3]));
}
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
FCuda__downardPassCallback< SymboleCellClass, PoleCellClass, LocalCellClass,
......@@ -224,8 +201,10 @@ public:
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
subCellGroupsPtr,subCellGroupsSize,subCellGroupsDownPtr,
nbSubCellGroups, idxLevel, kernel, starpu_cuda_get_local_stream(),
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[2]),
STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]),
(unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[3]),
idxLevel, kernel, starpu_cuda_get_local_stream(),
FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel));
}
/////////////////////////////////////////////////////////////////////////////////////
......@@ -237,7 +216,7 @@ public:
FStarPUPtrInterface* worker = nullptr;
const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr;
int intervalSize;
int intervalSize = 0;
starpu_codelet_unpack_args(cl_arg, &worker, &outsideInteractions, &intervalSize);
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
......@@ -260,7 +239,7 @@ public:
static void directInPassCallback(void *buffers[], void *cl_arg){
FStarPUPtrInterface* worker = nullptr;
int intervalSize;
int intervalSize = 0;
starpu_codelet_unpack_args(cl_arg, &worker, &intervalSize);
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
......@@ -276,7 +255,7 @@ public:
static void directInoutPassCallback(void *buffers[], void *cl_arg){
FStarPUPtrInterface* worker = nullptr;
const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr;
int intervalSize;
int intervalSize = 0;
starpu_codelet_unpack_args(cl_arg, &worker, &outsideInteractions, &intervalSize);
CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
......
......@@ -18,6 +18,8 @@ public:
// the pole represents all particles under
if(threadIdx.x == 0){
*pole.up += particles->getNbParticles();
printf("*pole.up %d += particles->getNbParticles(); %d \n",
*pole.up , particles->getNbParticles());
}
}
......
......@@ -80,7 +80,7 @@ int main(int argc, char* argv[]){
typedef FGroupTestParticleContainer<FReal> GroupContainerClass;
typedef FGroupTree< FReal, GroupCellClass, GroupCellSymbClass, GroupCellUpClass, GroupCellDownClass,
GroupContainerClass, 0, 1, long long int> GroupOctreeClass;
typedef FStarPUAllCpuCudaCapacities<FTestKernels< GroupCellClass, GroupContainerClass >> GroupKernelClass;
typedef FStarPUAllCudaCapacities<FTestKernels< GroupCellClass, GroupContainerClass >> GroupKernelClass;
typedef FStarPUCpuWrapper<typename GroupOctreeClass::CellGroupClass, GroupCellClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupContainerClass> GroupCpuWrapper;
typedef FStarPUCudaWrapper<GroupKernelClass, GroupCellSymbClass, GroupCellUpClass, GroupCellDownClass,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment