diff --git a/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu b/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu index 455c176384fa4e4d932d4309e1ff00b8271dfbff..628aee7f23f030d9f1c7587df9a594ee31c9762a 100644 --- a/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu +++ b/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu @@ -2,7 +2,7 @@ #include "FCudaDeviceWrapper.hpp" #include "FCudaTreeCoordinate.hpp" - +#include "FCudaStructParams.hpp" static const int nbCudaThreads = 32; static const int nbCudaBlocks = 1; @@ -67,12 +67,12 @@ __host__ void FCuda__bottomPassCallback(unsigned char* leafCellsPtr, std::size_t template <class CellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> __global__ void FCuda__upwardPassPerform(unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* subCellGroupsPtr[9], std::size_t subCellGroupsSize[9], + FCudaParams<unsigned char*, 9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize, CudaKernelClass* kernel, int nbSubCellGroups, int idxLevel){ CellContainerClass currentCells(currentCellsPtr, currentCellsSize); CellContainerClass subCellGroups[9]; for(int idx = 0 ; idx < nbSubCellGroups ; ++idx){ - subCellGroups[idx].reset(subCellGroupsPtr[idx], subCellGroupsSize[idx]); + subCellGroups[idx].reset(subCellGroupsPtr.values[idx], subCellGroupsSize.values[idx]); } FCudaAssertLF(nbSubCellGroups != 0); @@ -107,7 +107,7 @@ __global__ void FCuda__upwardPassPerform(unsigned char* currentCellsPtr, std::si template <class CellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> __host__ void FCuda__upwardPassCallback(unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* subCellGroupsPtr[9], std::size_t subCellGroupsSize[9], + FCudaParams<unsigned char*, 9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize, int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream){ FCuda__upwardPassPerform @@ -290,13 +290,13 @@ __host__ void FCuda__transferInoutPassCallback(unsigned char* currentCellsPtr, s template <class CellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> __global__ void FCuda__downardPassPerform(unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* subCellGroupsPtr[9], std::size_t subCellGroupsSize[9], + FCudaParams<unsigned char*, 9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize, CudaKernelClass* kernel, int nbSubCellGroups, int idxLevel){ FCudaAssertLF(nbSubCellGroups != 0); CellContainerClass currentCells(currentCellsPtr, currentCellsSize); CellContainerClass subCellGroups[9]; for(int idx = 0 ; idx < nbSubCellGroups ; ++idx){ - subCellGroups[idx].reset(subCellGroupsPtr[idx], subCellGroupsSize[idx]); + subCellGroups[idx].reset(subCellGroupsPtr.values[idx], subCellGroupsSize.values[idx]); } const MortonIndex blockStartIdx = FCudaMax(currentCells.getStartingIndex(), @@ -330,7 +330,7 @@ __global__ void FCuda__downardPassPerform(unsigned char* currentCellsPtr, std::s template <class CellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> __host__ void FCuda__downardPassCallback(unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* subCellGroupsPtr[9], std::size_t subCellGroupsSize[9], + FCudaParams<unsigned char*, 9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize, int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream){ FCuda__downardPassPerform @@ -566,7 +566,7 @@ template void FCuda__bottomPassCallback<FCudaEmptyCell, FCudaGroupOfCells<FCudaE template void FCuda__upwardPassCallback<FCudaEmptyCell, FCudaGroupOfCells<FCudaEmptyCell>, FCudaGroupOfParticles<0, int>, FCudaGroupAttachedLeaf<0, int>, FCudaEmptyKernel<FCudaEmptyCell, FCudaGroupAttachedLeaf<0, int>> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* subCellGroupsPtr[9], std::size_t subCellGroupsSize[9], + FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize, int nbSubCellGroups, int idxLevel, FCudaEmptyKernel< FCudaEmptyCell, FCudaGroupAttachedLeaf<0, int>>* kernel, cudaStream_t currentStream); template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCell, FCudaGroupOfCells<FCudaEmptyCell>, FCudaGroupOfParticles<0, int>, FCudaGroupAttachedLeaf<0, int>, FCudaEmptyKernel<FCudaEmptyCell, FCudaGroupAttachedLeaf<0, int>> > @@ -587,7 +587,7 @@ template void FCuda__transferInoutPassCallback<FCudaEmptyCell, FCudaGroupOfCells template void FCuda__downardPassCallback<FCudaEmptyCell, FCudaGroupOfCells<FCudaEmptyCell>, FCudaGroupOfParticles<0, int>, FCudaGroupAttachedLeaf<0, int>, FCudaEmptyKernel<FCudaEmptyCell, FCudaGroupAttachedLeaf<0, int>> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* subCellGroupsPtr[9], std::size_t subCellGroupsSize[9], + FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize, int nbSubCellGroups, int idxLevel, FCudaEmptyKernel< FCudaEmptyCell, FCudaGroupAttachedLeaf<0, int>>* kernel, cudaStream_t currentStream); template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCell, FCudaGroupOfCells<FCudaEmptyCell>, FCudaGroupOfParticles<0, int>, FCudaGroupAttachedLeaf<0, int>, FCudaEmptyKernel<FCudaEmptyCell, FCudaGroupAttachedLeaf<0, int>> > @@ -628,7 +628,7 @@ template void FCuda__bottomPassCallback<FTestCellPODCore, FCudaGroupOfCells<FTes template void FCuda__upwardPassCallback<FTestCellPODCore, FCudaGroupOfCells<FTestCellPODCore>, FCudaGroupOfParticles<2, long long int>, FCudaGroupAttachedLeaf<2, long long int>, FTestCudaKernels<FTestCellPODCore, FCudaGroupAttachedLeaf<2, long long int>> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* subCellGroupsPtr[9], std::size_t subCellGroupsSize[9], + FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize, int nbSubCellGroups, int idxLevel, FTestCudaKernels<FTestCellPODCore, FCudaGroupAttachedLeaf<2, long long int>>* kernel, cudaStream_t currentStream); template void FCuda__transferInoutPassCallbackMpi<FTestCellPODCore, FCudaGroupOfCells<FTestCellPODCore>, FCudaGroupOfParticles<2, long long int>, FCudaGroupAttachedLeaf<2, long long int>, FTestCudaKernels<FTestCellPODCore, FCudaGroupAttachedLeaf<2, long long int>> > @@ -649,7 +649,7 @@ template void FCuda__transferInoutPassCallback<FTestCellPODCore, FCudaGroupOfCel template void FCuda__downardPassCallback<FTestCellPODCore, FCudaGroupOfCells<FTestCellPODCore>, FCudaGroupOfParticles<2, long long int>, FCudaGroupAttachedLeaf<2, long long int>, FTestCudaKernels<FTestCellPODCore, FCudaGroupAttachedLeaf<2, long long int>> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* subCellGroupsPtr[9], std::size_t subCellGroupsSize[9], + FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize, int nbSubCellGroups, int idxLevel, FTestCudaKernels<FTestCellPODCore, FCudaGroupAttachedLeaf<2, long long int>>* kernel, cudaStream_t currentStream); template void FCuda__directInoutPassCallbackMpi<FTestCellPODCore, FCudaGroupOfCells<FTestCellPODCore>, FCudaGroupOfParticles<2, long long int>, FCudaGroupAttachedLeaf<2, long long int>, FTestCudaKernels<FTestCellPODCore, FCudaGroupAttachedLeaf<2, long long int>> > diff --git a/Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp b/Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp index 1dd4a84fc8b2ec2bb6ac0e386f538096e4dae4e0..a56fea94b3e56fa9e8fb601c9e59c4e0988e9e74 100644 --- a/Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp +++ b/Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp @@ -5,6 +5,7 @@ #include "../../Utils/FGlobal.hpp" #include "../FOutOfBlockInteraction.hpp" +#include "FCudaStructParams.hpp" template <class CellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> void FCuda__bottomPassCallback(unsigned char* leafCellsPtr, std::size_t leafCellsSize, @@ -13,7 +14,7 @@ void FCuda__bottomPassCallback(unsigned char* leafCellsPtr, std::size_t leafCell template <class CellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> void FCuda__upwardPassCallback(unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* subCellGroupsPtr[9], std::size_t subCellGroupsSize[9], + FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize, int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream); template <class CellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> @@ -34,7 +35,7 @@ void FCuda__transferInoutPassCallback(unsigned char* currentCellsPtr, std::size_ template <class CellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> void FCuda__downardPassCallback(unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* subCellGroupsPtr[9], std::size_t subCellGroupsSize[9], + FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize, int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream); template <class CellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> diff --git a/Src/GroupTree/Cuda/FCudaStructParams.hpp b/Src/GroupTree/Cuda/FCudaStructParams.hpp new file mode 100644 index 0000000000000000000000000000000000000000..86d78b00f1653cecc6fa06ef9f79d7e16ef5075a --- /dev/null +++ b/Src/GroupTree/Cuda/FCudaStructParams.hpp @@ -0,0 +1,15 @@ +// @SCALFMM_PRIVATE +#ifndef FCUDASTRUCTPARAMS_HPP +#define FCUDASTRUCTPARAMS_HPP + +#include "../FStarPUDefaultAlign.hpp" +#include "FCudaGlobal.hpp" + +template <class ArrayType, const int Size> +struct alignas(FStarPUDefaultAlign::StructAlign) FCudaParams{ + ArrayType values[Size]; +}; + + +#endif // FCUDASTRUCTPARAMS_HPP + diff --git a/Src/GroupTree/FStarPUCudaWrapper.hpp b/Src/GroupTree/FStarPUCudaWrapper.hpp index 03a39523ed84f160835ab5749f0ca619c9cc77f4..a6a954a273e8e9c1810c75352bb327d202aed17c 100644 --- a/Src/GroupTree/FStarPUCudaWrapper.hpp +++ b/Src/GroupTree/FStarPUCudaWrapper.hpp @@ -101,13 +101,13 @@ public: int idxLevel = 0; starpu_codelet_unpack_args(cl_arg, &worker, &nbSubCellGroups, &idxLevel); - unsigned char* subCellGroupsPtr[9] ; - memset(subCellGroupsPtr, 0, 9*sizeof(unsigned char*)); - size_t subCellGroupsSize[9] ; - memset(subCellGroupsPtr, 0, 9*sizeof(unsigned char*)); + FCudaParams<unsigned char*,9> subCellGroupsPtr; + memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsPtr)); + FCudaParams<std::size_t,9> subCellGroupsSize; + memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsSize)); for(int idxSubGroup = 0; idxSubGroup < nbSubCellGroups ; ++idxSubGroup){ - subCellGroupsPtr[idxSubGroup] = ((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[idxSubGroup+1])); - subCellGroupsSize[idxSubGroup] = STARPU_VARIABLE_GET_ELEMSIZE(buffers[idxSubGroup+1]); + subCellGroupsPtr.values[idxSubGroup] = ((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[idxSubGroup+1])); + subCellGroupsSize.values[idxSubGroup] = STARPU_VARIABLE_GET_ELEMSIZE(buffers[idxSubGroup+1]); } CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()]; @@ -199,13 +199,13 @@ public: int idxLevel = 0; starpu_codelet_unpack_args(cl_arg, &worker, &nbSubCellGroups, &idxLevel); - unsigned char* subCellGroupsPtr[9]; - memset(subCellGroupsPtr, 0, 9*sizeof(unsigned char*)); - size_t subCellGroupsSize[9]; - memset(subCellGroupsPtr, 0, 9*sizeof(size_t)); + FCudaParams<unsigned char*,9> subCellGroupsPtr; + memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsPtr)); + FCudaParams<std::size_t,9> subCellGroupsSize; + memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsSize)); for(int idxSubGroup = 0; idxSubGroup < nbSubCellGroups ; ++idxSubGroup){ - subCellGroupsPtr[idxSubGroup] = ((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[idxSubGroup+1])); - subCellGroupsSize[idxSubGroup] = (STARPU_VARIABLE_GET_ELEMSIZE(buffers[idxSubGroup+1])); + subCellGroupsPtr.values[idxSubGroup] = ((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[idxSubGroup+1])); + subCellGroupsSize.values[idxSubGroup] = (STARPU_VARIABLE_GET_ELEMSIZE(buffers[idxSubGroup+1])); } CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()]; diff --git a/Src/GroupTree/FStarPUDefaultAlign.hpp b/Src/GroupTree/FStarPUDefaultAlign.hpp index 077244eef55b87931cdae2d14ac87cda1186f133..5f090c5e7874b0bb7517cf4a5c8da66bf917ad91 100644 --- a/Src/GroupTree/FStarPUDefaultAlign.hpp +++ b/Src/GroupTree/FStarPUDefaultAlign.hpp @@ -1,3 +1,4 @@ +// @SCALFMM_PRIVATE #ifndef FSTARPUDEFAULTALIGN_HPP #define FSTARPUDEFAULTALIGN_HPP diff --git a/Src/GroupTree/FTestCellPOD.hpp b/Src/GroupTree/FTestCellPOD.hpp index ed782c5933d8e029d98891566108bff499cd560d..c253dae551caa54afec12e67dc6de8a91dc4e065 100644 --- a/Src/GroupTree/FTestCellPOD.hpp +++ b/Src/GroupTree/FTestCellPOD.hpp @@ -1,3 +1,4 @@ +// @SCALFMM_PRIVATE #ifndef FTESTCELLPOD_HPP #define FTESTCELLPOD_HPP diff --git a/Src/Utils/FGlobal.hpp b/Src/Utils/FGlobal.hpp index 2891480e1b4c5df0e3c14e6c86a7bca1c47080ec..7a44c1bff7ee23a66c2493583aac8a6208ba052f 100755 --- a/Src/Utils/FGlobal.hpp +++ b/Src/Utils/FGlobal.hpp @@ -51,7 +51,7 @@ typedef float FReal; #endif -typedef long long FSize; +typedef long long int FSize; /////////////////////////////////////////////////////// // Restrict