Commit f8a5c49a authored by PIACIBELLO Cyrille's avatar PIACIBELLO Cyrille
Browse files
parents ad26bd45 f7829bfb
......@@ -276,6 +276,8 @@ endif()
##################################################################
##################################################################
#
MESSAGE( STATUS "ScalFMM_USE_STARPU = ${ScalFMM_USE_STARPU}" )
if( ScalFMM_USE_STARPU )
SET(STARPU_LIBRARIES " -L$ENV{STARPU_LIB}; -lstarpu-$ENV{STARPU_VERSION}") # CACHE STRING "Set your STARPU flags"
......@@ -289,8 +291,29 @@ if( ScalFMM_USE_STARPU )
MESSAGE(STATUS " STARPU_LIBRARIES = ${STARPU_LIBRARIES}")
MESSAGE(STATUS " STARPU_INCLUDES = ${STARPU_INCLUDES}")
OPTION( ScalFMM_USE_CUDA "Set to ON to use CUDA with StarPU" OFF )
MESSAGE( STATUS "ScalFMM_USE_CUDA = ${ScalFMM_USE_CUDA}" )
if(ScalFMM_USE_CUDA)
execute_process(COMMAND nvcc --version ERROR_VARIABLE cuda_error_output OUTPUT_QUIET)
if(cuda_error_output)
message( FATAL_ERROR "nvcc is needed with CUDA." )
endif()
if(NOT DEFINED CUSTOM_CUDA_FLAGS)
SET( CUSTOM_CUDA_FLAGS "-std=c++11;-arch=sm_20" CACHE STRING "Set your CUDA flags, for example : -arch=sm_20;-ptxas-options=-v;-use_fast_math")
endif()
# This is needed to remove backslash after space in ADD_CUSTOM_COMMAND
SEPARATE_ARGUMENTS(CUSTOM_CUDA_FLAGS)
MESSAGE( STATUS "CUSTOM_CUDA_FLAGS = ${CUSTOM_CUDA_FLAGS}" )
# Add libcudart and cuda.h
# link_directories($ENV{CUDA_LIB})
include_directories($ENV{CUDA_INC})
SET(SCALFMM_LIBRARIES "${SCALFMM_LIBRARIES}; -L$ENV{CUDA_LIB}; -lcudart")
endif()
endif(ScalFMM_USE_STARPU)
list(APPEND FUSE_LIST "STARPU")
list(APPEND FUSE_LIST "CUDA")
#
##################################################################
# Use SSE #
......
......@@ -17,11 +17,35 @@ file(
./*.cpp
)
# Add CUDA files once they are compiled from cu to .o
if(ScalFMM_USE_CUDA)
# Find all the CU files in my project
file(GLOB_RECURSE source_cu_files ./*.cu)
# Iterate and add builind command for each file
set( SCALFMM_CUDA_SOURCES "" )
FOREACH (_file ${source_cu_files})
GET_FILENAME_COMPONENT (_filewe ${_file} NAME_WE)
SET (_filehpp_output ${CMAKE_CURRENT_BINARY_DIR}/${_filewe}.o)
ADD_CUSTOM_COMMAND(OUTPUT ${_filehpp_output}
DEPENDS ${_file}
COMMAND echo ARGS Compiling ${_filewe}
COMMAND nvcc ARGS -c ${_file} -o ${_filehpp_output} ${CUSTOM_CUDA_FLAGS})
LIST (APPEND SCALFMM_CUDA_SOURCES ${_filehpp_output})
ENDFOREACH ()
MESSAGE( STATUS "SCALFMM_CUDA_SOURCES = ${SCALFMM_CUDA_SOURCES}" )
endif()
# Adding cpp files to project
add_library(
scalfmm
STATIC
${source_lib_files}
${SCALFMM_CUDA_SOURCES}
)
# Add blas library (even if it is set to off)
......
......@@ -19,6 +19,9 @@
#include <cstddef>
#include "FBasicCell.hpp"
// To get access to descriptors
struct FTestCellDescriptor;
/**
* @author Berenger Bramas (berenger.bramas@inria.fr)
* @class FBasicCell*
......@@ -111,6 +114,9 @@ public:
int getSavedSizeUp() {
return int(sizeof(long long int));
}
// To get access to descriptor
friend struct FTestCellDescriptor;
};
......
This diff is collapsed.
// @SCALFMM_PRIVATE
#ifndef FCUDADEVICEWRAPPER_HPP
#define FCUDADEVICEWRAPPER_HPP
#include "../../Utils/FGlobal.hpp"
#include "../FOutOfBlockInteraction.hpp"
template <class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
void FCuda__bottomPassCallback(unsigned char* leafCellsPtr, std::size_t leafCellsSize,
unsigned char* containersPtr, std::size_t containersSize,
CudaKernelClass* kernel);
template <class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
void FCuda__upwardPassCallback(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
unsigned char* subCellGroupsPtr[9], std::size_t subCellGroupsSize[9],
CudaKernelClass* kernel, int nbSubCellGroups, int idxLevel);
template <class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
void FCuda__transferInoutPassCallbackMpi(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
unsigned char* externalCellsPtr, std::size_t externalCellsSize,
CudaKernelClass* kernel, int idxLevel, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions);
template <class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
void FCuda__transferInPassCallback(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
CudaKernelClass* kernel, int idxLevel);
template <class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
void FCuda__transferInoutPassCallback(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
unsigned char* externalCellsPtr, std::size_t externalCellsSize,
CudaKernelClass* kernel, int idxLevel, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions);
template <class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
void FCuda__downardPassCallback(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
unsigned char* subCellGroupsPtr[9], std::size_t subCellGroupsSize[9],
CudaKernelClass* kernel, int nbSubCellGroups, int idxLevel);
template <class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
void FCuda__directInoutPassCallbackMpi(unsigned char* containersPtr, std::size_t containersSize,
unsigned char* externalContainersPtr, std::size_t externalContainersSize,
CudaKernelClass* kernel, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions, const int treeHeight);
template <class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
void FCuda__directInPassCallback(unsigned char* containersPtr, std::size_t containersSize,
CudaKernelClass* kernel, const int treeHeight);
template <class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
void FCuda__directInoutPassCallback(unsigned char* containersPtr, std::size_t containersSize,
unsigned char* externalContainersPtr, std::size_t externalContainersSize,
CudaKernelClass* kernel, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions, const int treeHeight);
template <class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
void FCuda__mergePassCallback(unsigned char* leafCellsPtr, std::size_t leafCellsSize,
unsigned char* containersPtr, std::size_t containersSize,
CudaKernelClass* kernel);
template <class CudaKernelClass>
CudaKernelClass* FCuda__BuildCudaKernel(void*);
template <class CudaKernelClass>
void FCuda__ReleaseCudaKernel(CudaKernelClass*);
#endif
/// @SCALFMM_PRIVATE
#ifndef FCUDAEMPTYKERNEL_HPP
#define FCUDAEMPTYKERNEL_HPP
#include "FCudaGlobal.hpp"
#include "FCudaGroupAttachedLeaf.hpp"
#include "../../Components/FTestCell.hpp"
/**
* This class defines what should be a Cuda kernel.
*/
template <class ContainerClass = FCudaGroupAttachedLeaf<0, int>>
class FCudaEmptyKernel {
protected:
public:
__device__ void P2M(unsigned char* const /*pole*/, const ContainerClass* const /*particles*/) {
}
__device__ void M2M(unsigned char* const /*pole*/, const unsigned char *const *const /*child*/, const int /*level*/) {
}
__device__ void M2L(unsigned char* const /*pole*/, const unsigned char* /*distantNeighbors*/[343],
const int /*size*/, const int /*level*/) {
}
__device__ void L2L(const unsigned char*const /*local*/, unsigned char* *const /*child*/, const int /*level*/) {
}
__device__ void L2P(const unsigned char* const /*local*/, ContainerClass*const /*particles*/){
}
__device__ void P2P(const int3& ,
ContainerClass* const /*targets*/, const ContainerClass* const /*sources*/,
ContainerClass* const /*directNeighborsParticles*/[27], const int ){
}
__device__ void P2PRemote(const int3& ,
ContainerClass* const /*targets*/, const ContainerClass* const /*sources*/,
ContainerClass* const /*directNeighborsParticles*/[27], const int ){
}
__device__ MortonIndex getMortonIndex(const unsigned char* /*cell*/) const{
return 0;
}
__device__ int3 getCoordinate(const unsigned char* /*cell*/) const{
int3 coord;
coord.x = coord.y = coord.z = 0;
return coord;
}
__host__ static FCudaEmptyKernel* InitKernelKernel(void*){
return nullptr;
}
__host__ static void ReleaseKernel(FCudaEmptyKernel* /*todealloc*/){
// nothing to do
}
};
#endif // FCUDAEMPTYKERNEL_HPP
// @SCALFMM_PRIVATE
#ifndef FCUDAGLOBAL_HPP
#define FCUDAGLOBAL_HPP
#include "../../Utils/FGlobal.hpp"
// Manage special case for nvcc
#if defined(__CUDACC__) || defined(__NVCC__)
#else
#endif
#include <cuda.h>
#endif // FCUDAGLOBAL_HPP
// @SCALFMM_PRIVATE
#ifndef FCUDAGROUPATTACHEDLEAF_HPP
#define FCUDAGROUPATTACHEDLEAF_HPP
#include "FCudaGlobal.hpp"
template <unsigned NbAttributesPerParticle, class AttributeClass = FReal>
class FCudaGroupAttachedLeaf {
protected:
//< Nb of particles in the current leaf
int nbParticles;
//< Pointers to the positions of the particles
FReal* positionsPointers[3];
//< Pointers to the attributes of the particles
AttributeClass* attributes[NbAttributesPerParticle];
public:
/** Empty constructor to point to nothing */
__device__ FCudaGroupAttachedLeaf() : nbParticles(-1) {
memset(positionsPointers, 0, sizeof(FReal*) * 3);
memset(attributes, 0, sizeof(AttributeClass*) * NbAttributesPerParticle);
}
/**
* @brief FCudaGroupAttachedLeaf
* @param inNbParticles the number of particles in the leaf
* @param inPositionBuffer the memory address of the X array of particls
* @param inLeadingPosition each position is access by inPositionBuffer + in bytes inLeadingPosition*idx
* @param inAttributesBuffer the memory address of the first attribute
* @param inLeadingAttributes each attribute is access by inAttributesBuffer + in bytes inLeadingAttributes*idx
*/
__device__ FCudaGroupAttachedLeaf(const int inNbParticles, FReal* inPositionBuffer, const size_t inLeadingPosition,
AttributeClass* inAttributesBuffer, const size_t inLeadingAttributes)
: nbParticles(inNbParticles){
// Redirect pointers to position
positionsPointers[0] = inPositionBuffer;
positionsPointers[1] = reinterpret_cast<FReal*>(reinterpret_cast<unsigned char*>(inPositionBuffer) + inLeadingPosition);
positionsPointers[2] = reinterpret_cast<FReal*>(reinterpret_cast<unsigned char*>(inPositionBuffer) + inLeadingPosition*2);
// Redirect pointers to data
for(unsigned idxAttribute = 0 ; idxAttribute < NbAttributesPerParticle ; ++idxAttribute){
attributes[idxAttribute] = reinterpret_cast<AttributeClass*>(reinterpret_cast<unsigned char*>(inAttributesBuffer) + idxAttribute*inLeadingAttributes);
}
}
/** Copy the attached group to another one (copy the pointer not the content!) */
__device__ FCudaGroupAttachedLeaf(const FCudaGroupAttachedLeaf& other) : nbParticles(other.nbParticles) {
positionsPointers[0] = other.positionsPointers[0];
positionsPointers[1] = other.positionsPointers[1];
positionsPointers[2] = other.positionsPointers[2];
// Redirect pointers to data
for(unsigned idxAttribute = 0 ; idxAttribute < NbAttributesPerParticle ; ++idxAttribute){
attributes[idxAttribute] = other.attributes[idxAttribute];
}
}
/** Copy the attached group to another one (copy the pointer not the content!) */
__device__ FCudaGroupAttachedLeaf& operator=(const FCudaGroupAttachedLeaf& other){
nbParticles = (other.nbParticles);
positionsPointers[0] = other.positionsPointers[0];
positionsPointers[1] = other.positionsPointers[1];
positionsPointers[2] = other.positionsPointers[2];
// Redirect pointers to data
for(unsigned idxAttribute = 0 ; idxAttribute < NbAttributesPerParticle ; ++idxAttribute){
attributes[idxAttribute] = other.attributes[idxAttribute];
}
return (*this);
}
/**
* @brief getNbParticles
* @return the number of particles in the leaf
*/
__device__ int getNbParticles() const{
return nbParticles;
}
/**
* @brief getPositions
* @return a FReal*[3] to get access to the positions
*/
__device__ const FReal*const* getPositions() const {
return positionsPointers;
}
/**
* @brief getWPositions
* @return get the position in write mode
*/
__device__ FReal* const* getWPositions() {
return positionsPointers;
}
/**
* @brief getAttribute
* @param index
* @return the attribute at index index
*/
__device__ AttributeClass* getAttribute(const int index) {
return attributes[index];
}
/**
* @brief getAttribute
* @param index
* @return
*/
__device__ const AttributeClass* getAttribute(const int index) const {
return attributes[index];
}
/**
* Get the attribute with a forcing compile optimization
*/
template <int index>
__device__ AttributeClass* getAttribute() {
static_assert(index < NbAttributesPerParticle, "Index to get attributes is out of scope.");
return attributes[index];
}
/**
* Get the attribute with a forcing compile optimization
*/
template <int index>
__device__ const AttributeClass* getAttribute() const {
static_assert(index < NbAttributesPerParticle, "Index to get attributes is out of scope.");
return attributes[index];
}
/** Return true if it has been attached to a memoy block */
__device__ bool isAttachedToSomething() const {
return nbParticles != -1;
}
/** Copy data for one particle (from the ParticleClassContainer into the attached buffer) */
template<class ParticleClassContainer>
__device__ void setParticle(const int destPartIdx, const int srcPartIdx, const ParticleClassContainer* particles){
// Copy position
positionsPointers[0][destPartIdx] = particles->getPositions()[0][srcPartIdx];
positionsPointers[1][destPartIdx] = particles->getPositions()[1][srcPartIdx];
positionsPointers[2][destPartIdx] = particles->getPositions()[2][srcPartIdx];
// Copy data
for(unsigned idxAttribute = 0 ; idxAttribute < NbAttributesPerParticle ; ++idxAttribute){
attributes[idxAttribute][destPartIdx] = particles->getAttribute(idxAttribute)[srcPartIdx];
}
}
};
#endif // FCUDAGROUPATTACHEDLEAF_HPP
// @SCALFMM_PRIVATE
#ifndef FCUDAGROUPOFCELLS_HPP
#define FCUDAGROUPOFCELLS_HPP
#include "FCudaGlobal.hpp"
/**
* @brief The FCudaGroupOfCells class manages the cells in block allocation.
*/
template <const size_t CellClassSize>
class FCudaGroupOfCells {
/** One header is allocated at the beginning of each block */
struct BlockHeader{
MortonIndex startingIndex;
MortonIndex endingIndex;
int numberOfCellsInBlock;
int blockIndexesTableSize;
};
protected:
//< The size of the memoryBuffer
int allocatedMemoryInByte;
//< Pointer to a block memory
unsigned char* memoryBuffer;
//< Pointer to the header inside the block memory
BlockHeader* blockHeader;
//< Pointer to the indexes table inside the block memory
int* blockIndexesTable;
//< Pointer to the cells inside the block memory
unsigned char* blockCells;
//< This value is for not used cells
static const MortonIndex CellIsEmptyFlag = -1;
public:
__device__ FCudaGroupOfCells()
: allocatedMemoryInByte(0), memoryBuffer(nullptr),
blockHeader(nullptr), blockIndexesTable(nullptr), blockCells(nullptr){
}
__device__ void reset(unsigned char* inBuffer, const size_t inAllocatedMemoryInByte){
// Move the pointers to the correct position
allocatedMemoryInByte = (inAllocatedMemoryInByte);
memoryBuffer = (inBuffer);
blockHeader = reinterpret_cast<BlockHeader*>(memoryBuffer);
blockIndexesTable = reinterpret_cast<int*>(memoryBuffer+sizeof(BlockHeader));
blockCells = reinterpret_cast<unsigned char*>(memoryBuffer+sizeof(BlockHeader)+(blockHeader->blockIndexesTableSize*sizeof(int)));
}
/**
* Init from a given buffer
* @param inBuffer
* @param inAllocatedMemoryInByte
*/
__device__ FCudaGroupOfCells(unsigned char* inBuffer, const size_t inAllocatedMemoryInByte)
: allocatedMemoryInByte(inAllocatedMemoryInByte), memoryBuffer(inBuffer),
blockHeader(nullptr), blockIndexesTable(nullptr), blockCells(nullptr){
// Move the pointers to the correct position
blockHeader = reinterpret_cast<BlockHeader*>(memoryBuffer);
blockIndexesTable = reinterpret_cast<int*>(memoryBuffer+sizeof(BlockHeader));
blockCells = reinterpret_cast<unsigned char*>(memoryBuffer+sizeof(BlockHeader)+(blockHeader->blockIndexesTableSize*sizeof(int)));
}
/** Call the destructor of cells and dealloc block memory */
__device__ ~FCudaGroupOfCells(){
}
/** Give access to the buffer to send the data */
__device__ const unsigned char* getRawBuffer() const{
return memoryBuffer;
}
/** The the size of the allocated buffer */
__device__ int getBufferSizeInByte() const {
return allocatedMemoryInByte;
}
/** The index of the fist cell (set from the constructor) */
__device__ MortonIndex getStartingIndex() const {
return blockHeader->startingIndex;
}
/** The index of the last cell + 1 (set from the constructor) */
__device__ MortonIndex getEndingIndex() const {
return blockHeader->endingIndex;
}
/** The number of cell (set from the constructor) */
__device__ int getNumberOfCellsInBlock() const {
return blockHeader->numberOfCellsInBlock;
}
/** The size of the interval endingIndex-startingIndex (set from the constructor) */
__device__ int getSizeOfInterval() const {
return blockHeader->blockIndexesTableSize;
}
/** Return true if inIndex should be located in the current block */
__device__ bool isInside(const MortonIndex inIndex) const{
return blockHeader->startingIndex <= inIndex && inIndex < blockHeader->endingIndex;
}
/** Return true if inIndex is located in the current block and is not empty */
__device__ bool exists(const MortonIndex inIndex) const {
return isInside(inIndex) && (blockIndexesTable[inIndex-blockHeader->startingIndex] != CellIsEmptyFlag);
}
/** Return the address of the cell if it exists (or NULL) */
__device__ unsigned char* getCell(const MortonIndex inIndex){
if( exists(inIndex) ) return &blockCells[CellClassSize*blockIndexesTable[inIndex-blockHeader->startingIndex]];
else return nullptr;
}
/** Return the address of the cell if it exists (or NULL) */
__device__ const unsigned char* getCell(const MortonIndex inIndex) const {
if( exists(inIndex) ) return &blockCells[CellClassSize*blockIndexesTable[inIndex-blockHeader->startingIndex]];
else return nullptr;
}
};
#endif // FCUDAGROUPOFCELLS_HPP
// @SCALFMM_PRIVATE
#ifndef FCUDAGROUPOFPARTICLES_HPP
#define FCUDAGROUPOFPARTICLES_HPP
#include "FCudaGlobal.hpp"
template <unsigned NbAttributesPerParticle, class AttributeClass = FReal>
class FCudaGroupOfParticles {
/** One header is allocated at the beginning of each block */
struct BlockHeader{
MortonIndex startingIndex;
MortonIndex endingIndex;
int numberOfLeavesInBlock;
int blockIndexesTableSize;
//< The real number of particles allocated
int nbParticlesAllocatedInGroup;
//< Bytes difference/offset between position
size_t positionOffset;
//< Bytes difference/offset between attributes
size_t attributeOffset;
//< The total number of particles in the group
int nbParticlesInGroup;
};
/** Information about a leaf */
struct LeafHeader {
int nbParticles;
size_t offSet;
};
protected:
static const int MemoryAlignementBytes = 32;
static const int MemoryAlignementParticles = MemoryAlignementBytes/sizeof(FReal);
/** This function return the correct number of particles that should be used to have a correct pack.
* If alignement is 32 and use double (so 4 particles in pack), then this function returns:
* RoundToUpperParticles(1) = 1 + 3 = 4
* RoundToUpperParticles(63) = 63 + 1 = 64
*/
template <class NumClass>
__device__ static NumClass RoundToUpperParticles(const NumClass& nbParticles){
return nbParticles + (MemoryAlignementParticles - (nbParticles%MemoryAlignementParticles));
}
//< This value is for not used leaves
static const int LeafIsEmptyFlag = -1;
//< The size of memoryBuffer in byte
int allocatedMemoryInByte;
//< Pointer to a block memory
unsigned char* memoryBuffer;
//< Pointer to the header inside the block memory
BlockHeader* blockHeader;
//< Pointer to the indexes table inside the block memory
int* blockIndexesTable;
//< Pointer to leaves information
LeafHeader* leafHeader;
//< The total number of particles in the group
const int nbParticlesInGroup;
//< Pointers to particle position x, y, z
FReal* particlePosition[3];
//< Pointers to the particles data inside the block memory
AttributeClass* particleAttributes[NbAttributesPerParticle];