Commit f5d468cc authored by Florent Pruvost's avatar Florent Pruvost

merge with origin/master

parents 36bc62de beec09ae
......@@ -13,33 +13,15 @@ set(CMAKE_DISABLE_IN_SOURCE_BUILD ON)
#===========================================================================
project(ScalFMM C CXX)
# directly make an error if in-source build
#if("${PROJECT_SOURCE_DIR}" STREQUAL "${PROJECT_BINARY_DIR}")
# message(FATAL_ERROR "In-source builds are not allowed.\n"
# "Please create a build directory first and execute cmake configuration from "
# "this directory. Example: mkdir build && cd build && cmake ..")
#endif()
#
# check if compiling into source directories
#string(COMPARE EQUAL "${CMAKE_SOURCE_DIR}" "${CMAKE_BINARY_DIR}" insource)
#if(insource)
# message(FATAL_ERROR "${PROJECT_NAME} requires an out of source build. Goto ./Build and tapes cmake ../")
#endif(insource)
string(COMPARE EQUAL "${CMAKE_SOURCE_DIR}" "${CMAKE_BINARY_DIR}" insource)
if(insource)
message(FATAL_ERROR "${PROJECT_NAME} requires an out of source build. Goto ./Build and tapes cmake ../")
endif(insource)
set(ScalFMM_CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/CMakeModules)
# MPI option has to be set before project, cannot be changed in the cache!
#if( ScalFMM_USE_MPI )
# include(CMakeForceCompiler)
# CMAKE_FORCE_C_COMPILER(mpicc "MPI C Compiler")
# CMAKE_FORCE_CXX_COMPILER(mpicxx "MPI C++ Compiler")
# set(ScalFMM_USE_MPI ON CACHE BOOL "ScalFMM use MPI")
#else()
# message(STATUS "Remove CMake cache and run cmake .. -DScalFMM_USE_MPI=ON to enable MPI" )
#endif(ScalFMM_USE_MPI)
#===========================================================================
# Version Number
#===========================================================================
......@@ -66,12 +48,6 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/CMakeModules/morse/")
endif()
include(MorseInit)
#
# Active language
# -----------------------
# enable_language(CXX)
#
#
# Options
option( ScalFMM_USE_MPI "Set to ON to build ScaFMM with MPI" OFF )
......@@ -411,6 +387,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/CMakeModules/morse/")
##################################################################
##################################################################
#
message( STATUS "ScalFMM_USE_STARPU = ${ScalFMM_USE_STARPU}" )
if( ScalFMM_USE_STARPU )
set(ScalFMM_STARPU_VERSION "1.1" CACHE STRING "oldest STARPU version desired")
......@@ -441,6 +418,26 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/CMakeModules/morse/")
include_directories(${STARPU_INCLUDES})
endif()
option( ScalFMM_USE_CUDA "Set to ON to use CUDA with StarPU" OFF )
message( STATUS "ScalFMM_USE_CUDA = ${ScalFMM_USE_CUDA}" )
if(ScalFMM_USE_CUDA)
execute_process(COMMAND nvcc --version ERROR_VARIABLE cuda_error_output OUTPUT_QUIET)
if(cuda_error_output)
message( FATAL_ERROR "nvcc is needed with CUDA." )
endif()
if(NOT DEFINED CUSTOM_CUDA_FLAGS)
set( CUSTOM_CUDA_FLAGS "-std=c++11;-arch=sm_20" CACHE STRING "Set your CUDA flags, for example : -arch=sm_20;-ptxas-options=-v;-use_fast_math")
endif()
# This is needed to remove backslash after space in ADD_CUSTOM_COMMAND
separate_arguments(CUSTOM_CUDA_FLAGS)
message( STATUS "CUSTOM_CUDA_FLAGS = ${CUSTOM_CUDA_FLAGS}" )
# Add libcudart and cuda.h
# link_directories($ENV{CUDA_LIB})
include_directories($ENV{CUDA_INC})
set(SCALFMM_LIBRARIES "${SCALFMM_LIBRARIES}; -L$ENV{CUDA_LIB}; -lcudart")
endif()
message(STATUS " STARPU_LIBRARIES = ${STARPU_LIBRARIES}")
if (STARPU_INCLUDE_DIRS)
message(STATUS " STARPU_INCLUDES = ${STARPU_INCLUDES}")
......@@ -448,7 +445,8 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/CMakeModules/morse/")
endif(ScalFMM_USE_STARPU)
list(APPEND FUSE_LIST "STARPU")
#
list(APPEND FUSE_LIST "CUDA")
##################################################################
# Use SSE #
##################################################################
......@@ -479,6 +477,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/CMakeModules/morse/")
endif(${COMPILE_SSE})
endif()
list(APPEND FUSE_LIST "SSE")
##################################################################
# Use AVX #
##################################################################
......
......@@ -20,7 +20,7 @@ template<class OctreeClass,class ParticleClass>
class FAbstractMover{
public:
virtual void getParticlePosition(ParticleClass* lf, const int idxPart, FPoint* particlePos) = 0;
virtual void removeFromLeafAndKeep(ParticleClass* lf, const FPoint& particlePos, const int idxPart) = 0;
virtual void removeFromLeafAndKeep(ParticleClass* lf, const FPoint& particlePos, const int idxPart, FParticleType type) = 0;
virtual void insertAllParticles(OctreeClass* tree) = 0;
};
......
......@@ -74,6 +74,7 @@ public:
octreeIterator.gotoBottomLeft();
do{
const MortonIndex currentMortonIndex = octreeIterator.getCurrentGlobalIndex();
//First we test sources
ContainerClass * particles = octreeIterator.getCurrentLeaf()->getSrc();
for(int idxPart = 0 ; idxPart < particles->getNbParticles(); /*++idxPart*/){
FPoint currentPart;
......@@ -82,15 +83,33 @@ public:
const MortonIndex particuleIndex = tree->getMortonFromPosition(currentPart);
if(particuleIndex != currentMortonIndex){
//Need to move this one
interface->removeFromLeafAndKeep(particles,currentPart,idxPart);
interface->removeFromLeafAndKeep(particles,currentPart,idxPart,FParticleTypeSource);
}
else{
//Need to increment idx;
++idxPart;
}
}
//Then we test targets
if(octreeIterator.getCurrentLeaf()->getTargets() != particles){ //Leaf is TypedLeaf
ContainerClass * particleTargets = octreeIterator.getCurrentLeaf()->getTargets();
for(int idxPart = 0 ; idxPart < particleTargets->getNbParticles(); /*++idxPart*/){
FPoint currentPart;
interface->getParticlePosition(particleTargets,idxPart,&currentPart);
checkPosition(currentPart);
const MortonIndex particuleIndex = tree->getMortonFromPosition(currentPart);
if(particuleIndex != currentMortonIndex){
//Need to move this one
interface->removeFromLeafAndKeep(particleTargets,currentPart,idxPart, FParticleTypeTarget);
}
else{
//Need to increment idx;
++idxPart;
}
}
}
}while(octreeIterator.moveRight());
printf("Insert back particles\n");
//Insert back the parts that have been removed
interface->insertAllParticles(tree);
......@@ -101,7 +120,8 @@ public:
bool workOnNext = true;
do{
// Empty leaf
if( octreeIterator.getCurrentListTargets()->getNbParticles() == 0 ){
if( octreeIterator.getCurrentListTargets()->getNbParticles() == 0 &&
octreeIterator.getCurrentListSrc()->getNbParticles() == 0 ){
const MortonIndex currentIndex = octreeIterator.getCurrentGlobalIndex();
workOnNext = octreeIterator.moveRight();
tree->removeLeaf( currentIndex );
......
#ifndef FPARTICULETYPEDINDEXEDMOVER_HPP
#define FPARTICULETYPEDINDEXEDMOVER_HPP
#include "FAbstractMover.hpp"
#include "../Containers/FVector.hpp"
/**
* This class should be use with the octree arrange to move particles
* that are typed (src/tgt) and stored in a FBasicParticleContainer
*/
template<class OctreeClass, class ContainerClass >
class FParticleTypedIndexedMover : public FAbstractMover<OctreeClass, ContainerClass>{
private:
ContainerClass toStoreRemovedSourceParts;
ContainerClass toStoreRemovedTargetParts;
public:
FParticleTypedIndexedMover(){
}
virtual ~FParticleTypedIndexedMover(){
}
/** To get the position of the particle at idx idxPart in leaf lf */
void getParticlePosition(ContainerClass* lf, const int idxPart, FPoint* particlePos){
(*particlePos) = FPoint(lf->getPositions()[0][idxPart],lf->getPositions()[1][idxPart],lf->getPositions()[2][idxPart]);
}
/** Remove a particle but keep it to reinsert it later*/
void removeFromLeafAndKeep(ContainerClass* lf, const FPoint& particlePos, const int idxPart, FParticleType type){
std::array<typename ContainerClass::AttributesClass, ContainerClass::NbAttributes> particleValues;
for(int idxAttr = 0 ; idxAttr < ContainerClass::NbAttributes ; ++idxAttr){
particleValues[idxAttr] = lf->getAttribute(idxAttr)[idxPart];
}
if(type == FParticleTypeTarget){
toStoreRemovedTargetParts.push(particlePos,FParticleTypeTarget,lf->getIndexes()[idxPart],particleValues);
}
else{
toStoreRemovedSourceParts.push(particlePos,FParticleTypeSource,lf->getIndexes()[idxPart],particleValues);
}
lf->removeParticles(&idxPart,1);
}
/** Reinsert the previously saved particles */
void insertAllParticles(OctreeClass* tree){
std::array<typename ContainerClass::AttributesClass, ContainerClass::NbAttributes> particleValues;
for(int idxToInsert = 0; idxToInsert<toStoreRemovedSourceParts.getNbParticles() ; ++idxToInsert){
for(int idxAttr = 0 ; idxAttr < ContainerClass::NbAttributes ; ++idxAttr){
particleValues[idxAttr] = toStoreRemovedSourceParts.getAttribute(idxAttr)[idxToInsert];
}
const FPoint particlePos(toStoreRemovedSourceParts.getPositions()[0][idxToInsert],
toStoreRemovedSourceParts.getPositions()[1][idxToInsert],
toStoreRemovedSourceParts.getPositions()[2][idxToInsert]);
tree->insert(particlePos, FParticleTypeSource, toStoreRemovedSourceParts.getIndexes()[idxToInsert], particleValues);
}
for(int idxToInsert = 0; idxToInsert<toStoreRemovedTargetParts.getNbParticles() ; ++idxToInsert){
for(int idxAttr = 0 ; idxAttr < ContainerClass::NbAttributes ; ++idxAttr){
particleValues[idxAttr] = toStoreRemovedTargetParts.getAttribute(idxAttr)[idxToInsert];
}
const FPoint particlePos(toStoreRemovedTargetParts.getPositions()[0][idxToInsert],
toStoreRemovedTargetParts.getPositions()[1][idxToInsert],
toStoreRemovedTargetParts.getPositions()[2][idxToInsert]);
tree->insert(particlePos, FParticleTypeTarget, toStoreRemovedTargetParts.getIndexes()[idxToInsert], particleValues);
}
toStoreRemovedSourceParts.clear();
toStoreRemovedTargetParts.clear();
}
};
#endif //FPARTICULETYPEDINDEXEDMOVER_HPP
......@@ -17,11 +17,35 @@ file(
./*.cpp
)
# Add CUDA files once they are compiled from cu to .o
if(ScalFMM_USE_CUDA)
# Find all the CU files in my project
file(GLOB_RECURSE source_cu_files ./*.cu)
# Iterate and add builind command for each file
set( SCALFMM_CUDA_SOURCES "" )
FOREACH (_file ${source_cu_files})
GET_FILENAME_COMPONENT (_filewe ${_file} NAME_WE)
SET (_filehpp_output ${CMAKE_CURRENT_BINARY_DIR}/${_filewe}.o)
ADD_CUSTOM_COMMAND(OUTPUT ${_filehpp_output}
DEPENDS ${_file}
COMMAND echo ARGS Compiling ${_filewe}
COMMAND nvcc ARGS -c ${_file} -o ${_filehpp_output} ${CUSTOM_CUDA_FLAGS})
LIST (APPEND SCALFMM_CUDA_SOURCES ${_filehpp_output})
ENDFOREACH ()
MESSAGE( STATUS "SCALFMM_CUDA_SOURCES = ${SCALFMM_CUDA_SOURCES}" )
endif()
# Adding cpp files to project
add_library(
scalfmm
STATIC
${source_lib_files}
${SCALFMM_CUDA_SOURCES}
)
# Add blas library (even if it is set to off)
......
......@@ -291,6 +291,16 @@ public:
nbParticles += 1;
}
/**
* Push called by FTypedLeaf Through arranger
* Should have a particle position fallowed by isTarget flag and attributes
*/
template<typename... Args>
void push(const FPoint& inParticlePosition, const FParticleType type,
const std::array<AttributeClass , NbAttributesPerParticle>& values){
push(inParticlePosition,values);
}
/**
* Push called usually by FTypedLeaf with the isTarget flag in addition
*/
......
......@@ -19,6 +19,9 @@
#include <cstddef>
#include "FBasicCell.hpp"
// To get access to descriptors
struct FTestCellDescriptor;
/**
* @author Berenger Bramas (berenger.bramas@inria.fr)
* @class FBasicCell*
......@@ -111,6 +114,9 @@ public:
int getSavedSizeUp() {
return int(sizeof(long long int));
}
// To get access to descriptor
friend struct FTestCellDescriptor;
};
......
......@@ -4,13 +4,13 @@
// This software is a computer program whose purpose is to compute the FMM.
//
// This software is governed by the CeCILL-C and LGPL licenses and
// abiding by the rules of distribution of free software.
//
// abiding by the rules of distribution of free software.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public and CeCILL-C Licenses for more details.
// "http://www.cecill.info".
// "http://www.cecill.info".
// "http://www.gnu.org/licenses".
// ===================================================================================
#ifndef FFMMALGORITHMTHREADTSM_HPP
......@@ -22,7 +22,7 @@
#include "../Utils/FTic.hpp"
#include "../Utils/FGlobal.hpp"
#include "../Utils/FAlgorithmTimers.hpp"
#include "../Containers/FOctree.hpp"
#include "FCoreCommon.hpp"
......@@ -45,7 +45,7 @@
* You should not write on sources in the P2P method!
*/
template<class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass>
class FFmmAlgorithmThreadTsm : public FAbstractAlgorithm{
class FFmmAlgorithmThreadTsm : public FAbstractAlgorithm, public FAlgorithmTimers{
OctreeClass* const tree; //< The octree to work on
KernelClass** kernels; //< The kernels
......@@ -413,5 +413,3 @@ protected:
#endif //FFMMALGORITHMTHREADTSM_HPP
// @SCALFMM_PRIVATE
#include "FCudaDeviceWrapper.hpp"
#include "FCudaTreeCoordinate.hpp"
static const int nbCudaThreads = 32;
static const int nbCudaBlocks = 1;
static void FCudaCheckCore(cudaError_t code, const char *file, int line) {
if (code != cudaSuccess) {
fprintf(stderr,"Cuda Error %d : %s %s %d\n", code, cudaGetErrorString(code), file, line);
exit(code);
}
}
#define FCudaCheck( test ) { FCudaCheckCore((test), __FILE__, __LINE__); }
#define FCudaCheckAfterCall() { FCudaCheckCore((cudaGetLastError()), __FILE__, __LINE__); }
#define FCudaAssertLF(ARGS) ARGS;
#define FMGetOppositeNeighIndex(index) (27-(index)-1)
#define FMGetOppositeInterIndex(index) (343-(index)-1)
#define FCudaMax(x,y) ((x)<(y) ? (y) : (x))
#define FCudaMin(x,y) ((x)>(y) ? (y) : (x))
template <class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
__global__ void FCuda__bottomPassPerform(unsigned char* leafCellsPtr, std::size_t leafCellsSize,
unsigned char* containersPtr, std::size_t containersSize,
CudaKernelClass* kernel){
CellContainerClass leafCells(leafCellsPtr, leafCellsSize);
ParticleContainerGroupClass containers(containersPtr, containersSize);
const MortonIndex blockStartIdx = leafCells.getStartingIndex();
const MortonIndex blockEndIdx = leafCells.getEndingIndex();
for(MortonIndex mindex = blockStartIdx ; mindex < blockEndIdx ; ++mindex){
unsigned char* cell = leafCells.getCell(mindex);
if(cell){
FCudaAssertLF(kernel->getMortonIndex(cell) == mindex);
ParticleGroupClass particles = containers.template getLeaf<ParticleGroupClass>(mindex);
FCudaAssertLF(particles.isAttachedToSomething());
kernel->P2M(cell, &particles);
}
}
}
template <class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
__host__ void FCuda__bottomPassCallback(unsigned char* leafCellsPtr, std::size_t leafCellsSize,
unsigned char* containersPtr, std::size_t containersSize,
CudaKernelClass* kernel){
FCuda__bottomPassPerform
< CellContainerClass, ParticleContainerGroupClass, ParticleGroupClass, CudaKernelClass>
<<<nbCudaThreads, nbCudaBlocks, 0/*starpu_cuda_get_local_stream()*/>>>
(leafCellsPtr, leafCellsSize,
containersPtr, containersSize,
kernel);
FCudaCheckAfterCall();
}
/////////////////////////////////////////////////////////////////////////////////////
/// Upward Pass
/////////////////////////////////////////////////////////////////////////////////////
template <class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
__global__ void FCuda__upwardPassPerform(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
unsigned char* subCellGroupsPtr[9], std::size_t subCellGroupsSize[9],
CudaKernelClass* kernel, int nbSubCellGroups, int idxLevel){
CellContainerClass currentCells(currentCellsPtr, currentCellsSize);
CellContainerClass subCellGroups[9];
for(int idx = 0 ; idx < nbSubCellGroups ; ++idx){
subCellGroups[idx].reset(subCellGroupsPtr[idx], subCellGroupsSize[idx]);
}
FCudaAssertLF(nbSubCellGroups != 0);
const MortonIndex blockStartIdx = FCudaMax(currentCells.getStartingIndex(),
subCellGroups[0].getStartingIndex()>>3);
const MortonIndex blockEndIdx = FCudaMin(currentCells.getEndingIndex(),
((subCellGroups[nbSubCellGroups-1].getEndingIndex()-1)>>3)+1);
int idxSubCellGroup = 0;
for(MortonIndex mindex = blockStartIdx ; mindex < blockEndIdx && idxSubCellGroup != nbSubCellGroups; ++mindex){
unsigned char* cell = currentCells.getCell(mindex);
if(cell){
FCudaAssertLF(kernel->getMortonIndex(cell) == mindex);
unsigned char* child[8] = {nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr};
for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
if( subCellGroups[idxSubCellGroup].getEndingIndex() <= ((mindex<<3)+idxChild) ){
idxSubCellGroup += 1;
}
if( idxSubCellGroup == nbSubCellGroups ){
break;
}
child[idxChild] = subCellGroups[idxSubCellGroup].getCell((mindex<<3)+idxChild);
FCudaAssertLF(child[idxChild] == nullptr || kernel->getMortonIndex(child[idxChild]) == ((mindex<<3)+idxChild));
}
kernel->M2M(cell, child, idxLevel);
}
}
}
template <class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
__host__ void FCuda__upwardPassCallback(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
unsigned char* subCellGroupsPtr[9], std::size_t subCellGroupsSize[9],
CudaKernelClass* kernel, int nbSubCellGroups, int idxLevel){
FCuda__upwardPassPerform
< CellContainerClass, ParticleContainerGroupClass, ParticleGroupClass, CudaKernelClass>
<<<nbCudaThreads, nbCudaBlocks, 0/*starpu_cuda_get_local_stream()*/>>>
(currentCellsPtr, currentCellsSize,
subCellGroupsPtr, subCellGroupsSize,
kernel, nbSubCellGroups, idxLevel);
FCudaCheckAfterCall();
}
/////////////////////////////////////////////////////////////////////////////////////
/// Transfer Pass Mpi
/////////////////////////////////////////////////////////////////////////////////////
template <class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
__global__ void FCuda__transferInoutPassPerformMpi(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
unsigned char* externalCellsPtr, std::size_t externalCellsSize,
CudaKernelClass* kernel, int idxLevel, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions){
CellContainerClass currentCells(currentCellsPtr, currentCellsSize);
CellContainerClass cellsOther(externalCellsPtr, externalCellsSize);
for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){
unsigned char* interCell = cellsOther.getCell(outsideInteractions[outInterIdx].outIndex);
if(interCell){
FCudaAssertLF(kernel->getMortonIndex(interCell) == outsideInteractions[outInterIdx].outIndex);
unsigned char* cell = currentCells.getCell(outsideInteractions[outInterIdx].insideIndex);
FCudaAssertLF(cell);
FCudaAssertLF(kernel->getMortonIndex(cell) == outsideInteractions[outInterIdx].insideIndex);
const unsigned char* interactions[343];
memset(interactions, 0, 343*sizeof(unsigned char*));
interactions[outsideInteractions[outInterIdx].outPosition] = interCell;
const int counter = 1;
kernel->M2L( cell , interactions, counter, idxLevel);
}
}
}
template <class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
__host__ void FCuda__transferInoutPassCallbackMpi(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
unsigned char* externalCellsPtr, std::size_t externalCellsSize,
CudaKernelClass* kernel, int idxLevel, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions){
OutOfBlockInteraction* cuOutsideInteractions;
FCudaCheck( cudaMalloc(&cuOutsideInteractions,nbOutsideInteractions*sizeof(OutOfBlockInteraction)) );
FCudaCheck( cudaMemcpy( cuOutsideInteractions, outsideInteractions, nbOutsideInteractions*sizeof(OutOfBlockInteraction),
cudaMemcpyHostToDevice ) );
FCuda__transferInoutPassPerformMpi
< CellContainerClass, ParticleContainerGroupClass, ParticleGroupClass, CudaKernelClass>
<<<nbCudaThreads, nbCudaBlocks, 0/*starpu_cuda_get_local_stream()*/>>>(currentCellsPtr, currentCellsSize,
externalCellsPtr, externalCellsSize,
kernel, idxLevel, cuOutsideInteractions,
nbOutsideInteractions);
FCudaCheckAfterCall();
FCudaCheck(cudaFree(cuOutsideInteractions));
}
/////////////////////////////////////////////////////////////////////////////////////
/// Transfer Pass
/////////////////////////////////////////////////////////////////////////////////////
template <class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
__global__ void FCuda__transferInPassPerform(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
CudaKernelClass* kernel, int idxLevel){
CellContainerClass currentCells(currentCellsPtr, currentCellsSize);
const MortonIndex blockStartIdx = currentCells.getStartingIndex();
const MortonIndex blockEndIdx = currentCells.getEndingIndex();
for(MortonIndex mindex = blockStartIdx ; mindex < blockEndIdx ; ++mindex){
unsigned char* cell = currentCells.getCell(mindex);
if(cell){
FCudaAssertLF(kernel->getMortonIndex(cell) == mindex);
MortonIndex interactionsIndexes[189];
int interactionsPosition[189];
const int3 coord = (kernel->getCoordinate(cell));
int counter = FCudaTreeCoordinate::GetInteractionNeighbors(coord, idxLevel,interactionsIndexes,interactionsPosition);
const unsigned char* interactions[343];
memset(interactions, 0, 343*sizeof(unsigned char*));
int counterExistingCell = 0;
for(int idxInter = 0 ; idxInter < counter ; ++idxInter){
if( blockStartIdx <= interactionsIndexes[idxInter] && interactionsIndexes[idxInter] < blockEndIdx ){
unsigned char* interCell = currentCells.getCell(interactionsIndexes[idxInter]);
if(interCell){
FCudaAssertLF(kernel->getMortonIndex(interCell) == interactionsIndexes[idxInter]);
FCudaAssertLF(interactions[interactionsPosition[idxInter]] == nullptr);
interactions[interactionsPosition[idxInter]] = interCell;
counterExistingCell += 1;
}
}
}
kernel->M2L( cell , interactions, counterExistingCell, idxLevel);
}
}
}
template <class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
__host__ void FCuda__transferInPassCallback(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
CudaKernelClass* kernel, int idxLevel){
FCuda__transferInPassPerform
< CellContainerClass, ParticleContainerGroupClass, ParticleGroupClass, CudaKernelClass>
<<<nbCudaThreads, nbCudaBlocks, 0/*starpu_cuda_get_local_stream()*/>>>(currentCellsPtr, currentCellsSize,
kernel, idxLevel);
FCudaCheckAfterCall();
}
template <class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
__global__ void FCuda__transferInoutPassPerform(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
unsigned char* externalCellsPtr, std::size_t externalCellsSize,
CudaKernelClass* kernel, int idxLevel, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions){
CellContainerClass currentCells(currentCellsPtr, currentCellsSize);
CellContainerClass cellsOther(externalCellsPtr, externalCellsSize);
for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){
unsigned char* interCell = cellsOther.getCell(outsideInteractions[outInterIdx].outIndex);
if(interCell){
FCudaAssertLF(kernel->getMortonIndex(interCell) == outsideInteractions[outInterIdx].outIndex);
unsigned char* cell = currentCells.getCell(outsideInteractions[outInterIdx].insideIndex);
FCudaAssertLF(cell);
FCudaAssertLF(kernel->getMortonIndex(cell) == outsideInteractions[outInterIdx].insideIndex);
const unsigned char* interactions[343];
memset(interactions, 0, 343*sizeof(unsigned char*));
interactions[outsideInteractions[outInterIdx].outPosition] = interCell;
const int counter = 1;
kernel->M2L( cell , interactions, counter, idxLevel);
interactions[outsideInteractions[outInterIdx].outPosition] = nullptr;
interactions[FMGetOppositeInterIndex(outsideInteractions[outInterIdx].outPosition)] = cell;
kernel->M2L( interCell , interactions, counter, idxLevel);
}
}