Commit a9c49f3c authored by BRAMAS Berenger's avatar BRAMAS Berenger

update cuda version

parent 90d5dd91
......@@ -175,16 +175,13 @@ template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
__global__ void FCuda__transferInPassPerform(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
int idxLevel, CudaKernelClass* kernel){
if(blockIdx.x != 0){
return;
}
CellContainerClass currentCells(currentCellsPtr, currentCellsSize, currentCellsUpPtr, currentCellsDownPtr);
const MortonIndex blockStartIdx = currentCells.getStartingIndex();
const MortonIndex blockEndIdx = currentCells.getEndingIndex();
for(int cellIdx = 0 ; cellIdx < currentCells.getNumberOfCellsInBlock() ; ++cellIdx){
for(int cellIdx = blockIdx.x ; cellIdx < currentCells.getNumberOfCellsInBlock() ; cellIdx += gridDim.x){
typename CellContainerClass::CompleteCellClass cell = currentCells.getDownCell(cellIdx);
MortonIndex interactionsIndexes[189];
......@@ -238,32 +235,33 @@ __global__ void FCuda__transferInoutPassPerform(unsigned char* currentCellsPtr,
int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions,
const int* safeInteractions, int nbSafeInteractions, CudaKernelClass* kernel){
if(blockIdx.x != 0){
return;
}
CellContainerClass currentCells(currentCellsPtr, currentCellsSize, nullptr, currentCellsDownPtr);
CellContainerClass cellsOther(externalCellsPtr, externalCellsSize, externalCellsUpPtr, nullptr);
if(mode == 1){
for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){
typename CellContainerClass::CompleteCellClass interCell = cellsOther.getUpCell(outsideInteractions[outInterIdx].outsideIdxInBlock);
FCudaAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex);
typename CellContainerClass::CompleteCellClass cell = currentCells.getDownCell(outsideInteractions[outInterIdx].insideIdxInBlock);
FCudaAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex);
kernel->M2L( cell , &interCell, &outsideInteractions[outInterIdx].relativeOutPosition, 1, idxLevel);
for(int cellIdx = blockIdx.x ; cellIdx < nbSafeInteractions ; cellIdx += gridDim.x){
for(int outInterIdx = safeInteractions[cellIdx] ; outInterIdx < safeInteractions[cellIdx+1] ; ++outInterIdx){
typename CellContainerClass::CompleteCellClass interCell = cellsOther.getUpCell(outsideInteractions[outInterIdx].outsideIdxInBlock);
FCudaAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex);
typename CellContainerClass::CompleteCellClass cell = currentCells.getDownCell(outsideInteractions[outInterIdx].insideIdxInBlock);
FCudaAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex);
kernel->M2L( cell , &interCell, &outsideInteractions[outInterIdx].relativeOutPosition, 1, idxLevel);
}
}
}
else{
for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){
typename CellContainerClass::CompleteCellClass cell = cellsOther.getUpCell(outsideInteractions[outInterIdx].insideIdxInBlock);
FCudaAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex);
typename CellContainerClass::CompleteCellClass interCell = currentCells.getDownCell(outsideInteractions[outInterIdx].outsideIdxInBlock);
FCudaAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex);
const int otherPosition = FMGetOppositeInterIndex(outsideInteractions[outInterIdx].relativeOutPosition);
kernel->M2L( interCell , &cell, &otherPosition, 1, idxLevel);
for(int cellIdx = blockIdx.x ; cellIdx < nbSafeInteractions ; cellIdx += gridDim.x){
for(int outInterIdx = safeInteractions[cellIdx] ; outInterIdx < safeInteractions[cellIdx+1] ; ++outInterIdx){
typename CellContainerClass::CompleteCellClass cell = cellsOther.getUpCell(outsideInteractions[outInterIdx].insideIdxInBlock);
FCudaAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex);
typename CellContainerClass::CompleteCellClass interCell = currentCells.getDownCell(outsideInteractions[outInterIdx].outsideIdxInBlock);
FCudaAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex);
const int otherPosition = FMGetOppositeInterIndex(outsideInteractions[outInterIdx].relativeOutPosition);
kernel->M2L( interCell , &cell, &otherPosition, 1, idxLevel);
}
}
}
}
......
......@@ -24,6 +24,7 @@ class FStarPUFmmPriorities{
int insertionPositionM2L;
int insertionPositionM2LExtern;
int insertionPositionM2LLastLevel;
int insertionPositionM2LExternLastLevel;
int insertionPositionL2L;
int insertionPositionL2P;
int insertionPositionP2P;
......@@ -90,13 +91,16 @@ public:
insertionPositionM2LLastLevel = incPrio++;
FLOG( FLog::Controller << "\t M2L last " << insertionPositionM2LLastLevel << "\n" );
insertionPositionM2LExternLastLevel = incPrio++;
FLOG( FLog::Controller << "\t M2L extern last " << insertionPositionM2LExternLastLevel << "\n" );
insertionPositionL2P = incPrio++;
FLOG( FLog::Controller << "\t L2P " << insertionPositionL2P << "\n" );
insertionPositionP2PExtern = incPrio++;
FLOG( FLog::Controller << "\t P2P Outer " << insertionPositionP2PExtern << "\n" );
assert(incPrio == 8 + (treeHeight-3) + (treeHeight-3) + (treeHeight-3));
assert(incPrio == 9 + (treeHeight-3) + (treeHeight-3) + (treeHeight-3));
}
else{
int incPrio = 0;
......@@ -110,6 +114,7 @@ public:
insertionPositionM2L = -1;
insertionPositionM2LExtern = -1;
insertionPositionM2LLastLevel = -1;
insertionPositionM2LExternLastLevel = -1;
insertionPositionL2L = -1;
......@@ -158,7 +163,8 @@ public:
FLOG( FLog::Controller << "\t CPU prio M2L " << cpuCountPrio << " bucket " << prioM2LAtLevel << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = prioM2LAtLevel;
heteroprio->buckets[prioM2LAtLevel].valide_archs |= STARPU_CPU;
}
if(capacities->supportM2LExtern(FSTARPU_CPU_IDX)){
const int prioM2LAtLevelExtern = getInsertionPosM2LExtern(idxLevel);
FLOG( FLog::Controller << "\t CPU prio M2L extern " << cpuCountPrio << " bucket " << prioM2LAtLevelExtern << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = prioM2LAtLevelExtern;
......@@ -186,6 +192,12 @@ public:
heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = prioM2LAtLevel;
heteroprio->buckets[prioM2LAtLevel].valide_archs |= STARPU_CPU;
}
if( !workOnlyOnLeaves && capacities->supportM2LExtern(FSTARPU_CPU_IDX)){
const int prioM2LAtLevel = getInsertionPosM2LExtern(treeHeight-1);
FLOG( FLog::Controller << "\t CPU prio M2L " << cpuCountPrio << " bucket " << prioM2LAtLevel << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = prioM2LAtLevel;
heteroprio->buckets[prioM2LAtLevel].valide_archs |= STARPU_CPU;
}
if( !workOnlyOnLeaves && capacities->supportL2P(FSTARPU_CPU_IDX)){
FLOG( FLog::Controller << "\t CPU prio L2P " << cpuCountPrio << " bucket " << insertionPositionL2P << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = insertionPositionL2P;
......@@ -291,13 +303,28 @@ public:
if(!workOnlyOnLeaves && capacities->supportM2L(FSTARPU_CUDA_IDX)){
for(int idxLevel = 2 ; idxLevel < treeHeight ; ++idxLevel){
const int prioM2LAtLevel = getInsertionPosM2L(idxLevel);
FLOG( FLog::Controller << "\t CUDA prio M2L ex " << cudaCountPrio << " bucket " << prioM2LAtLevel << "\n" );
FLOG( FLog::Controller << "\t CUDA prio M2L " << cudaCountPrio << " bucket " << prioM2LAtLevel << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CUDA_IDX][cudaCountPrio++] = prioM2LAtLevel;
heteroprio->buckets[prioM2LAtLevel].valide_archs |= STARPU_CUDA;
heteroprio->buckets[prioM2LAtLevel].factor_base_arch_index = FSTARPU_CUDA_IDX;
#ifdef STARPU_USE_CPU
if(capacities->supportM2L(FSTARPU_CUDA_IDX)){
heteroprio->buckets[prioM2LAtLevel].slow_factors_per_index[FSTARPU_CPU_IDX] = 10.0f;
heteroprio->buckets[prioM2LAtLevel].slow_factors_per_index[FSTARPU_CPU_IDX] = 15.0f;
}
#endif
}
}
if(!workOnlyOnLeaves && capacities->supportM2LExtern(FSTARPU_CUDA_IDX)){
for(int idxLevel = 2 ; idxLevel < treeHeight ; ++idxLevel){
const int prioM2LExternAtLevel = getInsertionPosM2LExtern(idxLevel);
FLOG( FLog::Controller << "\t CUDA prio M2L ex " << cudaCountPrio << " bucket " << prioM2LExternAtLevel << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CUDA_IDX][cudaCountPrio++] = prioM2LExternAtLevel;
heteroprio->buckets[prioM2LExternAtLevel].valide_archs |= STARPU_CUDA;
heteroprio->buckets[prioM2LExternAtLevel].factor_base_arch_index = FSTARPU_CUDA_IDX;
#ifdef STARPU_USE_CPU
if(capacities->supportM2L(FSTARPU_CUDA_IDX)){
heteroprio->buckets[prioM2LExternAtLevel].slow_factors_per_index[FSTARPU_CPU_IDX] = 5.0f;
}
#endif
}
......@@ -360,7 +387,7 @@ public:
return (inLevel==treeHeight-1? insertionPositionM2LLastLevel : insertionPositionM2L + (inLevel - 2)*3);
}
int getInsertionPosM2LExtern(const int inLevel) const {
return (inLevel==treeHeight-1? insertionPositionM2LLastLevel : insertionPositionM2LExtern + (inLevel - 2)*3);
return (inLevel==treeHeight-1? insertionPositionM2LExternLastLevel : insertionPositionM2LExtern + (inLevel - 2)*3);
}
int getInsertionPosL2L(const int inLevel) const {
return insertionPositionL2L + (inLevel - 2)*3;
......
......@@ -118,6 +118,35 @@ public:
}
};
template <class BaseClass>
class FStarPUCudaP2PM2LCapacities : public BaseClass, public FStarPUAbstractCapacities {
bool check(const FStarPUTypes inPu) const override{
return inPu == FSTARPU_CPU_IDX;
}
public:
using BaseClass::BaseClass;
bool supportP2P(const FStarPUTypes inPu) const override {
return inPu == FSTARPU_CPU_IDX || inPu == FSTARPU_CUDA_IDX;
}
bool supportP2PExtern(const FStarPUTypes inPu) const override {
return inPu == FSTARPU_CPU_IDX || inPu == FSTARPU_CUDA_IDX;
}
bool supportP2PMpi(const FStarPUTypes inPu) const override {
return inPu == FSTARPU_CPU_IDX || inPu == FSTARPU_CUDA_IDX;
}
bool supportM2L(const FStarPUTypes inPu) const override {
return inPu == FSTARPU_CPU_IDX || inPu == FSTARPU_CUDA_IDX;
}
bool supportM2LExtern(const FStarPUTypes inPu) const override {
return inPu == FSTARPU_CPU_IDX || inPu == FSTARPU_CUDA_IDX;
}
bool supportM2LMpi(const FStarPUTypes inPu) const override {
return inPu == FSTARPU_CPU_IDX || inPu == FSTARPU_CUDA_IDX;
}
};
template <class BaseClass>
class FStarPUCudaM2LCapacities : public BaseClass, public FStarPUAbstractCapacities {
bool check(const FStarPUTypes inPu) const override{
......
......@@ -23,8 +23,8 @@ public:
typedef FCudaGroupAttachedLeaf<FReal,1,4,FReal> ContainerClass;
typedef FCudaCompositeCell<FBasicCellPOD,FCudaUnifCellPODPole<FReal,ORDER>,FCudaUnifCellPODLocal<FReal,ORDER> > CellClass;
static const int NB_THREAD_GROUPS = 1;// TODO 30; // 2 x 15
static const int THREAD_GROUP_SIZE = 1;// TODO 256;
static const int NB_THREAD_GROUPS = 30; // 2 x 15
static const int THREAD_GROUP_SIZE = 256;
static const int SHARED_MEMORY_SIZE = 512;// 49152
FUnifCudaSharedData<FReal,ORDER> data;
......@@ -52,7 +52,7 @@ public:
FCudaUnifComplex<FReal> *const __restrict__ FX) const
{
// Perform entrywise product manually
for (unsigned int j = 0 ; j < data.opt_rc; ++j){
for (unsigned int j = threadIdx.x ; j < data.opt_rc; j += blockDim.x){
FCudaUnifComplex<FReal> FC_scale;
//FComplex<FReal>(scale*FC[idx*opt_rc + j].getReal(), scale*FC[idx*opt_rc + j].getImag()),
FC_scale.complex[0] = scale*data.FC[idx*data.opt_rc + j].complex[0];
......@@ -379,7 +379,6 @@ public:
template <class FReal, int ORDER>
void FUnifCudaFillObject(void* cudaKernel, const FUnifCudaSharedData<FReal,ORDER>& hostData){
printf("FUnifCudaFillObject cudaKernel %p\n", cudaKernel);
FUnifCudaSharedData<FReal,ORDER>* cudaData = &((FUnifCuda<FReal,ORDER>*)cudaKernel)->data;
FCudaCheck( cudaMemcpy( cudaData, &hostData, sizeof(FUnifCudaSharedData<FReal,ORDER>),
cudaMemcpyHostToDevice ) );
......
......@@ -79,7 +79,9 @@ int main(int argc, char* argv[]){
typedef FP2PGroupParticleContainer<FReal> GroupContainerClass;
typedef FGroupTree< FReal, GroupCellClass, GroupCellSymbClass, GroupCellUpClass, GroupCellDownClass, GroupContainerClass, 1, 4, FReal> GroupOctreeClass;
typedef FStarPUCudaM2LCapacities<FUnifKernel<FReal,GroupCellClass,GroupContainerClass,MatrixKernelClass,ORDER>> GroupKernelClass;
//typedef FStarPUCudaM2LCapacities<FUnifKernel<FReal,GroupCellClass,GroupContainerClass,MatrixKernelClass,ORDER>> GroupKernelClass;
//typedef FStarPUCudaP2PCapacities<FUnifKernel<FReal,GroupCellClass,GroupContainerClass,MatrixKernelClass,ORDER>> GroupKernelClass;
typedef FStarPUCudaP2PM2LCapacities<FUnifKernel<FReal,GroupCellClass,GroupContainerClass,MatrixKernelClass,ORDER>> GroupKernelClass;
typedef FStarPUCpuWrapper<typename GroupOctreeClass::CellGroupClass, GroupCellClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupContainerClass> GroupCpuWrapper;
typedef FStarPUCudaWrapper<GroupKernelClass,
......@@ -161,7 +163,7 @@ int main(int argc, char* argv[]){
std::cout << "Kernel executed in in " << timer.tacAndElapsed() << "s\n";
// Validate the result
if(FParameters::existParameter(argc, argv, LocalOptionNoValidate.options) == true){
if(FParameters::existParameter(argc, argv, LocalOptionNoValidate.options) == false){
FSize offsetParticles = 0;
FReal*const allPhysicalValues = allParticles.getPhysicalValues();
FReal*const allPosX = const_cast<FReal*>( allParticles.getPositions()[0]);
......
......@@ -60,7 +60,7 @@ int main(int argc, char* argv[]){
// Initialize the types
typedef double FReal;
static const int ORDER = 6;
static const int ORDER = 5;
typedef FInterpMatrixKernelR<FReal> MatrixKernelClass;
typedef FUnifCellPODCore GroupCellSymbClass;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment