Attention une mise à jour du service Gitlab va être effectuée le mardi 18 janvier (et non lundi 17 comme annoncé précédemment) entre 18h00 et 18h30. Cette mise à jour va générer une interruption du service dont nous ne maîtrisons pas complètement la durée mais qui ne devrait pas excéder quelques minutes.

Commit a9c49f3c authored by BRAMAS Berenger's avatar BRAMAS Berenger
Browse files

update cuda version

parent 90d5dd91
......@@ -175,16 +175,13 @@ template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
__global__ void FCuda__transferInPassPerform(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
int idxLevel, CudaKernelClass* kernel){
if(blockIdx.x != 0){
return;
}
CellContainerClass currentCells(currentCellsPtr, currentCellsSize, currentCellsUpPtr, currentCellsDownPtr);
const MortonIndex blockStartIdx = currentCells.getStartingIndex();
const MortonIndex blockEndIdx = currentCells.getEndingIndex();
for(int cellIdx = 0 ; cellIdx < currentCells.getNumberOfCellsInBlock() ; ++cellIdx){
for(int cellIdx = blockIdx.x ; cellIdx < currentCells.getNumberOfCellsInBlock() ; cellIdx += gridDim.x){
typename CellContainerClass::CompleteCellClass cell = currentCells.getDownCell(cellIdx);
MortonIndex interactionsIndexes[189];
......@@ -238,32 +235,33 @@ __global__ void FCuda__transferInoutPassPerform(unsigned char* currentCellsPtr,
int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions,
const int* safeInteractions, int nbSafeInteractions, CudaKernelClass* kernel){
if(blockIdx.x != 0){
return;
}
CellContainerClass currentCells(currentCellsPtr, currentCellsSize, nullptr, currentCellsDownPtr);
CellContainerClass cellsOther(externalCellsPtr, externalCellsSize, externalCellsUpPtr, nullptr);
if(mode == 1){
for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){
typename CellContainerClass::CompleteCellClass interCell = cellsOther.getUpCell(outsideInteractions[outInterIdx].outsideIdxInBlock);
FCudaAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex);
typename CellContainerClass::CompleteCellClass cell = currentCells.getDownCell(outsideInteractions[outInterIdx].insideIdxInBlock);
FCudaAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex);
kernel->M2L( cell , &interCell, &outsideInteractions[outInterIdx].relativeOutPosition, 1, idxLevel);
for(int cellIdx = blockIdx.x ; cellIdx < nbSafeInteractions ; cellIdx += gridDim.x){
for(int outInterIdx = safeInteractions[cellIdx] ; outInterIdx < safeInteractions[cellIdx+1] ; ++outInterIdx){
typename CellContainerClass::CompleteCellClass interCell = cellsOther.getUpCell(outsideInteractions[outInterIdx].outsideIdxInBlock);
FCudaAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex);
typename CellContainerClass::CompleteCellClass cell = currentCells.getDownCell(outsideInteractions[outInterIdx].insideIdxInBlock);
FCudaAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex);
kernel->M2L( cell , &interCell, &outsideInteractions[outInterIdx].relativeOutPosition, 1, idxLevel);
}
}
}
else{
for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){
typename CellContainerClass::CompleteCellClass cell = cellsOther.getUpCell(outsideInteractions[outInterIdx].insideIdxInBlock);
FCudaAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex);
typename CellContainerClass::CompleteCellClass interCell = currentCells.getDownCell(outsideInteractions[outInterIdx].outsideIdxInBlock);
FCudaAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex);
const int otherPosition = FMGetOppositeInterIndex(outsideInteractions[outInterIdx].relativeOutPosition);
kernel->M2L( interCell , &cell, &otherPosition, 1, idxLevel);
for(int cellIdx = blockIdx.x ; cellIdx < nbSafeInteractions ; cellIdx += gridDim.x){
for(int outInterIdx = safeInteractions[cellIdx] ; outInterIdx < safeInteractions[cellIdx+1] ; ++outInterIdx){
typename CellContainerClass::CompleteCellClass cell = cellsOther.getUpCell(outsideInteractions[outInterIdx].insideIdxInBlock);
FCudaAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex);
typename CellContainerClass::CompleteCellClass interCell = currentCells.getDownCell(outsideInteractions[outInterIdx].outsideIdxInBlock);
FCudaAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex);
const int otherPosition = FMGetOppositeInterIndex(outsideInteractions[outInterIdx].relativeOutPosition);
kernel->M2L( interCell , &cell, &otherPosition, 1, idxLevel);
}
}
}
}
......
......@@ -24,6 +24,7 @@ class FStarPUFmmPriorities{
int insertionPositionM2L;
int insertionPositionM2LExtern;
int insertionPositionM2LLastLevel;
int insertionPositionM2LExternLastLevel;
int insertionPositionL2L;
int insertionPositionL2P;
int insertionPositionP2P;
......@@ -90,13 +91,16 @@ public:
insertionPositionM2LLastLevel = incPrio++;
FLOG( FLog::Controller << "\t M2L last " << insertionPositionM2LLastLevel << "\n" );
insertionPositionM2LExternLastLevel = incPrio++;
FLOG( FLog::Controller << "\t M2L extern last " << insertionPositionM2LExternLastLevel << "\n" );
insertionPositionL2P = incPrio++;
FLOG( FLog::Controller << "\t L2P " << insertionPositionL2P << "\n" );
insertionPositionP2PExtern = incPrio++;
FLOG( FLog::Controller << "\t P2P Outer " << insertionPositionP2PExtern << "\n" );
assert(incPrio == 8 + (treeHeight-3) + (treeHeight-3) + (treeHeight-3));
assert(incPrio == 9 + (treeHeight-3) + (treeHeight-3) + (treeHeight-3));
}
else{
int incPrio = 0;
......@@ -110,6 +114,7 @@ public:
insertionPositionM2L = -1;
insertionPositionM2LExtern = -1;
insertionPositionM2LLastLevel = -1;
insertionPositionM2LExternLastLevel = -1;
insertionPositionL2L = -1;
......@@ -158,7 +163,8 @@ public:
FLOG( FLog::Controller << "\t CPU prio M2L " << cpuCountPrio << " bucket " << prioM2LAtLevel << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = prioM2LAtLevel;
heteroprio->buckets[prioM2LAtLevel].valide_archs |= STARPU_CPU;
}
if(capacities->supportM2LExtern(FSTARPU_CPU_IDX)){
const int prioM2LAtLevelExtern = getInsertionPosM2LExtern(idxLevel);
FLOG( FLog::Controller << "\t CPU prio M2L extern " << cpuCountPrio << " bucket " << prioM2LAtLevelExtern << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = prioM2LAtLevelExtern;
......@@ -186,6 +192,12 @@ public:
heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = prioM2LAtLevel;
heteroprio->buckets[prioM2LAtLevel].valide_archs |= STARPU_CPU;
}
if( !workOnlyOnLeaves && capacities->supportM2LExtern(FSTARPU_CPU_IDX)){
const int prioM2LAtLevel = getInsertionPosM2LExtern(treeHeight-1);
FLOG( FLog::Controller << "\t CPU prio M2L " << cpuCountPrio << " bucket " << prioM2LAtLevel << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = prioM2LAtLevel;
heteroprio->buckets[prioM2LAtLevel].valide_archs |= STARPU_CPU;
}
if( !workOnlyOnLeaves && capacities->supportL2P(FSTARPU_CPU_IDX)){
FLOG( FLog::Controller << "\t CPU prio L2P " << cpuCountPrio << " bucket " << insertionPositionL2P << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = insertionPositionL2P;
......@@ -291,13 +303,28 @@ public:
if(!workOnlyOnLeaves && capacities->supportM2L(FSTARPU_CUDA_IDX)){
for(int idxLevel = 2 ; idxLevel < treeHeight ; ++idxLevel){
const int prioM2LAtLevel = getInsertionPosM2L(idxLevel);
FLOG( FLog::Controller << "\t CUDA prio M2L ex " << cudaCountPrio << " bucket " << prioM2LAtLevel << "\n" );
FLOG( FLog::Controller << "\t CUDA prio M2L " << cudaCountPrio << " bucket " << prioM2LAtLevel << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CUDA_IDX][cudaCountPrio++] = prioM2LAtLevel;
heteroprio->buckets[prioM2LAtLevel].valide_archs |= STARPU_CUDA;
heteroprio->buckets[prioM2LAtLevel].factor_base_arch_index = FSTARPU_CUDA_IDX;
#ifdef STARPU_USE_CPU
if(capacities->supportM2L(FSTARPU_CUDA_IDX)){
heteroprio->buckets[prioM2LAtLevel].slow_factors_per_index[FSTARPU_CPU_IDX] = 10.0f;
heteroprio->buckets[prioM2LAtLevel].slow_factors_per_index[FSTARPU_CPU_IDX] = 15.0f;
}
#endif
}
}
if(!workOnlyOnLeaves && capacities->supportM2LExtern(FSTARPU_CUDA_IDX)){
for(int idxLevel = 2 ; idxLevel < treeHeight ; ++idxLevel){
const int prioM2LExternAtLevel = getInsertionPosM2LExtern(idxLevel);
FLOG( FLog::Controller << "\t CUDA prio M2L ex " << cudaCountPrio << " bucket " << prioM2LExternAtLevel << "\n" );
heteroprio->prio_mapping_per_arch_index[FSTARPU_CUDA_IDX][cudaCountPrio++] = prioM2LExternAtLevel;
heteroprio->buckets[prioM2LExternAtLevel].valide_archs |= STARPU_CUDA;
heteroprio->buckets[prioM2LExternAtLevel].factor_base_arch_index = FSTARPU_CUDA_IDX;
#ifdef STARPU_USE_CPU
if(capacities->supportM2L(FSTARPU_CUDA_IDX)){
heteroprio->buckets[prioM2LExternAtLevel].slow_factors_per_index[FSTARPU_CPU_IDX] = 5.0f;
}
#endif
}
......@@ -360,7 +387,7 @@ public:
return (inLevel==treeHeight-1? insertionPositionM2LLastLevel : insertionPositionM2L + (inLevel - 2)*3);
}
int getInsertionPosM2LExtern(const int inLevel) const {
return (inLevel==treeHeight-1? insertionPositionM2LLastLevel : insertionPositionM2LExtern + (inLevel - 2)*3);
return (inLevel==treeHeight-1? insertionPositionM2LExternLastLevel : insertionPositionM2LExtern + (inLevel - 2)*3);
}
int getInsertionPosL2L(const int inLevel) const {
return insertionPositionL2L + (inLevel - 2)*3;
......
......@@ -118,6 +118,35 @@ public:
}
};
template <class BaseClass>
class FStarPUCudaP2PM2LCapacities : public BaseClass, public FStarPUAbstractCapacities {
bool check(const FStarPUTypes inPu) const override{
return inPu == FSTARPU_CPU_IDX;
}
public:
using BaseClass::BaseClass;
bool supportP2P(const FStarPUTypes inPu) const override {
return inPu == FSTARPU_CPU_IDX || inPu == FSTARPU_CUDA_IDX;
}
bool supportP2PExtern(const FStarPUTypes inPu) const override {
return inPu == FSTARPU_CPU_IDX || inPu == FSTARPU_CUDA_IDX;
}
bool supportP2PMpi(const FStarPUTypes inPu) const override {
return inPu == FSTARPU_CPU_IDX || inPu == FSTARPU_CUDA_IDX;
}
bool supportM2L(const FStarPUTypes inPu) const override {
return inPu == FSTARPU_CPU_IDX || inPu == FSTARPU_CUDA_IDX;
}
bool supportM2LExtern(const FStarPUTypes inPu) const override {
return inPu == FSTARPU_CPU_IDX || inPu == FSTARPU_CUDA_IDX;
}
bool supportM2LMpi(const FStarPUTypes inPu) const override {
return inPu == FSTARPU_CPU_IDX || inPu == FSTARPU_CUDA_IDX;
}
};
template <class BaseClass>
class FStarPUCudaM2LCapacities : public BaseClass, public FStarPUAbstractCapacities {
bool check(const FStarPUTypes inPu) const override{
......
......@@ -23,8 +23,8 @@ public:
typedef FCudaGroupAttachedLeaf<FReal,1,4,FReal> ContainerClass;
typedef FCudaCompositeCell<FBasicCellPOD,FCudaUnifCellPODPole<FReal,ORDER>,FCudaUnifCellPODLocal<FReal,ORDER> > CellClass;
static const int NB_THREAD_GROUPS = 1;// TODO 30; // 2 x 15
static const int THREAD_GROUP_SIZE = 1;// TODO 256;
static const int NB_THREAD_GROUPS = 30; // 2 x 15
static const int THREAD_GROUP_SIZE = 256;
static const int SHARED_MEMORY_SIZE = 512;// 49152
FUnifCudaSharedData<FReal,ORDER> data;
......@@ -52,7 +52,7 @@ public:
FCudaUnifComplex<FReal> *const __restrict__ FX) const
{
// Perform entrywise product manually
for (unsigned int j = 0 ; j < data.opt_rc; ++j){
for (unsigned int j = threadIdx.x ; j < data.opt_rc; j += blockDim.x){
FCudaUnifComplex<FReal> FC_scale;
//FComplex<FReal>(scale*FC[idx*opt_rc + j].getReal(), scale*FC[idx*opt_rc + j].getImag()),
FC_scale.complex[0] = scale*data.FC[idx*data.opt_rc + j].complex[0];
......@@ -379,7 +379,6 @@ public:
template <class FReal, int ORDER>
void FUnifCudaFillObject(void* cudaKernel, const FUnifCudaSharedData<FReal,ORDER>& hostData){
printf("FUnifCudaFillObject cudaKernel %p\n", cudaKernel);
FUnifCudaSharedData<FReal,ORDER>* cudaData = &((FUnifCuda<FReal,ORDER>*)cudaKernel)->data;
FCudaCheck( cudaMemcpy( cudaData, &hostData, sizeof(FUnifCudaSharedData<FReal,ORDER>),
cudaMemcpyHostToDevice ) );
......
......@@ -79,7 +79,9 @@ int main(int argc, char* argv[]){
typedef FP2PGroupParticleContainer<FReal> GroupContainerClass;
typedef FGroupTree< FReal, GroupCellClass, GroupCellSymbClass, GroupCellUpClass, GroupCellDownClass, GroupContainerClass, 1, 4, FReal> GroupOctreeClass;
typedef FStarPUCudaM2LCapacities<FUnifKernel<FReal,GroupCellClass,GroupContainerClass,MatrixKernelClass,ORDER>> GroupKernelClass;
//typedef FStarPUCudaM2LCapacities<FUnifKernel<FReal,GroupCellClass,GroupContainerClass,MatrixKernelClass,ORDER>> GroupKernelClass;
//typedef FStarPUCudaP2PCapacities<FUnifKernel<FReal,GroupCellClass,GroupContainerClass,MatrixKernelClass,ORDER>> GroupKernelClass;
typedef FStarPUCudaP2PM2LCapacities<FUnifKernel<FReal,GroupCellClass,GroupContainerClass,MatrixKernelClass,ORDER>> GroupKernelClass;
typedef FStarPUCpuWrapper<typename GroupOctreeClass::CellGroupClass, GroupCellClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupContainerClass> GroupCpuWrapper;
typedef FStarPUCudaWrapper<GroupKernelClass,
......@@ -161,7 +163,7 @@ int main(int argc, char* argv[]){
std::cout << "Kernel executed in in " << timer.tacAndElapsed() << "s\n";
// Validate the result
if(FParameters::existParameter(argc, argv, LocalOptionNoValidate.options) == true){
if(FParameters::existParameter(argc, argv, LocalOptionNoValidate.options) == false){
FSize offsetParticles = 0;
FReal*const allPhysicalValues = allParticles.getPhysicalValues();
FReal*const allPosX = const_cast<FReal*>( allParticles.getPositions()[0]);
......
......@@ -60,7 +60,7 @@ int main(int argc, char* argv[]){
// Initialize the types
typedef double FReal;
static const int ORDER = 6;
static const int ORDER = 5;
typedef FInterpMatrixKernelR<FReal> MatrixKernelClass;
typedef FUnifCellPODCore GroupCellSymbClass;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment