Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
solverstack
ScalFMM
Commits
b844af75
Commit
b844af75
authored
Jan 21, 2016
by
BRAMAS Berenger
Browse files
make the P2P cuda working for several thread group
parent
f0494086
Changes
5
Expand all
Hide whitespace changes
Inline
Side-by-side
Src/GroupTree/Cuda/FCudaDeviceWrapper.cu
View file @
b844af75
This diff is collapsed.
Click to expand it.
Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp
View file @
b844af75
...
...
@@ -63,7 +63,8 @@ void FCuda__directInoutPassCallbackMpi(
unsigned
char
*
containersPtr
,
std
::
size_t
containersSize
,
unsigned
char
*
containersDownPtr
,
unsigned
char
*
externalContainersPtr
,
std
::
size_t
externalContainersSize
,
const
OutOfBlockInteraction
*
outsideInteractions
,
int
nbOutsideInteractions
,
const
int
treeHeight
,
CudaKernelClass
*
kernel
,
cudaStream_t
currentStream
,
int
nbOutsideInteractions
,
const
int
safeOuterInteractions
[],
const
int
counterOuterCell
,
const
int
treeHeight
,
CudaKernelClass
*
kernel
,
cudaStream_t
currentStream
,
const
dim3
inGridSize
,
const
dim3
inBlocksSize
);
#endif
template
<
class
SymboleCellClass
,
class
PoleCellClass
,
class
LocalCellClass
,
...
...
@@ -78,8 +79,11 @@ template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
void
FCuda__directInoutPassCallback
(
unsigned
char
*
containersPtr
,
std
::
size_t
containersSize
,
unsigned
char
*
containersDownPtr
,
unsigned
char
*
externalContainersPtr
,
std
::
size_t
externalContainersSize
,
unsigned
char
*
externalContainersDownPtr
,
const
OutOfBlockInteraction
*
outsideInteractions
,
int
nbOutsideInteractions
,
const
int
treeHeight
,
CudaKernelClass
*
kernel
,
cudaStream_t
currentStream
,
const
OutOfBlockInteraction
*
outsideInteractions
,
int
nbOutsideInteractions
,
const
int
safeOuterInteractions
[],
const
int
counterOuterCell
,
const
OutOfBlockInteraction
*
insideInteractions
,
const
int
safeInnterInteractions
[],
const
int
counterInnerCell
,
const
int
treeHeight
,
CudaKernelClass
*
kernel
,
cudaStream_t
currentStream
,
const
dim3
inGridSize
,
const
dim3
inBlocksSize
);
template
<
class
SymboleCellClass
,
class
PoleCellClass
,
class
LocalCellClass
,
...
...
Src/GroupTree/P2P/FCudaP2P.hpp
View file @
b844af75
...
...
@@ -52,7 +52,7 @@ public:
typedef
FCudaGroupAttachedLeaf
<
FReal
,
1
,
4
,
FReal
>
ContainerClass
;
typedef
FCudaCompositeCell
<
FCudaEmptyCellSymb
,
int
,
int
>
CellClass
;
static
const
int
SHARE
_SIZE
=
1
;
//128;
static
const
int
THREAD_GROUP
_SIZE
=
1
;
//128;
__device__
void
P2M
(
CellClass
/*pole*/
,
const
ContainerClass
*
const
/*particles*/
)
{
}
...
...
@@ -90,13 +90,13 @@ public:
targetZ
=
(
threadCompute
?
targets
->
getPositions
()[
2
][
idxPart
]
:
0
);
targetPhys
=
(
threadCompute
?
targets
->
getAttribute
(
0
)[
idxPart
]
:
0
);
for
(
int
idxCopy
=
0
;
idxCopy
<
targets
->
getNbParticles
()
;
idxCopy
+=
SHARE
_SIZE
){
__shared__
FReal
sourcesX
[
SHARE
_SIZE
];
__shared__
FReal
sourcesY
[
SHARE
_SIZE
];
__shared__
FReal
sourcesZ
[
SHARE
_SIZE
];
__shared__
FReal
sourcesPhys
[
SHARE
_SIZE
];
for
(
int
idxCopy
=
0
;
idxCopy
<
targets
->
getNbParticles
()
;
idxCopy
+=
THREAD_GROUP
_SIZE
){
__shared__
FReal
sourcesX
[
THREAD_GROUP
_SIZE
];
__shared__
FReal
sourcesY
[
THREAD_GROUP
_SIZE
];
__shared__
FReal
sourcesZ
[
THREAD_GROUP
_SIZE
];
__shared__
FReal
sourcesPhys
[
THREAD_GROUP
_SIZE
];
const
int
nbCopies
=
Min
(
SHARE
_SIZE
,
targets
->
getNbParticles
()
-
idxCopy
);
const
int
nbCopies
=
Min
(
THREAD_GROUP
_SIZE
,
targets
->
getNbParticles
()
-
idxCopy
);
if
(
threadIdx
.
x
<
nbCopies
){
sourcesX
[
threadIdx
.
x
]
=
targets
->
getPositions
()[
0
][
threadIdx
.
x
+
idxCopy
];
sourcesY
[
threadIdx
.
x
]
=
targets
->
getPositions
()[
1
][
threadIdx
.
x
+
idxCopy
];
...
...
@@ -187,13 +187,13 @@ public:
targetZ
=
(
threadCompute
?
targets
->
getPositions
()[
2
][
idxPart
]
:
0
);
targetPhys
=
(
threadCompute
?
targets
->
getAttribute
(
0
)[
idxPart
]
:
0
);
for
(
int
idxCopy
=
0
;
idxCopy
<
directNeighborsParticles
[
idxNeigh
].
getNbParticles
()
;
idxCopy
+=
SHARE
_SIZE
){
__shared__
FReal
sourcesX
[
SHARE
_SIZE
];
__shared__
FReal
sourcesY
[
SHARE
_SIZE
];
__shared__
FReal
sourcesZ
[
SHARE
_SIZE
];
__shared__
FReal
sourcesPhys
[
SHARE
_SIZE
];
for
(
int
idxCopy
=
0
;
idxCopy
<
directNeighborsParticles
[
idxNeigh
].
getNbParticles
()
;
idxCopy
+=
THREAD_GROUP
_SIZE
){
__shared__
FReal
sourcesX
[
THREAD_GROUP
_SIZE
];
__shared__
FReal
sourcesY
[
THREAD_GROUP
_SIZE
];
__shared__
FReal
sourcesZ
[
THREAD_GROUP
_SIZE
];
__shared__
FReal
sourcesPhys
[
THREAD_GROUP
_SIZE
];
const
int
nbCopies
=
Min
(
SHARE
_SIZE
,
directNeighborsParticles
[
idxNeigh
].
getNbParticles
()
-
idxCopy
);
const
int
nbCopies
=
Min
(
THREAD_GROUP
_SIZE
,
directNeighborsParticles
[
idxNeigh
].
getNbParticles
()
-
idxCopy
);
if
(
threadIdx
.
x
<
nbCopies
){
sourcesX
[
threadIdx
.
x
]
=
directNeighborsParticles
[
idxNeigh
].
getPositions
()[
0
][
threadIdx
.
x
+
idxCopy
];
sourcesY
[
threadIdx
.
x
]
=
directNeighborsParticles
[
idxNeigh
].
getPositions
()[
1
][
threadIdx
.
x
+
idxCopy
];
...
...
@@ -259,13 +259,13 @@ public:
targetZ
=
(
threadCompute
?
targets
->
getPositions
()[
2
][
idxPart
]
:
0
);
targetPhys
=
(
threadCompute
?
targets
->
getAttribute
(
0
)[
idxPart
]
:
0
);
for
(
int
idxCopy
=
0
;
idxCopy
<
directNeighborsParticles
[
idxNeigh
].
getNbParticles
()
;
idxCopy
+=
SHARE
_SIZE
){
__shared__
FReal
sourcesX
[
SHARE
_SIZE
];
__shared__
FReal
sourcesY
[
SHARE
_SIZE
];
__shared__
FReal
sourcesZ
[
SHARE
_SIZE
];
__shared__
FReal
sourcesPhys
[
SHARE
_SIZE
];
for
(
int
idxCopy
=
0
;
idxCopy
<
directNeighborsParticles
[
idxNeigh
].
getNbParticles
()
;
idxCopy
+=
THREAD_GROUP
_SIZE
){
__shared__
FReal
sourcesX
[
THREAD_GROUP
_SIZE
];
__shared__
FReal
sourcesY
[
THREAD_GROUP
_SIZE
];
__shared__
FReal
sourcesZ
[
THREAD_GROUP
_SIZE
];
__shared__
FReal
sourcesPhys
[
THREAD_GROUP
_SIZE
];
const
int
nbCopies
=
Min
(
SHARE
_SIZE
,
directNeighborsParticles
[
idxNeigh
].
getNbParticles
()
-
idxCopy
);
const
int
nbCopies
=
Min
(
THREAD_GROUP
_SIZE
,
directNeighborsParticles
[
idxNeigh
].
getNbParticles
()
-
idxCopy
);
if
(
threadIdx
.
x
<
nbCopies
){
sourcesX
[
threadIdx
.
x
]
=
directNeighborsParticles
[
idxNeigh
].
getPositions
()[
0
][
threadIdx
.
x
+
idxCopy
];
sourcesY
[
threadIdx
.
x
]
=
directNeighborsParticles
[
idxNeigh
].
getPositions
()[
1
][
threadIdx
.
x
+
idxCopy
];
...
...
@@ -321,12 +321,12 @@ public:
// nothing to do
}
__host__
static
dim3
GetGridSize
(
const
int
intervalSize
){
return
1
;
//intervalSize
;
__host__
static
dim3
GetGridSize
(
const
int
/*
intervalSize
*/
){
return
24
;
}
__host__
static
dim3
GetBlocksSize
(){
return
SHARE
_SIZE
;
return
THREAD_GROUP
_SIZE
;
}
};
...
...
Src/GroupTree/StarPUUtils/FStarPUCudaWrapper.hpp
View file @
b844af75
...
...
@@ -218,9 +218,13 @@ public:
const
std
::
vector
<
OutOfBlockInteraction
>*
outsideInteractions
=
nullptr
;
int
intervalSize
=
0
;
starpu_codelet_unpack_args
(
cl_arg
,
&
worker
,
&
outsideInteractions
,
&
intervalSize
);
const
int
nbInteractions
=
int
(
outsideInteractions
->
size
());
CudaKernelClass
*
kernel
=
worker
->
get
<
ThisClass
>
(
FSTARPU_CPU_IDX
)
->
kernels
[
starpu_worker_get_id
()];
std
::
unique_ptr
<
int
[]
>
safeOuterInteractions
(
new
int
[
nbInteractions
+
1
]);
const
int
counterOuterCell
=
GetClusterOfInteractionsOutside
(
safeOuterInteractions
.
get
(),
outsideInteractions
->
data
(),
nbInteractions
);
FCuda__directInoutPassCallbackMpi
<
SymboleCellClass
,
PoleCellClass
,
LocalCellClass
,
CudaCellGroupClass
,
CudaParticleGroupClass
,
CudaParticleContainerClass
,
CudaKernelClass
>
(
(
unsigned
char
*
)
STARPU_VARIABLE_GET_PTR
(
buffers
[
0
]),
...
...
@@ -228,7 +232,8 @@ public:
(
unsigned
char
*
)
STARPU_VARIABLE_GET_PTR
(
buffers
[
1
]),
(
unsigned
char
*
)
STARPU_VARIABLE_GET_PTR
(
buffers
[
2
]),
STARPU_VARIABLE_GET_ELEMSIZE
(
buffers
[
2
]),
outsideInteractions
->
data
(),
outsideInteractions
->
size
(),
outsideInteractions
->
data
(),
nbInteractions
,
safeOuterInteractions
.
get
(),
counterOuterCell
,
worker
->
get
<
ThisClass
>
(
FSTARPU_CPU_IDX
)
->
treeHeight
,
kernel
,
starpu_cuda_get_local_stream
(),
FCuda__GetGridSize
(
kernel
,
intervalSize
),
FCuda__GetBlockSize
(
kernel
));
}
...
...
@@ -252,14 +257,76 @@ public:
FCuda__GetGridSize
(
kernel
,
intervalSize
),
FCuda__GetBlockSize
(
kernel
));
}
static
int
GetClusterOfInteractionsInside
(
int
safeOuterInteractions
[],
const
OutOfBlockInteraction
outsideInteractions
[],
const
int
nbInteractions
){
safeOuterInteractions
[
0
]
=
0
;
safeOuterInteractions
[
1
]
=
0
;
int
counterInnerCell
=
1
;
for
(
int
idxInter
=
0
;
idxInter
<
int
(
nbInteractions
)
;
++
idxInter
){
if
(
outsideInteractions
[
safeOuterInteractions
[
counterInnerCell
]].
insideIdxInBlock
!=
outsideInteractions
[
idxInter
].
insideIdxInBlock
){
FAssertLF
(
outsideInteractions
[
safeOuterInteractions
[
counterInnerCell
]].
insideIdxInBlock
<
outsideInteractions
[
idxInter
].
insideIdxInBlock
);
counterInnerCell
+=
1
;
safeOuterInteractions
[
counterInnerCell
]
=
safeOuterInteractions
[
counterInnerCell
-
1
];
}
else
{
safeOuterInteractions
[
counterInnerCell
]
+=
1
;
}
}
FAssertLF
(
safeOuterInteractions
[
counterInnerCell
]
==
nbInteractions
);
return
counterInnerCell
;
}
static
int
GetClusterOfInteractionsOutside
(
int
safeOuterInteractions
[],
const
OutOfBlockInteraction
outsideInteractions
[],
const
int
nbInteractions
){
safeOuterInteractions
[
0
]
=
0
;
safeOuterInteractions
[
1
]
=
0
;
int
counterInnerCell
=
1
;
for
(
int
idxInter
=
0
;
idxInter
<
int
(
nbInteractions
)
;
++
idxInter
){
if
(
outsideInteractions
[
safeOuterInteractions
[
counterInnerCell
]].
outsideIdxInBlock
!=
outsideInteractions
[
idxInter
].
outsideIdxInBlock
){
FAssertLF
(
outsideInteractions
[
safeOuterInteractions
[
counterInnerCell
]].
outsideIdxInBlock
<
outsideInteractions
[
idxInter
].
outsideIdxInBlock
);
counterInnerCell
+=
1
;
safeOuterInteractions
[
counterInnerCell
]
=
safeOuterInteractions
[
counterInnerCell
-
1
];
}
else
{
safeOuterInteractions
[
counterInnerCell
]
+=
1
;
}
}
FAssertLF
(
safeOuterInteractions
[
counterInnerCell
]
==
nbInteractions
);
return
counterInnerCell
;
}
static
void
directInoutPassCallback
(
void
*
buffers
[],
void
*
cl_arg
){
FStarPUPtrInterface
*
worker
=
nullptr
;
const
std
::
vector
<
OutOfBlockInteraction
>*
outsideInteractions
=
nullptr
;
int
intervalSize
=
0
;
starpu_codelet_unpack_args
(
cl_arg
,
&
worker
,
&
outsideInteractions
,
&
intervalSize
);
const
int
nbInteractions
=
int
(
outsideInteractions
->
size
());
CudaKernelClass
*
kernel
=
worker
->
get
<
ThisClass
>
(
FSTARPU_CPU_IDX
)
->
kernels
[
starpu_worker_get_id
()];
// outsideInteractions is sorted following the outIndex
// Compute the cell interval
std
::
unique_ptr
<
int
[]
>
safeOuterInteractions
(
new
int
[
nbInteractions
+
1
]);
const
int
counterOuterCell
=
GetClusterOfInteractionsOutside
(
safeOuterInteractions
.
get
(),
outsideInteractions
->
data
(),
nbInteractions
);
std
::
unique_ptr
<
OutOfBlockInteraction
[]
>
insideInteractions
(
new
OutOfBlockInteraction
[
nbInteractions
]);
memcpy
(
insideInteractions
.
get
(),
outsideInteractions
->
data
(),
nbInteractions
*
sizeof
(
OutOfBlockInteraction
));
FQuickSort
<
OutOfBlockInteraction
>::
QsSequential
(
insideInteractions
.
get
(),
nbInteractions
,
[](
const
OutOfBlockInteraction
&
inter1
,
const
OutOfBlockInteraction
&
inter2
){
// Could be insideIndex since the block are in morton order
return
inter1
.
insideIdxInBlock
<=
inter2
.
insideIdxInBlock
;
});
std
::
unique_ptr
<
int
[]
>
safeInnterInteractions
(
new
int
[
nbInteractions
+
1
]);
const
int
counterInnerCell
=
GetClusterOfInteractionsInside
(
safeInnterInteractions
.
get
(),
insideInteractions
.
get
(),
nbInteractions
);
FCuda__directInoutPassCallback
<
SymboleCellClass
,
PoleCellClass
,
LocalCellClass
,
CudaCellGroupClass
,
CudaParticleGroupClass
,
CudaParticleContainerClass
,
CudaKernelClass
>
(
(
unsigned
char
*
)
STARPU_VARIABLE_GET_PTR
(
buffers
[
0
]),
...
...
@@ -268,7 +335,11 @@ public:
(
unsigned
char
*
)
STARPU_VARIABLE_GET_PTR
(
buffers
[
2
]),
STARPU_VARIABLE_GET_ELEMSIZE
(
buffers
[
2
]),
(
unsigned
char
*
)
STARPU_VARIABLE_GET_PTR
(
buffers
[
3
]),
outsideInteractions
->
data
(),
int
(
outsideInteractions
->
size
()),
worker
->
get
<
ThisClass
>
(
FSTARPU_CPU_IDX
)
->
treeHeight
,
outsideInteractions
->
data
(),
nbInteractions
,
safeOuterInteractions
.
get
(),
counterOuterCell
,
insideInteractions
.
get
(),
safeInnterInteractions
.
get
(),
counterInnerCell
,
worker
->
get
<
ThisClass
>
(
FSTARPU_CPU_IDX
)
->
treeHeight
,
kernel
,
starpu_cuda_get_local_stream
(),
FCuda__GetGridSize
(
kernel
,
intervalSize
),
FCuda__GetBlockSize
(
kernel
));
}
...
...
Tests/GroupTree/testBlockedRotationCuda.cpp
View file @
b844af75
...
...
@@ -39,6 +39,7 @@
#include
"../../Src/GroupTree/Cuda/FCudaGroupOfParticles.hpp"
#include
"../../Src/GroupTree/Cuda/FCudaGroupOfCells.hpp"
#include
"../../Src/GroupTree/StarPUUtils/FStarPUKernelCapacities.hpp"
#include
"../../Src/Utils/FParameterNames.hpp"
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment