Commit d0b05583 authored by BRAMAS Berenger's avatar BRAMAS Berenger

Work in progress for the use of opencl with starpu

parent f7829bfb
......@@ -293,7 +293,7 @@ if( ScalFMM_USE_STARPU )
MESSAGE(STATUS " STARPU_INCLUDES = ${STARPU_INCLUDES}")
OPTION( ScalFMM_USE_CUDA "Set to ON to use CUDA with StarPU" OFF )
MESSAGE( STATUS "ScalFMM_USE_CUDA = ${ScalFMM_USE_CUDA}" )
MESSAGE( STATUS "ScalFMM_USE_CUDA = ${ScalFMM_USE_CUDA}" )
if(ScalFMM_USE_CUDA)
execute_process(COMMAND nvcc --version ERROR_VARIABLE cuda_error_output OUTPUT_QUIET)
if(cuda_error_output)
......@@ -311,9 +311,17 @@ if( ScalFMM_USE_STARPU )
include_directories($ENV{CUDA_INC})
SET(SCALFMM_LIBRARIES "${SCALFMM_LIBRARIES}; -L$ENV{CUDA_LIB}; -lcudart")
endif()
OPTION( ScalFMM_USE_OPENCL "Set to ON to use OPENCL with StarPU" OFF )
MESSAGE( STATUS "ScalFMM_USE_OPENCL = ${ScalFMM_USE_OPENCL}" )
if(ScalFMM_USE_OPENCL)
include_directories($ENV{OPENCL_INC})
SET(SCALFMM_LIBRARIES "${SCALFMM_LIBRARIES}; -L$ENV{OPENCL_LIB}; -lOpenCL")
endif()
endif(ScalFMM_USE_STARPU)
list(APPEND FUSE_LIST "STARPU")
list(APPEND FUSE_LIST "CUDA")
list(APPEND FUSE_LIST "OPENCL")
#
##################################################################
# Use SSE #
......
// Keep in private GIT
// @SCALFMM_PRIVATE
#ifndef FGROUPTASKSTARPUALGORITHM_HPP
......@@ -19,9 +18,8 @@
#include <omp.h>
//extern "C"{
#include <starpu.h>
//}
#include "FStarPUUtils.hpp"
#ifdef STARPU_USE_CPU
#include "FStarPUCpuWrapper.hpp"
......@@ -33,16 +31,19 @@
#include "Cuda/FCudaGroupOfParticles.hpp"
#include "Cuda/FCudaGroupOfCells.hpp"
#endif
#ifdef STARPU_USE_OPENCL
#ifdef ScalFMM_ENABLE_OPENCL_KERNEL
#include "FStarPUOpenClWrapper.hpp"
#include "OpenCl/FOpenCLDeviceWrapper.hpp"
#endif
#include "FStarPUUtils.hpp"
template <class OctreeClass, class CellContainerClass, class CellClass, class KernelClass, class ParticleGroupClass, class ParticleContainerClass
#ifdef ScalFMM_ENABLE_CUDA_KERNEL
, class CudaCellContainerClass = FCudaGroupOfCells<0>, class CudaParticleGroupClass = FCudaGroupOfParticles<0, int>, class CudaParticleContainerClass = FCudaGroupAttachedLeaf<0, int>,
class CudaKernelClass = FCudaEmptyKernel<>
#endif
#ifdef ScalFMM_ENABLE_OPENCL_KERNEL
, class OpenCLDeviceWrapperClass = FOpenCLDeviceWrapper<KernelClass, nullptr>
#endif
>
class FGroupTaskStarPUAlgorithm {
......@@ -50,6 +51,9 @@ protected:
typedef FGroupTaskStarPUAlgorithm<OctreeClass, CellContainerClass, CellClass, KernelClass, ParticleGroupClass, ParticleContainerClass
#ifdef ScalFMM_ENABLE_CUDA_KERNEL
, CudaCellContainerClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass
#endif
#ifdef ScalFMM_ENABLE_OPENCL_KERNEL
, OpenCLDeviceWrapperClass
#endif
> ThisClass;
......@@ -89,8 +93,8 @@ protected:
typedef FStarPUCudaWrapper<KernelClass, CudaCellContainerClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass> StarPUCudaWrapperClass;
StarPUCudaWrapperClass cudaWrapper;
#endif
#ifdef STARPU_USE_OPENCL
typedef FStarPUOpenClWrapper<CellContainerClass, CellClass, KernelClass, ParticleGroupClass, ParticleContainerClass> StarPUOpenClWrapperClass;
#ifdef ScalFMM_ENABLE_CUDA_KERNEL
typedef FStarPUOpenClWrapper<KernelClass, OpenCLDeviceWrapperClass> StarPUOpenClWrapperClass;
StarPUOpenClWrapperClass openclWrapper;
#endif
......@@ -107,7 +111,7 @@ public:
#ifdef ScalFMM_ENABLE_CUDA_KERNEL
cudaWrapper(tree->getHeight()),
#endif
#ifdef STARPU_USE_OPENCL
#ifdef ScalFMM_ENABLE_CUDA_KERNEL
openclWrapper(tree->getHeight()),
#endif
wrapperptr(&wrappers){
......@@ -138,7 +142,7 @@ public:
});
wrappers.set(FSTARPU_CUDA_IDX, &cudaWrapper);
#endif
#ifdef STARPU_USE_OPENCL
#ifdef ScalFMM_ENABLE_CUDA_KERNEL
FStarPUUtils::ExecOnWorkers(STARPU_OPENCL, [&](){
starpu_pthread_mutex_lock(&initMutex);
openclWrapper.initKernel(starpu_worker_get_id(), inKernels);
......@@ -211,7 +215,7 @@ protected:
p2m_cl.where |= STARPU_CUDA;
}
#endif
#ifdef STARPU_USE_OPENCL
#ifdef ScalFMM_ENABLE_CUDA_KERNEL
if(originalCpuKernel->supportP2M(FSTARPU_OPENCL_IDX)){
p2m_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::bottomPassCallback;
p2m_cl.where |= STARPU_OPENCL;
......@@ -237,7 +241,7 @@ protected:
m2m_cl[idx].where |= STARPU_CUDA;
}
#endif
#ifdef STARPU_USE_OPENCL
#ifdef ScalFMM_ENABLE_CUDA_KERNEL
if(originalCpuKernel->supportM2M(FSTARPU_OPENCL_IDX)){
m2m_cl[idx].opencl_funcs[0] = StarPUOpenClWrapperClass::upwardPassCallback;
m2m_cl[idx].where |= STARPU_OPENCL;
......@@ -260,7 +264,7 @@ protected:
l2l_cl[idx].where |= STARPU_CUDA;
}
#endif
#ifdef STARPU_USE_OPENCL
#ifdef ScalFMM_ENABLE_CUDA_KERNEL
if(originalCpuKernel->supportL2L(FSTARPU_OPENCL_IDX)){
l2l_cl[idx].opencl_funcs[0] = StarPUOpenClWrapperClass::downardPassCallback;
l2l_cl[idx].where |= STARPU_OPENCL;
......@@ -290,7 +294,7 @@ protected:
l2p_cl.where |= STARPU_CUDA;
}
#endif
#ifdef STARPU_USE_OPENCL
#ifdef ScalFMM_ENABLE_CUDA_KERNEL
if(originalCpuKernel->supportL2P(FSTARPU_OPENCL_IDX)){
l2p_cl.opencl_funcs[0] = StarPUOpenClWrapperClass::mergePassCallback;
l2p_cl.where |= STARPU_OPENCL;
......@@ -314,7 +318,7 @@ protected:
p2p_cl_in.where |= STARPU_CUDA;
}
#endif
#ifdef STARPU_USE_OPENCL
#ifdef ScalFMM_ENABLE_CUDA_KERNEL
if(originalCpuKernel->supportP2P(FSTARPU_OPENCL_IDX)){
p2p_cl_in.opencl_funcs[0] = StarPUOpenClWrapperClass::directInPassCallback;
p2p_cl_in.where |= STARPU_OPENCL;
......@@ -336,7 +340,7 @@ protected:
p2p_cl_inout.where |= STARPU_CUDA;
}
#endif
#ifdef STARPU_USE_OPENCL
#ifdef ScalFMM_ENABLE_CUDA_KERNEL
if(originalCpuKernel->supportP2P(FSTARPU_OPENCL_IDX)){
p2p_cl_inout.opencl_funcs[0] = StarPUOpenClWrapperClass::directInoutPassCallback;
p2p_cl_inout.where |= STARPU_OPENCL;
......@@ -360,7 +364,7 @@ protected:
m2l_cl_in.where |= STARPU_CUDA;
}
#endif
#ifdef STARPU_USE_OPENCL
#ifdef ScalFMM_ENABLE_CUDA_KERNEL
if(originalCpuKernel->supportM2L(FSTARPU_OPENCL_IDX)){
m2l_cl_in.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInPassCallback;
m2l_cl_in.where |= STARPU_OPENCL;
......@@ -383,7 +387,7 @@ protected:
m2l_cl_inout.where |= STARPU_CUDA;
}
#endif
#ifdef STARPU_USE_OPENCL
#ifdef ScalFMM_ENABLE_CUDA_KERNEL
if(originalCpuKernel->supportM2L(FSTARPU_OPENCL_IDX)){
m2l_cl_inout.opencl_funcs[0] = StarPUOpenClWrapperClass::transferInoutPassCallback;
m2l_cl_inout.where |= STARPU_OPENCL;
......
This diff is collapsed.
......@@ -8,9 +8,7 @@
/////////////////////////////////////////////////////
//extern "C"{
#include <starpu.h>
//}
/////////////////////////////////////////////////////
......@@ -23,8 +21,24 @@
#if defined(STARPU_USE_CUDA) && defined(ScalFMM_USE_CUDA)
#define ScalFMM_ENABLE_CUDA_KERNEL
#else
#if defined(STARPU_USE_CUDA) || defined(ScalFMM_USE_CUDA)
#warning CUDA is turned off because it is not supported by ScalFMM AND StarPU.
#if defined(STARPU_USE_CUDA)
#warning CUDA is turned off because it is not supported by ScalFMM.
#endif
#if defined(ScalFMM_USE_CUDA)
#warning CUDA is turned off because it is not supported by StarPU.
#endif
#endif
/////////////////////////////////////////////////////
#if defined(STARPU_USE_OPENCL) && defined(ScalFMM_USE_OPENCL)
#define ScalFMM_ENABLE_OPENCL_KERNEL
#else
#if defined(STARPU_USE_OPENCL)
#warning OPENCL is turned off because it is not supported by ScalFMM.
#endif
#if defined(ScalFMM_USE_OPENCL)
#warning OPENCL is turned off because it is not supported by StarPU.
#endif
#endif
......
This diff is collapsed.
// @SCALFMM_PRIVATE
#ifndef FOPENCLDEVICEWRAPPER_HPP
#define FOPENCLDEVICEWRAPPER_HPP
#include "../../Utils/FGlobal.hpp"
#include "../../Core/FCoreCommon.hpp"
#include "../../Utils/FQuickSort.hpp"
#include "../../Containers/FTreeCoordinate.hpp"
#include "../../Utils/FLog.hpp"
#include "../../Utils/FTic.hpp"
#include "../../Utils/FAssert.hpp"
#include "../../Utils/FAlignedMemory.hpp"
#include "../../Utils/FAssert.hpp"
#include "../FOutOfBlockInteraction.hpp"
#include <starpu.h>
template <class OriginalKernelClass, const char* KernelFilename>
class FOpenCLDeviceWrapper {
protected:
static void SetKernelArgs(cl_kernel& kernel, const int pos){
}
template <class ParamClass, class... Args>
static void SetKernelArgs(cl_kernel* kernel, const int pos, ParamClass* param, Args... args){
FAssertLF(clSetKernelArg(kernel, pos, sizeof(*param), param) == 0);
SetKernelArgs(kernel, pos+1, args...);
}
int workerId;
int workerDevid;
struct starpu_opencl_program opencl_code;
cl_kernel kernel_bottomPassPerform;
cl_command_queue queue_bottomPassPerform;
cl_kernel kernel_upwardPassPerform;
cl_command_queue queue_upwardPassPerform;
cl_kernel kernel_transferInoutPassPerformMpi;
cl_command_queue queue_transferInoutPassPerformMpi;
cl_kernel kernel_transferInPassPerform;
cl_command_queue queue_transferInPassPerform;
cl_kernel kernel_transferInoutPassPerform;
cl_command_queue queue_transferInoutPassPerform;
cl_kernel kernel_downardPassPerform;
cl_command_queue queue_downardPassPerform;
cl_kernel kernel_directInoutPassPerformMpi;
cl_command_queue queue_directInoutPassPerformMpi;
cl_kernel kernel_directInoutPassPerform;
cl_command_queue queue_directInoutPassPerform;
cl_kernel kernel_directInPassPerform;
cl_command_queue queue_directInPassPerform;
cl_kernel kernel_mergePassPerform;
cl_command_queue queue_mergePassPerform;
public:
FOpenCLDeviceWrapper() : workerId(0) , workerDevid(0){
workerId = starpu_worker_get_id();
workerDevid = starpu_worker_get_devid(workerId);
if(KernelFilename){
const int err = starpu_opencl_load_opencl_from_file(KernelFilename, &opencl_code, NULL);
if(err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
FAssertLF( starpu_opencl_load_kernel(&kernel_bottomPassPerform, &queue_bottomPassPerform, &opencl_code, "bottomPassPerform", workerDevid) == CL_SUCCESS);
FAssertLF( starpu_opencl_load_kernel(&kernel_upwardPassPerform, &queue_upwardPassPerform, &opencl_code, "upwardPassPerform", workerDevid) == CL_SUCCESS);
FAssertLF( starpu_opencl_load_kernel(&kernel_transferInoutPassPerformMpi, &queue_transferInoutPassPerformMpi, &opencl_code, "transferInoutPassPerformMpi", workerDevid) == CL_SUCCESS);
FAssertLF( starpu_opencl_load_kernel(&kernel_transferInPassPerform, &queue_transferInPassPerform, &opencl_code, "transferInPassPerform", workerDevid) == CL_SUCCESS);
FAssertLF( starpu_opencl_load_kernel(&kernel_transferInoutPassPerform, &queue_transferInoutPassPerform, &opencl_code, "transferInoutPassPerform", workerDevid) == CL_SUCCESS);
FAssertLF( starpu_opencl_load_kernel(&kernel_downardPassPerform, &queue_downardPassPerform, &opencl_code, "downardPassPerform", workerDevid) == CL_SUCCESS);
FAssertLF( starpu_opencl_load_kernel(&kernel_directInoutPassPerformMpi, &queue_directInoutPassPerformMpi, &opencl_code, "directInoutPassPerformMpi", workerDevid) == CL_SUCCESS);
FAssertLF( starpu_opencl_load_kernel(&kernel_directInoutPassPerform, &queue_directInoutPassPerform, &opencl_code, "directInoutPassPerform", workerDevid) == CL_SUCCESS);
FAssertLF( starpu_opencl_load_kernel(&kernel_directInPassPerform, &queue_directInPassPerform, &opencl_code, "directInPassPerform", workerDevid) == CL_SUCCESS);
FAssertLF( starpu_opencl_load_kernel(&kernel_mergePassPerform, &queue_mergePassPerform, &opencl_code, "mergePassPerform", workerDevid) == CL_SUCCESS);
}
}
virtual void initDeviceFromKernel(const OriginalKernelClass& /*originalKernel*/){
}
virtual void releaseKernel(){
}
~FOpenCLDeviceWrapper(){
// Release
releaseKernel();
if(KernelFilename){
const int err = starpu_opencl_unload_opencl(&opencl_code);
if(err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
}
}
void bottomPassPerform(unsigned char* leafCellsPtr, size_t leafCellsSize, unsigned char* containersPtr, size_t containersSize){
SetKernelArgs(&kernel_bottomPassPerform, 0, &leafCellsPtr, &leafCellsSize, &containersPtr, &containersSize);
size_t dim = 1;
const int err = clEnqueueNDRangeKernel(queue_bottomPassPerform, kernel_bottomPassPerform, 1, NULL, &dim, NULL, 0, NULL, NULL);
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
}
void upwardPassPerform(unsigned char* currentCellsPtr, size_t currentCellsSize, unsigned char* subCellGroupsPtr, size_t subCellGroupsSize, int nbSubCellGroups, int idxLevel){
SetKernelArgs(&kernel_upwardPassPerform, 0, &currentCellsPtr, &currentCellsSize, &subCellGroupsPtr, &subCellGroupsSize, &nbSubCellGroups, &idxLevel);
size_t dim = 1;
const int err = clEnqueueNDRangeKernel(queue_upwardPassPerform, kernel_upwardPassPerform, 1, NULL, &dim, NULL, 0, NULL, NULL);
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
}
void transferInoutPassPerformMpi(unsigned char* currentCellsPtr,
size_t currentCellsSize, unsigned char* externalCellsPtr, size_t externalCellsSize, int idxLevel, cl_mem outsideInteractionsCl,
size_t outsideInteractionsSize){
SetKernelArgs(&kernel_transferInoutPassPerformMpi, 0, &currentCellsPtr,&currentCellsSize, &externalCellsPtr, &externalCellsSize, &idxLevel, &outsideInteractionsCl,
&outsideInteractionsSize);
size_t dim = 1;
const int err = clEnqueueNDRangeKernel(queue_transferInoutPassPerformMpi, kernel_transferInoutPassPerformMpi, 1, NULL, &dim, NULL, 0, NULL, NULL);
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
}
void transferInPassPerform(unsigned char* currentCellsPtr, size_t currentCellsSize, int idxLevel){
SetKernelArgs(&kernel_transferInPassPerform, 0, &currentCellsPtr, &currentCellsSize, &idxLevel);
size_t dim = 1;
const int err = clEnqueueNDRangeKernel(queue_transferInPassPerform, kernel_transferInPassPerform, 1, NULL, &dim, NULL, 0, NULL, NULL);
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
}
void transferInoutPassPerform(unsigned char* currentCellsPtr,
size_t currentCellsSize, unsigned char* externalCellsPtr, size_t externalCellsSize, int idxLevel, cl_mem outsideInteractionsCl,
size_t outsideInteractionsSize){
SetKernelArgs(&kernel_transferInoutPassPerform, 0, &currentCellsPtr,&currentCellsSize, &externalCellsPtr, &externalCellsSize, &idxLevel, &outsideInteractionsCl,&outsideInteractionsSize);
size_t dim = 1;
const int err = clEnqueueNDRangeKernel(queue_transferInoutPassPerform, kernel_transferInoutPassPerform, 1, NULL, &dim, NULL, 0, NULL, NULL);
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
}
void downardPassPerform(unsigned char* currentCellsPtr,
size_t currentCellsSize, unsigned char* subCellGroupsPtr, size_t subCellGroupsSize, int nbSubCellGroups, int idxLevel){
SetKernelArgs(&kernel_downardPassPerform, 0, &currentCellsPtr,
&currentCellsSize, &subCellGroupsPtr, &subCellGroupsSize, &nbSubCellGroups, &idxLevel);
size_t dim = 1;
const int err = clEnqueueNDRangeKernel(queue_downardPassPerform, kernel_downardPassPerform, 1, NULL, &dim, NULL, 0, NULL, NULL);
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
}
void directInoutPassPerformMpi(unsigned char* containersPtr,
size_t containersSize, unsigned char* externalContainersPtr, size_t externalContainersSize, cl_mem outsideInteractionsCl,
size_t outsideInteractionsSize){
SetKernelArgs(&kernel_directInoutPassPerformMpi, 0, &containersPtr,
&containersSize, &externalContainersPtr, &externalContainersSize, &outsideInteractionsCl,&outsideInteractionsSize);
size_t dim = 1;
const int err = clEnqueueNDRangeKernel(queue_directInoutPassPerformMpi, kernel_directInoutPassPerformMpi, 1, NULL, &dim, NULL, 0, NULL, NULL);
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
}
void directInPassPerform(unsigned char* containersPtr, size_t containerSize){
SetKernelArgs(&kernel_directInPassPerform, 0, &containersPtr, &containerSize);
size_t dim = 1;
const int err = clEnqueueNDRangeKernel(queue_directInPassPerform, kernel_directInPassPerform, 1, NULL, &dim, NULL, 0, NULL, NULL);
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
}
void directInoutPassPerform(unsigned char* containersPtr,
size_t containerSize, unsigned char* externalContainersPtr, size_t externalContainersSize, cl_mem outsideInteractionsCl,
size_t outsideInteractionsSize){
SetKernelArgs(&kernel_directInoutPassPerform, 0, &containersPtr,
&containerSize, &externalContainersPtr, &externalContainersSize, &outsideInteractionsCl, &outsideInteractionsSize);
size_t dim = 1;
const int err = clEnqueueNDRangeKernel(queue_directInoutPassPerform, kernel_directInoutPassPerform, 1, NULL, &dim, NULL, 0, NULL, NULL);
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
}
void mergePassPerform(unsigned char* leafCellsPtr,
size_t leafCellsSize, unsigned char* containersPtr, size_t containersSize){
SetKernelArgs(&kernel_mergePassPerform, 0, &leafCellsPtr, &leafCellsSize, &containersPtr, &containersSize);
size_t dim = 1;
const int err = clEnqueueNDRangeKernel(queue_mergePassPerform, kernel_mergePassPerform, 1, NULL, &dim, NULL, 0, NULL, NULL);
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
}
};
#endif // FOPENCLDEVICEWRAPPER_HPP
......@@ -60,6 +60,12 @@
#cmakedefine ScalFMM_USE_CUDA
///////////////////////////////////////////////////////
// OPENCL
///////////////////////////////////////////////////////
#cmakedefine ScalFMM_USE_OPENCL
///////////////////////////////////////////////////////
// STARPU
///////////////////////////////////////////////////////
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment