// @SCALFMM_PRIVATE
#ifndef FSTARPUCUDAWRAPPER_HPP
#define FSTARPUCUDAWRAPPER_HPP

#include "../Utils/FGlobal.hpp"
#include "../Core/FCoreCommon.hpp"
#include "../Utils/FQuickSort.hpp"
#include "../Containers/FTreeCoordinate.hpp"
#include "../Utils/FLog.hpp"
#include "../Utils/FTic.hpp"
#include "../Utils/FAssert.hpp"
#include "../Utils/FAlignedMemory.hpp"
#include "../Utils/FAssert.hpp"

#include "FOutOfBlockInteraction.hpp"

#ifdef ScalFMM_USE_MPI
#include "../Utils/FMpi.hpp"
#endif

#include <cstddef>
#include <cstring>
#include <vector>
#include <memory>

#include <omp.h>

#include <starpu.h>

#ifdef STARPU_USE_MPI
#include <starpu_mpi.h>
#endif

#include "Cuda/FCudaDeviceWrapper.hpp"

#include "FStarPUUtils.hpp"

template <class KernelClass, class CellClass, class CudaCellGroupClass,
37 38
          class CudaParticleGroupClass, class CudaParticleContainerClass,
          class CudaKernelClass>
39 40
class FStarPUCudaWrapper {
protected:
41
    typedef FStarPUCudaWrapper<KernelClass, CellClass, CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass> ThisClass;
42 43 44 45 46 47 48 49 50

    template <class OtherBlockClass>
    struct BlockInteractions{
        OtherBlockClass* otherBlock;
        int otherBlockId;
        std::vector<OutOfBlockInteraction> interactions;
    };

    const int treeHeight;
51
    CudaKernelClass* kernels[STARPU_MAXCUDADEVS];        //< The kernels
52 53 54

public:
    FStarPUCudaWrapper(const int inTreeHeight): treeHeight(inTreeHeight){
55
        memset(kernels, 0, sizeof(CudaKernelClass*)*STARPU_MAXCUDADEVS);
56 57 58 59
    }

    void initKernel(const int workerId, KernelClass* originalKernel){
        FAssertLF(kernels[workerId] == nullptr);
60
        kernels[workerId] = FCuda__BuildCudaKernel<CudaKernelClass>(originalKernel);
61 62
    }

BRAMAS Berenger's avatar
BRAMAS Berenger committed
63 64 65 66 67
    void releaseKernel(const int workerId){
        FCuda__ReleaseCudaKernel(kernels[workerId]);
        kernels[workerId] = nullptr;
    }

68 69
    ~FStarPUCudaWrapper(){
        for(int idxKernel = 0 ; idxKernel < STARPU_MAXCUDADEVS ; ++idxKernel ){
BRAMAS Berenger's avatar
BRAMAS Berenger committed
70
            FAssertLF(kernels[idxKernel] == nullptr);
71 72 73 74
        }
    }

    static void bottomPassCallback(void *buffers[], void *cl_arg){
75 76 77 78
        //CudaCellGroupClass leafCells((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
        //                    STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]));
        //CudaParticleGroupClass containers((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
        //                    STARPU_VARIABLE_GET_ELEMSIZE(buffers[1]));
79 80 81
        FStarPUPtrInterface* worker = nullptr;
        starpu_codelet_unpack_args(cl_arg, &worker);

82 83
        CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];

84
        FCuda__bottomPassCallback<CellClass,CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
85 86 87
                STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
                (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
                STARPU_VARIABLE_GET_ELEMSIZE(buffers[1]),
BRAMAS Berenger's avatar
BRAMAS Berenger committed
88
                kernel, starpu_cuda_get_local_stream());
89 90 91 92 93 94 95
    }

    /////////////////////////////////////////////////////////////////////////////////////
    /// Upward Pass
    /////////////////////////////////////////////////////////////////////////////////////

    static void upwardPassCallback(void *buffers[], void *cl_arg){
96 97
        //CudaCellGroupClass currentCells((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
        //                                STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]));
98 99 100 101 102 103

        FStarPUPtrInterface* worker = nullptr;
        int nbSubCellGroups = 0;
        int idxLevel = 0;
        starpu_codelet_unpack_args(cl_arg, &worker, &nbSubCellGroups, &idxLevel);

BRAMAS Berenger's avatar
BRAMAS Berenger committed
104 105 106 107
        FCudaParams<unsigned char*,9> subCellGroupsPtr;
        memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsPtr));
        FCudaParams<std::size_t,9> subCellGroupsSize;
        memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsSize));
108
        for(int idxSubGroup = 0; idxSubGroup < nbSubCellGroups ; ++idxSubGroup){
BRAMAS Berenger's avatar
BRAMAS Berenger committed
109 110
            subCellGroupsPtr.values[idxSubGroup] = ((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[idxSubGroup+1]));
            subCellGroupsSize.values[idxSubGroup] = STARPU_VARIABLE_GET_ELEMSIZE(buffers[idxSubGroup+1]);
111 112
        }

113
        CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
114

115
        FCuda__upwardPassCallback<CellClass,CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
116 117
                STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
                subCellGroupsPtr,subCellGroupsSize,
BRAMAS Berenger's avatar
BRAMAS Berenger committed
118
                nbSubCellGroups, idxLevel, kernel, starpu_cuda_get_local_stream());
119 120 121 122 123 124 125
    }

    /////////////////////////////////////////////////////////////////////////////////////
    /// Transfer Pass Mpi
    /////////////////////////////////////////////////////////////////////////////////////
#ifdef STARPU_USE_MPI
    static void transferInoutPassCallbackMpi(void *buffers[], void *cl_arg){
126 127 128 129
        // CudaCellGroupClass currentCells((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
        //                                 STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]));
        // CudaCellGroupClass externalCells((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
        //                                STARPU_VARIABLE_GET_ELEMSIZE(buffers[1]));
130 131 132 133 134 135

        FStarPUPtrInterface* worker = nullptr;
        int idxLevel = 0;
        const std::vector<OutOfBlockInteraction>* outsideInteractions;
        starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions);

136
        CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
137

138
        FCuda__transferInoutPassCallbackMpi<CellClass,CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
139 140 141
                STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
                (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
                STARPU_VARIABLE_GET_ELEMSIZE(buffers[1]),
BRAMAS Berenger's avatar
BRAMAS Berenger committed
142 143
                idxLevel, outsideInteractions->data(), outsideInteractions->size(), kernel,
                starpu_cuda_get_local_stream());
144 145 146 147 148 149 150
    }
#endif
    /////////////////////////////////////////////////////////////////////////////////////
    /// Transfer Pass
    /////////////////////////////////////////////////////////////////////////////////////

    static void transferInPassCallback(void *buffers[], void *cl_arg){
151
        //FAssertLF(STARPU_VARIABLE_GET_PTR(buffers[0]) == STARPU_VARIABLE_GET_PTR(buffers[1]));
152 153
        //CudaCellGroupClass currentCells((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
        //                                STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]));
154 155 156 157 158

        FStarPUPtrInterface* worker = nullptr;
        int idxLevel = 0;
        starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel);

159
        CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
160

161
        FCuda__transferInPassCallback<CellClass,CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
162
                STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
BRAMAS Berenger's avatar
BRAMAS Berenger committed
163
                idxLevel, kernel, starpu_cuda_get_local_stream());
164 165 166
    }

    static void transferInoutPassCallback(void *buffers[], void *cl_arg){
167 168
        //FAssertLF(STARPU_VARIABLE_GET_PTR(buffers[0]) == STARPU_VARIABLE_GET_PTR(buffers[2]));
        //FAssertLF(STARPU_VARIABLE_GET_PTR(buffers[1]) == STARPU_VARIABLE_GET_PTR(buffers[3]));
169

170 171 172 173
        // CudaCellGroupClass currentCells((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
        //                                STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]));
        // CudaCellGroupClass externalCells((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
        //                                STARPU_VARIABLE_GET_ELEMSIZE(buffers[1]));
174 175 176 177 178 179

        FStarPUPtrInterface* worker = nullptr;
        int idxLevel = 0;
        const std::vector<OutOfBlockInteraction>* outsideInteractions;
        starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions);

180
        CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
181

182
        FCuda__transferInoutPassCallback<CellClass,CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
183 184 185
                STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
                (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
                STARPU_VARIABLE_GET_ELEMSIZE(buffers[1]),
BRAMAS Berenger's avatar
BRAMAS Berenger committed
186 187
                idxLevel, outsideInteractions->data(), outsideInteractions->size(), kernel,
                starpu_cuda_get_local_stream());
188 189 190 191 192 193
    }

    /////////////////////////////////////////////////////////////////////////////////////
    /// Downard Pass
    /////////////////////////////////////////////////////////////////////////////////////
    static void downardPassCallback(void *buffers[], void *cl_arg){
194 195
        //CudaCellGroupClass currentCells((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
        //                                STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]));
196 197 198 199 200 201

        FStarPUPtrInterface* worker = nullptr;
        int nbSubCellGroups = 0;
        int idxLevel = 0;
        starpu_codelet_unpack_args(cl_arg, &worker, &nbSubCellGroups, &idxLevel);

BRAMAS Berenger's avatar
BRAMAS Berenger committed
202 203 204 205
        FCudaParams<unsigned char*,9> subCellGroupsPtr;
        memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsPtr));
        FCudaParams<std::size_t,9> subCellGroupsSize;
        memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsSize));
206
        for(int idxSubGroup = 0; idxSubGroup < nbSubCellGroups ; ++idxSubGroup){
BRAMAS Berenger's avatar
BRAMAS Berenger committed
207 208
            subCellGroupsPtr.values[idxSubGroup] = ((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[idxSubGroup+1]));
            subCellGroupsSize.values[idxSubGroup] = (STARPU_VARIABLE_GET_ELEMSIZE(buffers[idxSubGroup+1]));
209 210
        }

211
        CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
212

213
        FCuda__downardPassCallback<CellClass,CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
214 215
                STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
                subCellGroupsPtr,subCellGroupsSize,
BRAMAS Berenger's avatar
BRAMAS Berenger committed
216
                nbSubCellGroups, idxLevel, kernel, starpu_cuda_get_local_stream());
217 218 219 220 221 222 223
    }
    /////////////////////////////////////////////////////////////////////////////////////
    /// Direct Pass MPI
    /////////////////////////////////////////////////////////////////////////////////////

#ifdef STARPU_USE_MPI
    static void directInoutPassCallbackMpi(void *buffers[], void *cl_arg){
224 225 226 227
        //CudaParticleGroupClass containers((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
        //                              STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]));
        //CudaParticleGroupClass externalContainers((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
        //                              STARPU_VARIABLE_GET_ELEMSIZE(buffers[1]));
228 229 230 231 232

        FStarPUPtrInterface* worker = nullptr;
        const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr;
        starpu_codelet_unpack_args(cl_arg, &worker, &outsideInteractions);

233
        CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
234

235
        FCuda__directInoutPassCallbackMpi<CellClass,CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
236 237 238
                STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
                (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
                STARPU_VARIABLE_GET_ELEMSIZE(buffers[1]),
BRAMAS Berenger's avatar
BRAMAS Berenger committed
239 240
                outsideInteractions->data(), outsideInteractions->size(),
                worker->get<ThisClass>(FSTARPU_CPU_IDX)->treeHeight ,kernel, starpu_cuda_get_local_stream());
241 242 243 244 245 246 247
    }
#endif
    /////////////////////////////////////////////////////////////////////////////////////
    /// Direct Pass
    /////////////////////////////////////////////////////////////////////////////////////

    static void directInPassCallback(void *buffers[], void *cl_arg){
248 249
        // CudaParticleGroupClass containers((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
        //                              STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]));
250 251 252

        FStarPUPtrInterface* worker = nullptr;
        starpu_codelet_unpack_args(cl_arg, &worker);
253
        CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
254

255
        FCuda__directInPassCallback<CellClass,CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
256
                STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
BRAMAS Berenger's avatar
BRAMAS Berenger committed
257
                worker->get<ThisClass>(FSTARPU_CPU_IDX)->treeHeight, kernel, starpu_cuda_get_local_stream());
258 259 260
    }

    static void directInoutPassCallback(void *buffers[], void *cl_arg){
261 262 263 264
        // CudaParticleGroupClass containers((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
        //                              STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]));
        // CudaParticleGroupClass externalContainers((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
        //                              STARPU_VARIABLE_GET_ELEMSIZE(buffers[1]));
265 266 267 268 269

        FStarPUPtrInterface* worker = nullptr;
        const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr;
        starpu_codelet_unpack_args(cl_arg, &worker, &outsideInteractions);

270
        CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
271

272
        FCuda__directInoutPassCallback<CellClass,CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
273 274 275
                STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
                (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
                STARPU_VARIABLE_GET_ELEMSIZE(buffers[1]),
BRAMAS Berenger's avatar
BRAMAS Berenger committed
276 277
                outsideInteractions->data(), outsideInteractions->size(), worker->get<ThisClass>(FSTARPU_CPU_IDX)->treeHeight,
                kernel, starpu_cuda_get_local_stream());
278 279
    }

280

281 282 283 284 285
    /////////////////////////////////////////////////////////////////////////////////////
    /// Merge Pass
    /////////////////////////////////////////////////////////////////////////////////////

    static void mergePassCallback(void *buffers[], void *cl_arg){
286 287 288 289
        // CudaCellGroupClass leafCells((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
        //                             STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]));
        // CudaParticleGroupClass containers((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
        //                             STARPU_VARIABLE_GET_ELEMSIZE(buffers[1]));
290 291 292 293

        FStarPUPtrInterface* worker = nullptr;
        starpu_codelet_unpack_args(cl_arg, &worker);

294
        CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
295

296
        FCuda__mergePassCallback<CellClass,CudaCellGroupClass, CudaParticleGroupClass, CudaParticleContainerClass, CudaKernelClass>((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
297 298 299
                STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
                (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
                STARPU_VARIABLE_GET_ELEMSIZE(buffers[1]),
BRAMAS Berenger's avatar
BRAMAS Berenger committed
300
                kernel, starpu_cuda_get_local_stream());
301 302 303 304 305 306
    }
};


#endif // FSTARPUCUDAWRAPPER_HPP