diff --git a/src/proximity_ops.cu b/src/proximity_ops.cu index b7259c4556edaa7a951228f60783e390d0c640de..fbd56174cf6a1be1196e034f34f6e8f3b1b254b9 100644 --- a/src/proximity_ops.cu +++ b/src/proximity_ops.cu @@ -250,22 +250,81 @@ void __global__ kernel_prox_spcol(T* data, int32_t dlen, int32_t nrows, int32_t } } + +template<typename T> +void __global__ kernel_prox_spcol_shared(T* data, int32_t dlen, int32_t nrows, int32_t ncols, int32_t k, int32_t dev_id/*=-1*/, bool verbose/*=false*/, T* all_kg_data) +{ + + extern __shared__ int32_t indices_buf[]; + // can't put kg_data on shared memory too because of alignment issue (I tried and I got cudaMemcpyAsync error: cudaErrorMisalignedAddress at exec time) + // indices is the largest buffer so I chose it rather than kg_data (I tried to do the inverse, it's not better) + int col_id = blockDim.x * blockIdx.x + threadIdx.x; + T *a, *b, tmp; + int itmp; + int32_t* indices; + T* kg_data, *_data; + thrust::counting_iterator<int> ite_indices(0); // virtual list of indices from 0 to nrows + + T zero; + memset(&zero, 0, sizeof(zero)); // bzero doesn't work on MS VS + greater_abs<T> ga; + int32_t col_offset = nrows * col_id; + if(col_id < ncols) + { + indices = indices_buf + threadIdx.x * nrows; + kg_data = all_kg_data + k * col_id; + _data = data + col_offset; + // create the list of column indices + thrust::copy(thrust::device, ite_indices, ite_indices+nrows, indices); + // pack descendingly the k greatest values to the left of _data + for(int i=0;i < nrows - 1 && i < k; i++) + for(int j=i+1;j < nrows; j++) + { + a = _data+i; + b = _data+j; + if(ga(*b, *a)) + { + tmp = *a; + *a = *b; + *b = tmp; + itmp = indices[i]; + indices[i] = indices[j]; + indices[j] = itmp; + } + } + // copy the k greatest values in a tmp buf + for(int i=0; i < k; i++) + kg_data[i] = _data[i]; + // zero the _data column + for(int i=0;i < nrows; i++) + _data[i] = zero; + // copy back k greatest values to their original positions + for(int i=0; i < k; i++) + 
_data[indices[i]] = kg_data[i]; + } +} + template<typename T> void prox_spcol(T* data, int32_t dlen, int32_t ncols, int32_t k, int32_t dev_id/*=-1*/, bool verbose/*=false*/) { - int* indices; + const int32_t max_shared_mem = 48*1024; +// int* indices; T* kg_data; - T* cpu_data; - int threadsPerBlock = 256; int32_t nrows = dlen/ncols; - int blocksPerGrid = (ncols + threadsPerBlock - 1) / threadsPerBlock; - assert(cudaMalloc(&indices, sizeof(int)*nrows*ncols) == CUDA_SUCCESS); + int threadsPerBlock = 256; // default value + int blocksPerGrid = (ncols + threadsPerBlock - 1) / threadsPerBlock; // default value assert(cudaMalloc(&kg_data, sizeof(T)*k*ncols) == CUDA_SUCCESS); - kernel_prox_spcol<T><<<blocksPerGrid,threadsPerBlock>>>(data, dlen, nrows, ncols, k, dev_id, verbose, kg_data, indices); - cudaMallocHost(&cpu_data, sizeof(T)*nrows*ncols); - cudaMemcpy(cpu_data, kg_data, sizeof(T)*k*ncols, cudaMemcpyDeviceToHost); - cudaFree(indices); +// assert(cudaMalloc(&indices, sizeof(int)*nrows*ncols) == CUDA_SUCCESS); // the two kernels below need this +// kernel_prox_spcol<T><<<blocksPerGrid,threadsPerBlock>>>(data, dlen, nrows, ncols, k, dev_id, verbose, kg_data, indices); // slower +// kernel_prox_spcol2<T><<<blocksPerGrid,threadsPerBlock>>>(data, dlen, nrows, ncols, k, dev_id, verbose, kg_data, indices); // even slower + /******** kernel using shared memory **********/ + threadsPerBlock = max_shared_mem / nrows / sizeof(int32_t); + threadsPerBlock = threadsPerBlock>512?512:threadsPerBlock; // 512 is the max number of threads per block (even if I saw 1024 here https://forums.developer.nvidia.com/t/maximum-number-of-threads-on-thread-block/46392) + blocksPerGrid = (ncols + threadsPerBlock - 1) / threadsPerBlock; + kernel_prox_spcol_shared<T><<<blocksPerGrid,threadsPerBlock, nrows*threadsPerBlock*sizeof(int32_t)>>>(data, dlen, nrows, ncols, k, dev_id, verbose, kg_data); + faust_kernelSafe(); + /******************/ +// cudaFree(indices); cudaFree(kg_data); -// 
faust_kernelSafe(); } #include "proximity_ops.hu" diff --git a/test/test_gm.cpp.in b/test/test_gm.cpp.in index 534b6250541b1398f340c33c3b0fc7206febee17..457f3b7a53b99946f46fadd92632a14d6bb425ad 100644 --- a/test/test_gm.cpp.in +++ b/test/test_gm.cpp.in @@ -113,8 +113,8 @@ void test_gpu_dsm_prox_sp(gm_DenseMatFunc_@GM_SCALAR@ & dsm_funcs) void test_gpu_dsm_prox_spcol(gm_DenseMatFunc_@GM_SCALAR@ & dsm_funcs) { cout << string(5, '-') << " prox_spcol"<< endl; - auto nrows = 60; - auto ncols = 70; + auto nrows = 6; + auto ncols = 7; auto k = 3; @GM_SCALAR_REAL@ err; @GM_SCALAR@ *a, *b, tmp;