diff --git a/src/proximity_ops.cu b/src/proximity_ops.cu
index b7259c4556edaa7a951228f60783e390d0c640de..fbd56174cf6a1be1196e034f34f6e8f3b1b254b9 100644
--- a/src/proximity_ops.cu
+++ b/src/proximity_ops.cu
@@ -250,22 +250,81 @@ void __global__ kernel_prox_spcol(T* data, int32_t dlen, int32_t nrows, int32_t
 	}
 }
 
+
+template<typename T>
+void __global__ kernel_prox_spcol_shared(T* data, int32_t dlen, int32_t nrows, int32_t ncols, int32_t k, int32_t dev_id/*=-1*/, bool verbose/*=false*/, T* all_kg_data)
+{
+
+	extern __shared__ int32_t indices_buf[];
+	// kg_data can't go in shared memory too because of an alignment issue (trying it caused a cudaErrorMisalignedAddress from cudaMemcpyAsync at execution time)
+	// indices is the largest buffer, so it goes in shared memory rather than kg_data (the opposite choice was tried and is not faster)
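+	// A minimal sketch of the dynamic shared memory pattern used here (identifiers
+	// below are illustrative, not from this file): the third <<<...>>> launch
+	// parameter sets the byte size of the extern __shared__ buffer, and each
+	// thread takes a private slice of it:
+	//   extern __shared__ int32_t buf[];                   // sized at launch time
+	//   int32_t* slice = buf + threadIdx.x * slice_len;    // this thread's view
+	//   // host side: kernel<<<grid, block, block * slice_len * sizeof(int32_t)>>>(...);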
+	int col_id = blockDim.x * blockIdx.x + threadIdx.x;
+	T *a, *b, tmp;
+	int itmp;
+	int32_t* indices;
+	T* kg_data, *_data;
+	thrust::counting_iterator<int> ite_indices(0); // virtual list of indices from 0 to nrows-1
+
+	T zero;
+	memset(&zero, 0, sizeof(zero)); // bzero doesn't work on MS VS
+	greater_abs<T> ga;
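+	// (greater_abs<T> comes from elsewhere in this project; a sketch consistent with
+	// its use below, assuming it compares magnitudes of real-valued T:
+	//   struct greater_abs { __device__ bool operator()(const T& a, const T& b)
+	//   { return fabs(a) > fabs(b); } };)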
+	int32_t col_offset = nrows * col_id;
+	if(col_id < ncols)
+	{
+		indices = indices_buf + threadIdx.x * nrows;
+		kg_data = all_kg_data + k * col_id;
+		_data = data + col_offset;
+		// create the list of row indices for this column
+		thrust::copy(thrust::device, ite_indices, ite_indices+nrows, indices);
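+		// (roughly equivalent to thrust::sequence(thrust::device, indices, indices + nrows))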
+		// partial selection sort: pack the k greatest values (by absolute value) in descending order at the start of _data
+		for(int i=0;i < nrows - 1 && i < k; i++)
+			for(int j=i+1;j < nrows; j++)
+			{
+				a = _data+i;
+				b = _data+j;
+				if(ga(*b, *a))
+				{
+					tmp = *a;
+					*a = *b;
+					*b = tmp;
+					itmp = indices[i];
+					indices[i] = indices[j];
+					indices[j] = itmp;
+				}
+			}
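+		// note: the partial selection sort above costs O(k*nrows) comparisons per column,
+		// cheaper than a full O(nrows*log(nrows)) sort when k << nrows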
+		// copy the k greatest values into a temporary buffer
+		for(int i=0; i < k; i++)
+			kg_data[i] = _data[i];
+		// zero the _data column
+		for(int i=0;i < nrows; i++)
+			_data[i] = zero;
+		// copy the k greatest values back to their original row positions
+		for(int i=0; i < k; i++)
+			_data[indices[i]] = kg_data[i];
+	}
+}
+
 template<typename T> void prox_spcol(T* data, int32_t dlen, int32_t ncols, int32_t k, int32_t dev_id/*=-1*/, bool verbose/*=false*/)
 {
-	int* indices;
+	const int32_t max_shared_mem = 48*1024; // 48 KiB: the default per-block shared memory limit
+//	int* indices;
 	T* kg_data;
-	T* cpu_data;
-	int threadsPerBlock = 256;
 	int32_t nrows = dlen/ncols;
-	int blocksPerGrid = (ncols + threadsPerBlock - 1) / threadsPerBlock;
-	assert(cudaMalloc(&indices, sizeof(int)*nrows*ncols) == CUDA_SUCCESS);
+	int threadsPerBlock = 256; // default value, recomputed below for the shared-memory kernel
+	int blocksPerGrid = (ncols + threadsPerBlock - 1) / threadsPerBlock; // default value, likewise recomputed below
 	assert(cudaMalloc(&kg_data, sizeof(T)*k*ncols) == CUDA_SUCCESS);
-	kernel_prox_spcol<T><<<blocksPerGrid,threadsPerBlock>>>(data, dlen, nrows, ncols, k, dev_id, verbose, kg_data, indices);
-	cudaMallocHost(&cpu_data, sizeof(T)*nrows*ncols);
-	cudaMemcpy(cpu_data, kg_data, sizeof(T)*k*ncols, cudaMemcpyDeviceToHost);
-	cudaFree(indices);
+//	assert(cudaMalloc(&indices, sizeof(int)*nrows*ncols) == CUDA_SUCCESS); // the two kernels below need this
+//	kernel_prox_spcol<T><<<blocksPerGrid,threadsPerBlock>>>(data, dlen, nrows, ncols, k, dev_id, verbose, kg_data, indices); // slower
+//	kernel_prox_spcol2<T><<<blocksPerGrid,threadsPerBlock>>>(data, dlen, nrows, ncols, k, dev_id, verbose, kg_data, indices); // even slower
+	/******** kernel using shared memory **********/
+	threadsPerBlock = max_shared_mem / nrows / sizeof(int32_t);
+	assert(threadsPerBlock > 0); // guard: nrows must leave room for at least one per-thread index buffer in shared memory
+	threadsPerBlock = threadsPerBlock>512?512:threadsPerBlock; // cap at 512 threads per block (the hardware limit is actually 1024 on current GPUs, cf. https://forums.developer.nvidia.com/t/maximum-number-of-threads-on-thread-block/46392, but 512 is kept as a conservative bound)
+	blocksPerGrid = (ncols + threadsPerBlock - 1) / threadsPerBlock;
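+	// e.g. with nrows = 60: threadsPerBlock = 48*1024/60/4 = 204 (below the 512 cap),
+	// so each block requests 60*204*4 = 48960 bytes of shared memory, within the 48 KiB limit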
+	kernel_prox_spcol_shared<T><<<blocksPerGrid,threadsPerBlock, nrows*threadsPerBlock*sizeof(int32_t)>>>(data, dlen, nrows, ncols, k, dev_id, verbose, kg_data);
+	faust_kernelSafe();
+	/******************/
+//	cudaFree(indices);
 	cudaFree(kg_data);
-//	faust_kernelSafe();
 }
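+
+// A hedged usage sketch (d_data and the surrounding names are illustrative, not from
+// this project): keep the k largest-magnitude entries of each column of a device-side
+// column-major nrows x ncols buffer and zero the rest:
+//   prox_spcol<float>(d_data, /* dlen */ nrows*ncols, ncols, k);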
 
 #include "proximity_ops.hu"
diff --git a/test/test_gm.cpp.in b/test/test_gm.cpp.in
index 534b6250541b1398f340c33c3b0fc7206febee17..457f3b7a53b99946f46fadd92632a14d6bb425ad 100644
--- a/test/test_gm.cpp.in
+++ b/test/test_gm.cpp.in
@@ -113,8 +113,8 @@ void test_gpu_dsm_prox_sp(gm_DenseMatFunc_@GM_SCALAR@ & dsm_funcs)
 void test_gpu_dsm_prox_spcol(gm_DenseMatFunc_@GM_SCALAR@ & dsm_funcs)
 {
 	cout << string(5, '-') << " prox_spcol"<< endl;
-	auto nrows = 60;
-	auto ncols = 70;
+	auto nrows = 6;
+	auto ncols = 7;
 	auto k = 3;
 	@GM_SCALAR_REAL@ err;
 	@GM_SCALAR@ *a, *b, tmp;