From 8ca251cc858c3bb6a746a8cf6965e5657d92f39f Mon Sep 17 00:00:00 2001 From: hhadjdji <hhadjdji hakim.hadj-djilani@inria.fr> Date: Mon, 28 Sep 2020 08:52:58 +0200 Subject: [PATCH] Add new operations to cuMatDs and link them into the C interface. - Matrix element-wise multiplication and division (elt_wise_mul/div). - Matrix initialization of all coefficients to a constant (set_val). - Matrix sum of all coefficients (sum). - Matrix mean relative error (mean_relerr) against a reference matrix. - New corresponding kernel specializations for cuda complex types. --- src/cuMatDs.cpp.in | 51 ++++++++++++++++++ src/cuMatDs.h | 6 ++- src/cuMatDs.hpp | 11 +++- src/gm.cpp.in | 25 +++++++++ src/gm_interf_gen.h.in | 18 +++++-- src/gm_interf_gen.hpp.in | 5 ++ src/kernel_def.hu | 21 ++++++++ src/kernels.cu | 108 ++++++++++++++++++++++++++++++++++++++- src/kernels.h | 5 ++ 9 files changed, 242 insertions(+), 8 deletions(-) diff --git a/src/cuMatDs.cpp.in b/src/cuMatDs.cpp.in index 79f1d6d..69d0d70 100644 --- a/src/cuMatDs.cpp.in +++ b/src/cuMatDs.cpp.in @@ -26,3 +26,54 @@ void cuMatDs<@GM_SCALAR@>::abs() #endif switch_back(); } + +template<> + void cuMatDs<@GM_SCALAR@>::elt_wise_div(const cuMatDs<@GM_SCALAR@>& M) +{ + if(nrows != M.nrows || ncols != M.ncols) + throw std::runtime_error("Dimensions must agree."); +#if(IS_COMPLEX==1) + kernel_div_cplx(data, M.data, numel()); +#else + kernel_div(data, M.data, numel()); +#endif +} + +template<> + void cuMatDs<@GM_SCALAR@>::elt_wise_mul(const cuMatDs<@GM_SCALAR@>& M) +{ + if(nrows != M.nrows || ncols != M.ncols) + throw std::runtime_error("Dimensions must agree."); +#if(IS_COMPLEX==1) + kernel_mult_cplx(data, M.data, numel()); +#else + kernel_mult(data, M.data, numel()); +#endif +} + +template<> + void cuMatDs<@GM_SCALAR@>::set_val(const @GM_SCALAR@& val) +{ +#if(IS_COMPLEX==1) + kernel_memset_cplx(data, val, numel()); +#else + kernel_memset(data, val, numel()); +#endif +} + +template<> +@GM_SCALAR@ cuMatDs<@GM_SCALAR@>::mean_relerr(const cuMatDs<@GM_SCALAR@>& ref) +{ + if(ref.nrows != nrows || ref.ncols != ncols) + throw std::runtime_error("Dimensions must agree."); + cuMatDs<@GM_SCALAR@>* rel_err = cuMatDs<@GM_SCALAR@>::create(nrows, ncols); +#if(IS_COMPLEX==1) + kernel_relative_error_cplx(rel_err->data, ref.data, data, numel()); +#else + kernel_relative_error(rel_err->data, ref.data, data, numel()); +#endif + auto s = rel_err->sum(); + auto err = s/Real<@GM_SCALAR@>(numel()); + delete rel_err; + return err; +} diff --git a/src/cuMatDs.h b/src/cuMatDs.h index 093ac9a..ebde195 100644 --- a/src/cuMatDs.h +++ b/src/cuMatDs.h @@ -46,9 +46,10 @@ struct cuMatDs : cuMat<T> cuMatDs<T>* mul(cuMatDs<T>&, cuMatDs<T>* output = nullptr, gm_Op op_this = OP_NOTRANSP, gm_Op op_other = OP_NOTRANSP); void mul(cuMatDs<T>&, hMatDs<T>& output, gm_Op op_this = OP_NOTRANSP, gm_Op op_other = OP_NOTRANSP); void mul(cuMatDs<T>&, T* output, gm_Op op_this = OP_NOTRANSP, gm_Op op_other = OP_NOTRANSP); - cuMatDs<T>* mul(cuMatSp<T>&, cuMatDs<T>* output = nullptr, gm_Op op_this = OP_NOTRANSP, gm_Op op_other = OP_NOTRANSP); void mul(const T& scalar); + void elt_wise_mul(const cuMatDs<T>&); + void elt_wise_div(const cuMatDs<T>&); void apply_op(const gm_Op op); static cuMatDs<T>* apply_op(const cuMatDs<T>* in, gm_Op op, cuMatDs<T>* out = nullptr); void transpose(); @@ -78,12 +79,15 @@ struct cuMatDs : cuMat<T> void setOnes(); void set_zeros(); void set_eyes(); + void set_val(const T& val); cuMatDs<T>* clone(int32_t dev=-1); void copy(cuMatDs<T>*) const; void resize(const int32_t nrows, const int32_t ncols); T trace(); T coeff_max(); T coeff_min(); + T sum(); + T mean_relerr(const cuMatDs<T>&); void abs(); void destroy(); void set_stream(const void* stream); diff --git a/src/cuMatDs.hpp b/src/cuMatDs.hpp index 169d96b..43895f0 100644 --- a/src/cuMatDs.hpp +++ b/src/cuMatDs.hpp @@ -305,7 +305,6 @@ cuMatDs<T>* cuMatDs<T>::mul(cuMatSp<T>& other, cuMatDs<T>* output /* = nullptr*/ k = other.ncols; set_one(&alpha); bzero(&beta, sizeof(T)); - //TODO: refactor to avoid calling cusparseTcsrmm2 for each different cases (set parameters) if(op_this == op_other && op_this == OP_NOTRANSP) { n = this->nrows; @@ -703,6 +702,16 @@ T cuMatDs<T>::coeff_min() return min; } +template<typename T> +T cuMatDs<T>::sum() +{ + auto switch_back = switch_dev(this->dev_id); + auto len = numel(); + auto s = faust_cu_sum(data, len); + switch_back(); + return s; +} + template<typename T> T cuMatDs<T>::trace() { diff --git a/src/gm.cpp.in b/src/gm.cpp.in index 5fc628e..1ec9879 100644 --- a/src/gm.cpp.in +++ b/src/gm.cpp.in @@ -4,6 +4,26 @@ extern "C" { //TODO: separate in multiple files (one at least per type gm_DenseMat_t, gm_SparseMat_t, gm_MatArray_t) + __DYNLIB_ATTR__ void gm_DenseMat_elt_wise_div_@GM_SCALAR@(gm_DenseMat_t a, gm_DenseMat_t b) + { + static_cast<cuMatDs<@GM_SCALAR@>*>(a)->elt_wise_div(*static_cast<cuMatDs<@GM_SCALAR@>*>(b)); + } + + __DYNLIB_ATTR__ void gm_DenseMat_setval_@GM_SCALAR@(gm_DenseMat_t a, @GM_SCALAR@* v) + { + static_cast<cuMatDs<@GM_SCALAR@>*>(a)->set_val(*v); + } + + __DYNLIB_ATTR__ @GM_SCALAR@ gm_DenseMat_mean_relerr_@GM_SCALAR@(gm_DenseMat_t a, gm_DenseMat_t b, @GM_SCALAR@* mre) + { + *mre = static_cast<cuMatDs<@GM_SCALAR@>*>(a)->mean_relerr(*static_cast<cuMatDs<@GM_SCALAR@>*>(b)); + } + + __DYNLIB_ATTR__ void gm_DenseMat_elt_wise_mul_@GM_SCALAR@(gm_DenseMat_t a, gm_DenseMat_t b) + { + static_cast<cuMatDs<@GM_SCALAR@>*>(a)->elt_wise_mul(*static_cast<cuMatDs<@GM_SCALAR@>*>(b)); + } + __DYNLIB_ATTR__ void gm_DenseMat_setzeros_@GM_SCALAR@(gm_DenseMat_t src_mat) { static_cast<cuMatDs<@GM_SCALAR@>*>(src_mat)->set_zeros(); @@ -220,6 +240,11 @@ extern "C" return static_cast<cuMatDs<@GM_SCALAR@>*>(mat)->norm_l1(); } + __DYNLIB_ATTR__ void gm_DenseMat_sum_@GM_SCALAR@(gm_DenseMat_t mat, @GM_SCALAR@* sum) + { + *sum = static_cast<cuMatDs<@GM_SCALAR@>*>(mat)->sum(); + } + __DYNLIB_ATTR__ @GM_SCALAR@ gm_DenseMat_power_iteration_@GM_SCALAR@(gm_DenseMat_t mat, float threshold, int32_t max_iter) { return static_cast<cuMatDs<@GM_SCALAR@>*>(mat)->power_iteration(threshold, max_iter); diff --git a/src/gm_interf_gen.h.in b/src/gm_interf_gen.h.in index 7c500b3..dce59c0 100644 --- a/src/gm_interf_gen.h.in +++ b/src/gm_interf_gen.h.in @@ -90,13 +90,13 @@ typedef @GM_SCALAR_REAL@ (*gm_DenseMat_norm_l1_@GM_SCALAR@_ptr)(gm_DenseMat_t); typedef @GM_SCALAR_REAL@ (*gm_DenseMat_norm_frob_@GM_SCALAR@_ptr)(gm_DenseMat_t); /** Computes the spectral norm */ typedef @GM_SCALAR_REAL@ (*gm_DenseMat_norm_spectral_@GM_SCALAR@_ptr)(gm_DenseMat_t, float threshold, int32_t max_iter); -/** Multiplies the first argument GPU matrix to the second argument GPU matrix (both are dense).*/ +/** Multiplies the first argument GPU matrix to the second argument GPU matrix (both are denses).*/ typedef gm_DenseMat_t (*gm_DenseMat_mul_gpu_dsm_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_DenseMat_t); -/** Multiplies the first argument GPU matrix to the second argument GPU matrix (both are dense) -- extended version (ops and output matrix).*/ +/** Multiplies the first argument GPU matrix to the second argument GPU matrix (both are denses) -- extended version (ops and output matrix).*/ typedef gm_DenseMat_t (*gm_DenseMat_mul_gpu_dsm_ext_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_DenseMat_t, gm_DenseMat_t output, gm_Op, gm_Op); -/** Multiplies the first argument GPU matrix to the second argument GPU matrix (both are dense) and outputs the result in a CPU buffer.*/ +/** Multiplies the first argument GPU matrix to the second argument GPU matrix (both are denses) and outputs the result in a CPU buffer.*/ typedef gm_DenseMat_t (*gm_DenseMat_mul_gpu_dsm_tocpu_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_DenseMat_t, @GM_SCALAR@* out); -/** Multiplies the first argument GPU matrix to the second argument GPU matrix (both are dense) and outputs the result in a CPU buffer -- extended version (gm_Op applied to matrices).*/ +/** Multiplies the first argument GPU matrix to the second argument GPU matrix (both are denses) and outputs the result in a CPU buffer -- extended version (gm_Op applied to matrices).*/ typedef gm_DenseMat_t (*gm_DenseMat_mul_gpu_dsm_tocpu_ext_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_DenseMat_t, @GM_SCALAR@* out, gm_Op, gm_Op); /** Multiplies the first argument GPU dense matrix to the second argument GPU sparse matrix. */ typedef gm_DenseMat_t (*gm_DenseMat_mul_gpu_spm_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_SparseMat_t); @@ -126,6 +126,11 @@ typedef void (*gm_DenseMat_mv_to_gpu_@GM_SCALAR@_ptr)(gm_DenseMat_t, int32_t dev typedef void (*gm_DenseMat_gemm_@GM_SCALAR@_ptr)(const gm_DenseMat_t, const gm_DenseMat_t, gm_DenseMat_t, const @GM_SCALAR@ *alpha, const @GM_SCALAR@ *beta, const gm_Op, const gm_Op); typedef int32_t (*gm_DenseMat_get_dev_@GM_SCALAR@_ptr)(gm_DenseMat_t); typedef size_t (*gm_DenseMat_get_nnz_@GM_SCALAR@_ptr)(gm_DenseMat_t); +typedef void (*gm_DenseMat_sum_@GM_SCALAR@_ptr)(gm_DenseMat_t, @GM_SCALAR@* sum); +typedef void (*gm_DenseMat_elt_wise_mul_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_DenseMat_t); +typedef void (*gm_DenseMat_elt_wise_div_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_DenseMat_t); +typedef void (*gm_DenseMat_setval_@GM_SCALAR@_ptr)(gm_DenseMat_t, @GM_SCALAR@* v); +typedef @GM_SCALAR@ (*gm_DenseMat_mean_relerr_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_DenseMat_t, @GM_SCALAR@* mre); struct gm_DenseMatFunc_@GM_SCALAR@ { @@ -168,6 +173,7 @@ struct gm_DenseMatFunc_@GM_SCALAR@ gm_DenseMat_setones_@GM_SCALAR@_ptr setones; gm_DenseMat_setzeros_@GM_SCALAR@_ptr setzeros; gm_DenseMat_seteyes_@GM_SCALAR@_ptr seteyes; + gm_DenseMat_setval_@GM_SCALAR@_ptr setval; gm_DenseMat_transpose_@GM_SCALAR@_ptr transpose; gm_DenseMat_conjugate_@GM_SCALAR@_ptr conjugate; gm_DenseMat_abs_@GM_SCALAR@_ptr abs; @@ -176,6 +182,10 @@ struct gm_DenseMatFunc_@GM_SCALAR@ gm_DenseMat_gemm_@GM_SCALAR@_ptr gemm; gm_DenseMat_get_dev_@GM_SCALAR@_ptr get_dev; gm_DenseMat_get_nnz_@GM_SCALAR@_ptr get_nnz; + gm_DenseMat_sum_@GM_SCALAR@_ptr sum; + gm_DenseMat_elt_wise_mul_@GM_SCALAR@_ptr elt_wise_mul; + gm_DenseMat_elt_wise_div_@GM_SCALAR@_ptr elt_wise_div; + gm_DenseMat_mean_relerr_@GM_SCALAR@_ptr mean_relerr; }; /** MatArray function pointers ********************************************************************/ diff --git a/src/gm_interf_gen.hpp.in b/src/gm_interf_gen.hpp.in index 1c76aff..5895680 100644 --- a/src/gm_interf_gen.hpp.in +++ b/src/gm_interf_gen.hpp.in @@ -78,6 +78,11 @@ void load_dsm_funcs_@GM_SCALAR@(void* gm_handle, gm_DenseMatFunc_@GM_SCALAR@ *ds dsm_funcs->gemm = (gm_DenseMat_gemm_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle,"gm_DenseMat_gemm_@GM_SCALAR@"); dsm_funcs->get_dev = (gm_DenseMat_get_dev_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle, "gm_DenseMat_get_dev_@GM_SCALAR@"); dsm_funcs->get_nnz = (gm_DenseMat_get_nnz_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle, "gm_DenseMat_get_nnz_@GM_SCALAR@"); + dsm_funcs->sum = (gm_DenseMat_sum_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle, "gm_DenseMat_sum_@GM_SCALAR@"); + dsm_funcs->elt_wise_mul = (gm_DenseMat_elt_wise_mul_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle, "gm_DenseMat_elt_wise_mul_@GM_SCALAR@"); + dsm_funcs->elt_wise_div = (gm_DenseMat_elt_wise_div_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle, "gm_DenseMat_elt_wise_div_@GM_SCALAR@"); + dsm_funcs->setval = (gm_DenseMat_setval_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle, "gm_DenseMat_setval_@GM_SCALAR@"); + dsm_funcs->mean_relerr = (gm_DenseMat_mean_relerr_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle, "gm_DenseMat_mean_relerr_@GM_SCALAR@"); } void load_marr_funcs_@GM_SCALAR@(void* gm_handle, gm_MatArrayFunc_@GM_SCALAR@ *marr_funcs) diff --git a/src/kernel_def.hu b/src/kernel_def.hu index 8521d17..ab4e90c 100644 --- a/src/kernel_def.hu +++ b/src/kernel_def.hu @@ -43,7 +43,9 @@ template void kernel_add<float>(float* d_cu1, const float* d_cu2, int length); template void kernel_sub<float>(float* d_cu1, const float* d_cu2, int length); template void kernel_mult<float>(float* d_cu1, const float* d_cu2, int length); +template void kernel_mult_cplx<float2>(float2* d_cu1, const float2* d_cu2, int length); template void kernel_div<float>(float* d_cu1, const float* d_cu2, int length); +template void kernel_div_cplx<float2>(float2* d_cu1, const float2* d_cu2, int length); template void kernel_add_const<float>(float* d_cu1, float valeur, int length); template void kernel_add_const_cplx<float2>(float2* d_cu1, float2 valeur, int length); template void kernel_sub_const<float>(float* d_cu1, float valeur, int length); @@ -58,6 +60,7 @@ template void kernel_abs<float>(float* d_cu1, const int length); template void kernel_abs_cplx<float2>(float2* d_cu1, const int length); template void kernel_memcpy<float,float>(float* d_cu_dst, const float* d_cu_src, int length); template void kernel_memset<float>(float* d_cu_dst, float valeur, int length); +template void kernel_memset_cplx<float2>(float2* d_cu_dst, float2 valeur, int length); template void kernel_sparse2full<float>(float* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const float* dev_src_values, const int nnz, const int src_dim1, const int src_dim2); template void kernel_add_sparse2full<float>(float* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const float* dev_src_values, const int nnz, const int src_dim1); template void kernel_sub_sparse2full<float>(float* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const float* dev_src_values, const int nnz, const int src_dim1); @@ -67,11 +70,14 @@ template void kernel_add_diag_const<float>(float* d_cu1, float val, int dim1); template void kernel_copy_diag<float>(float* d_cu_dst, float* d_cu_src, int dim1); template void kernel_set_submatrix<float>(float* mat_dst, float* mat_src, int src_dim1, int r1, int c1, int nb_rows, int nb_col); template void kernel_relative_error<float>(float* data_dst, const float* data_src_th, const float* data_src_mes, const int length); +template void kernel_relative_error_cplx<float2>(float2* data_dst, const float2* data_src_th, const float2* data_src_mes, const int length); template void kernel_add<double>(double* d_cu1, const double* d_cu2, int length); template void kernel_sub<double>(double* d_cu1, const double* d_cu2, int length); template void kernel_mult<double>(double* d_cu1, const double* d_cu2, int length); +template void kernel_mult_cplx<double2>(double2* d_cu1, const double2* d_cu2, int length); template void kernel_div<double>(double* d_cu1, const double* d_cu2, int length); +template void kernel_div_cplx<double2>(double2* d_cu1, const double2* d_cu2, int length); template void kernel_add_const<double>(double* d_cu1, double valeur, int length); template void kernel_add_const_cplx<double2>(double2* d_cu1, double2 valeur, int length); template void kernel_sub_const<double>(double* d_cu1, double valeur, int length); @@ -86,6 +92,7 @@ template void kernel_abs<double>(double* d_cu1, const int length); template void kernel_abs_cplx<double2>(double2* d_cu1, const int length); template void kernel_memcpy<double,double>(double* d_cu_dst, const double* d_cu_src, int length); template void kernel_memset<double>(double* d_cu_dst, double valeur, int length); +template void kernel_memset_cplx<double2>(double2* d_cu_dst, double2 valeur, int length); template void kernel_sparse2full<double>(double* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const double* dev_src_values, const int nnz, const int src_dim1, const int src_dim2); template void kernel_add_sparse2full<double>(double* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const double* dev_src_values, const int nnz, const int src_dim1); template void kernel_sub_sparse2full<double>(double* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const double* dev_src_values, const int nnz, const int src_dim1); @@ -95,6 +102,7 @@ template void kernel_add_diag_const<double>(double* d_cu1, double val, int dim1) template void kernel_copy_diag<double>(double* d_cu_dst, double* d_cu_src, int dim1); template void kernel_set_submatrix<double>(double* mat_dst, double* mat_src, int src_dim1, int r1, int c1, int nb_rows, int nb_col); template void kernel_relative_error<double>(double* data_dst, const double* data_src_th, const double* data_src_mes, const int length); +template void kernel_relative_error_cplx<double2>(double2* data_dst, const double2* data_src_th, const double2* data_src_mes, const int length); template void kernel_add<int>(int* d_cu1, const int* d_cu2, int length); template void kernel_sub<int>(int* d_cu1, const int* d_cu2, int length); @@ -115,6 +123,8 @@ template void kernel_set_submatrix<int>(int* mat_dst, int* mat_src, int src_dim1 template void kernel_memcpy<double,float>(double* d_cu_dst, const float* d_cu_src, int length); template void kernel_memcpy<float,double>(float* d_cu_dst, const double* d_cu_src, int length); +template void kernel_add_cplx<float2>(float2* d_cu1, const float2* d_cu2, int length); +template void kernel_add_cplx<double2>(double2* d_cu1, const double2* d_cu2, int length); @@ -133,11 +143,13 @@ template __global__ void Inv_inria<float>(float* A, int numElements); template __global__ void Abs_inria<float>(float* A, int numElements); template __global__ void Memcpy_inria<float,float>(float* d_cu_dst, const float* d_cu_src, int length); template __global__ void Memset_inria<float>(float* dev_dst, float valeur, int numElements); +template __global__ void Memset_inria_cplx<float2>(float2* dev_dst, float2 valeur, int numElements); template __global__ void Sparse2full_inria<float>(float* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const float* dev_src_values, const int nnz, const int src_dim1); template __global__ void AddDiagConst_inria<float>(float* dev_dst, float val, int dim1); template __global__ void GetDiag_inria<float>(float* dst_diag, const float* src_M, int dim1, int dlen); template __global__ void SetSubmatrix_inria<float>(float* mat_dst, float* mat_src, int src_dim1, int r1, int c1, int nb_rows, int nb_elements); template __global__ void RelativeError_inria<float>(float* data_dst, const float* data_src_th, const float* data_src_mes, const int length); +template __global__ void RelativeError_inria_cplx<float2>(float2* data_dst, const float2* data_src_th, const float2* data_src_mes, const int length); template __global__ void Add_inria<double>(double* A, const double *B, int numElements); template __global__ void Sub_inria<double>(double* A, const double *B, int numElements); @@ -152,11 +164,13 @@ template __global__ void Inv_inria<double>(double* A, int numElements); template __global__ void Abs_inria<double>(double* A, int numElements); template __global__ void Memcpy_inria<double,double>(double* d_cu_dst, const double* d_cu_src, int length); template __global__ void Memset_inria<double>(double* dev_dst, double valeur, int numElements); +template __global__ void Memset_inria_cplx<double2>(double2* dev_dst, double2 valeur, int numElements); template __global__ void Sparse2full_inria<double>(double* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const double* dev_src_values, const int nnz, const int src_dim1); template __global__ void AddDiagConst_inria<double>(double* dev_dst, double val, int dim1); template __global__ void GetDiag_inria<double>(double* dst_diag, const double* src_M, int dim1, int dlen); template __global__ void SetSubmatrix_inria<double>(double* mat_dst, double* mat_src, int src_dim1, int r1, int c1, int nb_rows, int nb_elements); template __global__ void RelativeError_inria<double>(double* data_dst, const double* data_src_th, const double* data_src_mes, const int length); +template __global__ void RelativeError_inria_cplx<double2>(double2* data_dst, const double2* data_src_th, const double2* data_src_mes, const int length); template __global__ void Add_inria<int>(int* A, const int *B, int numElements); template __global__ void Sub_inria<int>(int* A, const int *B, int numElements); @@ -184,3 +198,10 @@ template __global__ void finish_min_max_reduce<double2>(double2 *g_idata, double template __global__ void Abs_inria_cplx<double2>(double2* A, int numElements); template __global__ void Abs_inria_cplx<float2>(float2* A, int numElements); + +template __global__ void Add_inria_cplx<float2>(float2* A, const float2 *B, int numElements); +template __global__ void Add_inria_cplx<double2>(double2* A, const double2 *B, int numElements); +template __global__ void Mult_inria_cplx<float2>(float2* A, const float2 *B, int numElements); +template __global__ void Mult_inria_cplx<double2>(double2* A, const double2 *B, int numElements); +template __global__ void Div_inria_cplx<float2>(float2* A, const float2 *B, int numElements); +template __global__ void Div_inria_cplx<double2>(double2* A, const double2 *B, int numElements); diff --git a/src/kernels.cu b/src/kernels.cu index 2d66254..56de63e 100644 --- a/src/kernels.cu +++ b/src/kernels.cu @@ -43,9 +43,12 @@ //prototypes template<typename T> __global__ void Add_inria(T *A, const T *B, int numElements); +template<typename T> __global__ void Add_inria_cplx(T *A, const T *B, int numElements); template<typename T> __global__ void Sub_inria(T *A, const T *B, int numElements); template<typename T> __global__ void Mult_inria(T *A, const T *B, int numElements); +template<typename T> __global__ void Mult_inria_cplx(T *A, const T *B, int numElements); template<typename T> __global__ void Div_inria(T *A, const T *B, int numElements); +template<typename T> __global__ void Div_inria_cplx(T *A, const T *B, int numElements); template<typename T> __global__ void AddConst_inria(T *A, T val, int numElements); template<typename T> __global__ void AddConst_inria_cplx(T *A, T val, int numElements); @@ -65,6 +68,7 @@ template<typename T, typename U> __global__ void Memcpy_inria(T* d_cu_dst, const template<typename T> __global__ void Memset_inria(T* dev_dst, T valeur, int numElements); +template<typename T> __global__ void Memset_inria_cplx(T* dev_dst, T valeur, int numElements); template<typename T> __global__ void Sparse2full_inria(T *dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const T* dev_src_values, const int nnz, const int src_dim1); template<typename T> __global__ void AddSparse2full_inria(T *dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const T* dev_src_values, const int nnz, const int src_dim1); @@ -77,6 +81,7 @@ template<typename T> __global__ void SetSubmatrix_inria(T* mat_dst, T* mat_src, template<typename T> __global__ void RelativeError_inria(T* data_dst, const T* data_src_th, const T* data_src_mes, const int length); +template<typename T> __global__ void RelativeError_inria_cplx(T* data_dst, const T* data_src_th, const T* data_src_mes, const int length); template <typename T> __global__ void sum_reduce(T *g_idata, T *g_odata, unsigned int n); @@ -92,6 +97,15 @@ template<typename T> void kernel_add(T* d_cu1, const T* d_cu2, int length) Add_inria<T><<<blocksPerGrid,threadsPerBlock>>>(d_cu1, d_cu2, length); faust_kernelSafe(); } + +template<typename T> void kernel_add_cplx(T* d_cu1, const T* d_cu2, int length) +{ + int threadsPerBlock = 256; + int blocksPerGrid = (length + threadsPerBlock - 1) / threadsPerBlock; + Add_inria_cplx<T><<<blocksPerGrid,threadsPerBlock>>>(d_cu1, d_cu2, length); + faust_kernelSafe(); +} + template<typename T> void kernel_sub(T* d_cu1, const T* d_cu2, int length) { int threadsPerBlock = 256; @@ -106,6 +120,13 @@ template<typename T> void kernel_mult(T* d_cu1, const T* d_cu2, int length) Mult_inria<T><<<blocksPerGrid,threadsPerBlock>>>(d_cu1, d_cu2, length); faust_kernelSafe(); } +template<typename T> void kernel_mult_cplx(T* d_cu1, const T* d_cu2, int length) +{ + int threadsPerBlock = 256; + int blocksPerGrid = (length + threadsPerBlock - 1) / threadsPerBlock; + Mult_inria_cplx<T><<<blocksPerGrid,threadsPerBlock>>>(d_cu1, d_cu2, length); + faust_kernelSafe(); +} template<typename T> void kernel_div(T* d_cu1, const T* d_cu2, int length) { int threadsPerBlock = 256; @@ -113,7 +134,13 @@ template<typename T> void kernel_div(T* d_cu1, const T* d_cu2, int length) Div_inria<T><<<blocksPerGrid,threadsPerBlock>>>(d_cu1, d_cu2, length); faust_kernelSafe(); } - +template<typename T> void kernel_div_cplx(T* d_cu1, const T* d_cu2, int length) +{ + int threadsPerBlock = 256; + int blocksPerGrid = (length + threadsPerBlock - 1) / threadsPerBlock; + Div_inria_cplx<T><<<blocksPerGrid,threadsPerBlock>>>(d_cu1, d_cu2, length); + faust_kernelSafe(); +} template<typename T> void kernel_add_const(T* d_cu1, T valeur, int length) { int threadsPerBlock = 256; @@ -223,6 +250,14 @@ template<typename T> void kernel_memset(T* d_cu_dst, T valeur, int length) faust_kernelSafe(); } +template<typename T> void kernel_memset_cplx(T* d_cu_dst, T valeur, int length) +{ + int threadsPerBlock = 256; + int blocksPerGrid = (length + threadsPerBlock - 1) / threadsPerBlock; + Memset_inria_cplx<T><<<blocksPerGrid,threadsPerBlock>>>(d_cu_dst, valeur, length); + faust_kernelSafe(); +} + template<typename T> void kernel_sparse2full(T *dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const T* dev_src_values, const int nnz, const int src_dim1, const int src_dim2) { int threadsPerBlock = 256; @@ -284,6 +319,14 @@ template<typename T> void kernel_relative_error(T* data_dst, const T* data_src_t faust_kernelSafe(); } +template<typename T> void kernel_relative_error_cplx(T* data_dst, const T* data_src_th, const T* data_src_mes, const int nb_elements) +{ + int threadsPerBlock = 256; + int blocksPerGrid = (nb_elements + threadsPerBlock - 1) / threadsPerBlock; + RelativeError_inria_cplx<T><<<blocksPerGrid,threadsPerBlock>>>(data_dst, data_src_th, data_src_mes, nb_elements); + faust_kernelSafe(); +} + template <typename T> void kernel_sum_reduce(T *g_idata, T *g_odata, unsigned int n) { int threadsPerBlock = 256; @@ -324,6 +367,19 @@ template<typename T> __global__ void Add_inria(T* A, const T* B, int numElements i += gridDim.x * blockDim.x; } } + +template<typename T> __global__ void Add_inria_cplx(T* A, const T* B, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + while (i < numElements) + { + A[i].x = A[i].x + B[i].x; + A[i].y = A[i].y + B[i].y; + i += gridDim.x * blockDim.x; + } +} + template<typename T> __global__ void Sub_inria(T* A, const T* B, int numElements) { int i = blockDim.x * blockIdx.x + threadIdx.x; @@ -341,9 +397,22 @@ template<typename T> __global__ void Mult_inria(T *A, const T *B, int numElement while (i < numElements) { A[i] = A[i] * B[i]; - i += gridDim.x * blockDim.x; + i += gridDim.x * blockDim.x; } } + +template<typename T> __global__ void Mult_inria_cplx(T *A, const T *B, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + while (i < numElements) + { + A[i].x = A[i].x * B[i].x; + A[i].y = A[i].y * B[i].y; + i += gridDim.x * blockDim.x; + } +} + template<typename T> __global__ void Div_inria(T *A, const T *B, int numElements) { int i = blockDim.x * blockIdx.x + threadIdx.x; @@ -355,6 +424,18 @@ template<typename T> __global__ void Div_inria(T *A, const T *B, int numElements } } +template<typename T> __global__ void Div_inria_cplx(T *A, const T *B, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + while (i < numElements) + { + A[i].x = A[i].x / B[i].x; + A[i].y = A[i].y / B[i].y; + i += gridDim.x * blockDim.x; + } +} + template<typename T> __global__ void AddConst_inria(T *A, T val, int numElements) { int i = blockDim.x * blockIdx.x + threadIdx.x; @@ -508,6 +589,18 @@ template<typename T> __global__ void Memset_inria(T* dev_dst, T valeur, int numE } } +template<typename T> __global__ void Memset_inria_cplx(T* dev_dst, T valeur, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + while (i < numElements) + { + dev_dst[i].x = valeur.x ; + dev_dst[i].y = valeur.y ; + i += gridDim.x * blockDim.x; + } +} + template<typename T> __global__ void Sparse2full_inria(T *dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const T* dev_src_values, const int nnz, const int src_dim1) { int i = blockDim.x * blockIdx.x + threadIdx.x; @@ -603,6 +696,17 @@ template<typename T> __global__ void RelativeError_inria(T* data_dst, const T* d } +template<typename T> __global__ void RelativeError_inria_cplx(T* data_dst, const T* data_src_th, const T* data_src_mes, const int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + while(i<numElements) + { + data_dst[i].x = fabs((data_src_th[i].x-data_src_mes[i].x)/data_src_th[i].x); + data_dst[i].y = fabs((data_src_th[i].y-data_src_mes[i].y)/data_src_th[i].y); + i += gridDim.x * blockDim.x; + } +} + /*template<typename T> __global__ void SetLinfAR1(T* mat_Linf, T* mat_Hinf, T* atur, int nb_n, int nb_elements) { int ind = blockDim.x*blockIdx.x+threadIdx.x; diff --git a/src/kernels.h b/src/kernels.h index e27108d..702a085 100644 --- a/src/kernels.h +++ b/src/kernels.h @@ -49,12 +49,15 @@ // for k=0 to (length-1), d_cu1[k] += d_cu2[k] template<typename T> void kernel_add(T* d_cu1, const T* d_cu2, int length); +template<typename T> void kernel_add_cplx(T* d_cu1, const T* d_cu2, int length); // for k=0 to (length-1), d_cu1[k] -= d_cu2[k] template<typename T> void kernel_sub(T* d_cu1, const T* d_cu2, int length); // for k=0 to (length-1), d_cu1[k] *= d_cu2[k] template<typename T> void kernel_mult(T* d_cu1, const T* d_cu2, int length); +template<typename T> void kernel_mult_cplx(T* d_cu1, const T* d_cu2, int length); // for k=0 to (length-1), d_cu1[k] /= d_cu2[k] template<typename T> void kernel_div(T* d_cu1, const T* d_cu2, int length); +template<typename T> void kernel_div_cplx(T* d_cu1, const T* d_cu2, int length); // for k=0 to (length-1), d_cu1[k] += valeur template<typename T> void kernel_add_const(T* d_cu1, T valeur, int length); @@ -83,6 +86,7 @@ template<typename T> void kernel_abs_cplx(T* d_cu1, const int length); template<typename T, typename U> void kernel_memcpy(T* d_cu_dst, const U* d_cu_src, int length); // for k=0 to (length-1), d_cu_dst[k] = valeur template<typename T> void kernel_memset(T* d_cu_dst, T valeur, int length); +template<typename T> void kernel_memset_cplx(T* d_cu_dst, T valeur, int length); // convert sparse matrix (nnz, dev_src_rowind, dev_src_colind, dev_src_values) in 1-based indexing to full matrix dev_dst // dev_dst=0; for k=0 to (nnz-1), dev_dst[(dev_src_colind[k]-1)*src_dim1+(dev_src_rowind[k]-1)] = dev_src_values[k] @@ -107,6 +111,7 @@ template<typename T> void kernel_set_submatrix(T* mat_dst, T* mat_src, int src_d // for k=0 to (length-1 )data_dst[i] = abs((data_src_th[i]-data_src_mes[i])/data_src_th[i]); template<typename T> void kernel_relative_error(T* data_dst, const T* data_src_th, const T* data_src_mes, const int length); +template<typename T> void kernel_relative_error_cplx(T* data_dst, const T* data_src_th, const T* data_src_mes, const int length); -- GitLab