Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 8ca251cc authored by hhadjdji's avatar hhadjdji
Browse files

Add new operations to cuMatDs and link them into the C interface.

- Matrix element-wise multiplication and division (elt_wise_mul/div).
- Matrix initialization of all coefficients to a constant (set_val).
- Matrix sum of all coefficients (sum).
- Matrix mean relative error (mean_relerr) against a reference matrix.
- New corresponding kernel specializations for cuda complex types.
parent 42a826d6
Branches
Tags
No related merge requests found
......@@ -26,3 +26,54 @@ void cuMatDs<@GM_SCALAR@>::abs()
#endif
switch_back();
}
template<>
void cuMatDs<@GM_SCALAR@>::elt_wise_div(const cuMatDs<@GM_SCALAR@>& M)
{
if(nrows != M.nrows || ncols != M.ncols)
throw std::runtime_error("Dimensions must agree.");
#if(IS_COMPLEX==1)
kernel_div_cplx(data, M.data, numel());
#else
kernel_div(data, M.data, numel());
#endif
}
template<>
void cuMatDs<@GM_SCALAR@>::elt_wise_mul(const cuMatDs<@GM_SCALAR@>& M)
{
if(nrows != M.nrows || ncols != M.ncols)
throw std::runtime_error("Dimensions must agree.");
#if(IS_COMPLEX==1)
kernel_mult_cplx(data, M.data, numel());
#else
kernel_mult(data, M.data, numel());
#endif
}
template<>
void cuMatDs<@GM_SCALAR@>::set_val(const @GM_SCALAR@& val)
{
#if(IS_COMPLEX==1)
kernel_memset_cplx(data, val, numel());
#else
kernel_memset(data, val, numel());
#endif
}
template<>
@GM_SCALAR@ cuMatDs<@GM_SCALAR@>::mean_relerr(const cuMatDs<@GM_SCALAR@>& ref)
{
if(ref.nrows != nrows || ref.ncols != ncols)
throw std::runtime_error("Dimensions must agree.");
cuMatDs<@GM_SCALAR@>* rel_err = cuMatDs<@GM_SCALAR@>::create(nrows, ncols);
#if(IS_COMPLEX==1)
kernel_relative_error_cplx(rel_err->data, ref.data, data, numel());
#else
kernel_relative_error(rel_err->data, ref.data, data, numel());
#endif
auto s = rel_err->sum();
auto err = s/Real<@GM_SCALAR@>(numel());
delete rel_err;
return err;
}
......@@ -46,9 +46,10 @@ struct cuMatDs : cuMat<T>
cuMatDs<T>* mul(cuMatDs<T>&, cuMatDs<T>* output = nullptr, gm_Op op_this = OP_NOTRANSP, gm_Op op_other = OP_NOTRANSP);
void mul(cuMatDs<T>&, hMatDs<T>& output, gm_Op op_this = OP_NOTRANSP, gm_Op op_other = OP_NOTRANSP);
void mul(cuMatDs<T>&, T* output, gm_Op op_this = OP_NOTRANSP, gm_Op op_other = OP_NOTRANSP);
cuMatDs<T>* mul(cuMatSp<T>&, cuMatDs<T>* output = nullptr, gm_Op op_this = OP_NOTRANSP, gm_Op op_other = OP_NOTRANSP);
void mul(const T& scalar);
void elt_wise_mul(const cuMatDs<T>&);
void elt_wise_div(const cuMatDs<T>&);
void apply_op(const gm_Op op);
static cuMatDs<T>* apply_op(const cuMatDs<T>* in, gm_Op op, cuMatDs<T>* out = nullptr);
void transpose();
......@@ -78,12 +79,15 @@ struct cuMatDs : cuMat<T>
void setOnes();
void set_zeros();
void set_eyes();
void set_val(const T& val);
cuMatDs<T>* clone(int32_t dev=-1);
void copy(cuMatDs<T>*) const;
void resize(const int32_t nrows, const int32_t ncols);
T trace();
T coeff_max();
T coeff_min();
T sum();
T mean_relerr(const cuMatDs<T>&);
void abs();
void destroy();
void set_stream(const void* stream);
......
......@@ -305,7 +305,6 @@ cuMatDs<T>* cuMatDs<T>::mul(cuMatSp<T>& other, cuMatDs<T>* output /* = nullptr*/
k = other.ncols;
set_one(&alpha);
bzero(&beta, sizeof(T));
//TODO: refactor to avoid calling cusparseTcsrmm2 for each different cases (set parameters)
if(op_this == op_other && op_this == OP_NOTRANSP)
{
n = this->nrows;
......@@ -703,6 +702,16 @@ T cuMatDs<T>::coeff_min()
return min;
}
template<typename T>
T cuMatDs<T>::sum()
{
auto switch_back = switch_dev(this->dev_id);
auto len = numel();
auto s = faust_cu_sum(data, len);
switch_back();
return s;
}
template<typename T>
T cuMatDs<T>::trace()
{
......
......@@ -4,6 +4,26 @@ extern "C"
{
//TODO: separate in multiple files (one at least per type gm_DenseMat_t, gm_SparseMat_t, gm_MatArray_t)
__DYNLIB_ATTR__ void gm_DenseMat_elt_wise_div_@GM_SCALAR@(gm_DenseMat_t a, gm_DenseMat_t b)
{
static_cast<cuMatDs<@GM_SCALAR@>*>(a)->elt_wise_div(*static_cast<cuMatDs<@GM_SCALAR@>*>(b));
}
__DYNLIB_ATTR__ void gm_DenseMat_setval_@GM_SCALAR@(gm_DenseMat_t a, @GM_SCALAR@* v)
{
static_cast<cuMatDs<@GM_SCALAR@>*>(a)->set_val(*v);
}
__DYNLIB_ATTR__ @GM_SCALAR@ gm_DenseMat_mean_relerr_@GM_SCALAR@(gm_DenseMat_t a, gm_DenseMat_t b, @GM_SCALAR@* mre)
{
*mre = static_cast<cuMatDs<@GM_SCALAR@>*>(a)->mean_relerr(*static_cast<cuMatDs<@GM_SCALAR@>*>(b));
}
__DYNLIB_ATTR__ void gm_DenseMat_elt_wise_mul_@GM_SCALAR@(gm_DenseMat_t a, gm_DenseMat_t b)
{
static_cast<cuMatDs<@GM_SCALAR@>*>(a)->elt_wise_mul(*static_cast<cuMatDs<@GM_SCALAR@>*>(b));
}
__DYNLIB_ATTR__ void gm_DenseMat_setzeros_@GM_SCALAR@(gm_DenseMat_t src_mat)
{
static_cast<cuMatDs<@GM_SCALAR@>*>(src_mat)->set_zeros();
......@@ -220,6 +240,11 @@ extern "C"
return static_cast<cuMatDs<@GM_SCALAR@>*>(mat)->norm_l1();
}
__DYNLIB_ATTR__ void gm_DenseMat_sum_@GM_SCALAR@(gm_DenseMat_t mat, @GM_SCALAR@* sum)
{
*sum = static_cast<cuMatDs<@GM_SCALAR@>*>(mat)->sum();
}
__DYNLIB_ATTR__ @GM_SCALAR@ gm_DenseMat_power_iteration_@GM_SCALAR@(gm_DenseMat_t mat, float threshold, int32_t max_iter)
{
return static_cast<cuMatDs<@GM_SCALAR@>*>(mat)->power_iteration(threshold, max_iter);
......
......@@ -90,13 +90,13 @@ typedef @GM_SCALAR_REAL@ (*gm_DenseMat_norm_l1_@GM_SCALAR@_ptr)(gm_DenseMat_t);
typedef @GM_SCALAR_REAL@ (*gm_DenseMat_norm_frob_@GM_SCALAR@_ptr)(gm_DenseMat_t);
/** Computes the spectral norm */
typedef @GM_SCALAR_REAL@ (*gm_DenseMat_norm_spectral_@GM_SCALAR@_ptr)(gm_DenseMat_t, float threshold, int32_t max_iter);
/** Multiplies the first argument GPU matrix to the second argument GPU matrix (both are dense).*/
/** Multiplies the first argument GPU matrix to the second argument GPU matrix (both are denses).*/
typedef gm_DenseMat_t (*gm_DenseMat_mul_gpu_dsm_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_DenseMat_t);
/** Multiplies the first argument GPU matrix to the second argument GPU matrix (both are dense) -- extended version (ops and output matrix).*/
/** Multiplies the first argument GPU matrix to the second argument GPU matrix (both are denses) -- extended version (ops and output matrix).*/
typedef gm_DenseMat_t (*gm_DenseMat_mul_gpu_dsm_ext_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_DenseMat_t, gm_DenseMat_t output, gm_Op, gm_Op);
/** Multiplies the first argument GPU matrix to the second argument GPU matrix (both are dense) and outputs the result in a CPU buffer.*/
/** Multiplies the first argument GPU matrix to the second argument GPU matrix (both are denses) and outputs the result in a CPU buffer.*/
typedef gm_DenseMat_t (*gm_DenseMat_mul_gpu_dsm_tocpu_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_DenseMat_t, @GM_SCALAR@* out);
/** Multiplies the first argument GPU matrix to the second argument GPU matrix (both are dense) and outputs the result in a CPU buffer -- extended version (gm_Op applied to matrices).*/
/** Multiplies the first argument GPU matrix to the second argument GPU matrix (both are denses) and outputs the result in a CPU buffer -- extended version (gm_Op applied to matrices).*/
typedef gm_DenseMat_t (*gm_DenseMat_mul_gpu_dsm_tocpu_ext_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_DenseMat_t, @GM_SCALAR@* out, gm_Op, gm_Op);
/** Multiplies the first argument GPU dense matrix to the second argument GPU sparse matrix. */
typedef gm_DenseMat_t (*gm_DenseMat_mul_gpu_spm_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_SparseMat_t);
......@@ -126,6 +126,11 @@ typedef void (*gm_DenseMat_mv_to_gpu_@GM_SCALAR@_ptr)(gm_DenseMat_t, int32_t dev
typedef void (*gm_DenseMat_gemm_@GM_SCALAR@_ptr)(const gm_DenseMat_t, const gm_DenseMat_t, gm_DenseMat_t, const @GM_SCALAR@ *alpha, const @GM_SCALAR@ *beta, const gm_Op, const gm_Op);
typedef int32_t (*gm_DenseMat_get_dev_@GM_SCALAR@_ptr)(gm_DenseMat_t);
typedef size_t (*gm_DenseMat_get_nnz_@GM_SCALAR@_ptr)(gm_DenseMat_t);
typedef void (*gm_DenseMat_sum_@GM_SCALAR@_ptr)(gm_DenseMat_t, @GM_SCALAR@* sum);
typedef void (*gm_DenseMat_elt_wise_mul_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_DenseMat_t);
typedef void (*gm_DenseMat_elt_wise_div_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_DenseMat_t);
typedef void (*gm_DenseMat_setval_@GM_SCALAR@_ptr)(gm_DenseMat_t, @GM_SCALAR@* v);
typedef @GM_SCALAR@ (*gm_DenseMat_mean_relerr_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_DenseMat_t, @GM_SCALAR@* mre);
struct gm_DenseMatFunc_@GM_SCALAR@
{
......@@ -168,6 +173,7 @@ struct gm_DenseMatFunc_@GM_SCALAR@
gm_DenseMat_setones_@GM_SCALAR@_ptr setones;
gm_DenseMat_setzeros_@GM_SCALAR@_ptr setzeros;
gm_DenseMat_seteyes_@GM_SCALAR@_ptr seteyes;
gm_DenseMat_setval_@GM_SCALAR@_ptr setval;
gm_DenseMat_transpose_@GM_SCALAR@_ptr transpose;
gm_DenseMat_conjugate_@GM_SCALAR@_ptr conjugate;
gm_DenseMat_abs_@GM_SCALAR@_ptr abs;
......@@ -176,6 +182,10 @@ struct gm_DenseMatFunc_@GM_SCALAR@
gm_DenseMat_gemm_@GM_SCALAR@_ptr gemm;
gm_DenseMat_get_dev_@GM_SCALAR@_ptr get_dev;
gm_DenseMat_get_nnz_@GM_SCALAR@_ptr get_nnz;
gm_DenseMat_sum_@GM_SCALAR@_ptr sum;
gm_DenseMat_elt_wise_mul_@GM_SCALAR@_ptr elt_wise_mul;
gm_DenseMat_elt_wise_div_@GM_SCALAR@_ptr elt_wise_div;
gm_DenseMat_mean_relerr_@GM_SCALAR@_ptr mean_relerr;
};
/** MatArray function pointers ********************************************************************/
......
......@@ -78,6 +78,11 @@ void load_dsm_funcs_@GM_SCALAR@(void* gm_handle, gm_DenseMatFunc_@GM_SCALAR@ *ds
dsm_funcs->gemm = (gm_DenseMat_gemm_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle,"gm_DenseMat_gemm_@GM_SCALAR@");
dsm_funcs->get_dev = (gm_DenseMat_get_dev_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle, "gm_DenseMat_get_dev_@GM_SCALAR@");
dsm_funcs->get_nnz = (gm_DenseMat_get_nnz_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle, "gm_DenseMat_get_nnz_@GM_SCALAR@");
dsm_funcs->sum = (gm_DenseMat_sum_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle, "gm_DenseMat_sum_@GM_SCALAR@");
dsm_funcs->elt_wise_mul = (gm_DenseMat_elt_wise_mul_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle, "gm_DenseMat_elt_wise_mul_@GM_SCALAR@");
dsm_funcs->elt_wise_div = (gm_DenseMat_elt_wise_div_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle, "gm_DenseMat_elt_wise_div_@GM_SCALAR@");
dsm_funcs->setval = (gm_DenseMat_setval_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle, "gm_DenseMat_setval_@GM_SCALAR@");
dsm_funcs->mean_relerr = (gm_DenseMat_mean_relerr_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle, "gm_DenseMat_mean_relerr_@GM_SCALAR@");
}
void load_marr_funcs_@GM_SCALAR@(void* gm_handle, gm_MatArrayFunc_@GM_SCALAR@ *marr_funcs)
......
......@@ -43,7 +43,9 @@
template void kernel_add<float>(float* d_cu1, const float* d_cu2, int length);
template void kernel_sub<float>(float* d_cu1, const float* d_cu2, int length);
template void kernel_mult<float>(float* d_cu1, const float* d_cu2, int length);
template void kernel_mult_cplx<float2>(float2* d_cu1, const float2* d_cu2, int length);
template void kernel_div<float>(float* d_cu1, const float* d_cu2, int length);
template void kernel_div_cplx<float2>(float2* d_cu1, const float2* d_cu2, int length);
template void kernel_add_const<float>(float* d_cu1, float valeur, int length);
template void kernel_add_const_cplx<float2>(float2* d_cu1, float2 valeur, int length);
template void kernel_sub_const<float>(float* d_cu1, float valeur, int length);
......@@ -58,6 +60,7 @@ template void kernel_abs<float>(float* d_cu1, const int length);
template void kernel_abs_cplx<float2>(float2* d_cu1, const int length);
template void kernel_memcpy<float,float>(float* d_cu_dst, const float* d_cu_src, int length);
template void kernel_memset<float>(float* d_cu_dst, float valeur, int length);
template void kernel_memset_cplx<float2>(float2* d_cu_dst, float2 valeur, int length);
template void kernel_sparse2full<float>(float* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const float* dev_src_values, const int nnz, const int src_dim1, const int src_dim2);
template void kernel_add_sparse2full<float>(float* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const float* dev_src_values, const int nnz, const int src_dim1);
template void kernel_sub_sparse2full<float>(float* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const float* dev_src_values, const int nnz, const int src_dim1);
......@@ -67,11 +70,14 @@ template void kernel_add_diag_const<float>(float* d_cu1, float val, int dim1);
template void kernel_copy_diag<float>(float* d_cu_dst, float* d_cu_src, int dim1);
template void kernel_set_submatrix<float>(float* mat_dst, float* mat_src, int src_dim1, int r1, int c1, int nb_rows, int nb_col);
template void kernel_relative_error<float>(float* data_dst, const float* data_src_th, const float* data_src_mes, const int length);
template void kernel_relative_error_cplx<float2>(float2* data_dst, const float2* data_src_th, const float2* data_src_mes, const int length);
template void kernel_add<double>(double* d_cu1, const double* d_cu2, int length);
template void kernel_sub<double>(double* d_cu1, const double* d_cu2, int length);
template void kernel_mult<double>(double* d_cu1, const double* d_cu2, int length);
template void kernel_mult_cplx<double2>(double2* d_cu1, const double2* d_cu2, int length);
template void kernel_div<double>(double* d_cu1, const double* d_cu2, int length);
template void kernel_div_cplx<double2>(double2* d_cu1, const double2* d_cu2, int length);
template void kernel_add_const<double>(double* d_cu1, double valeur, int length);
template void kernel_add_const_cplx<double2>(double2* d_cu1, double2 valeur, int length);
template void kernel_sub_const<double>(double* d_cu1, double valeur, int length);
......@@ -86,6 +92,7 @@ template void kernel_abs<double>(double* d_cu1, const int length);
template void kernel_abs_cplx<double2>(double2* d_cu1, const int length);
template void kernel_memcpy<double,double>(double* d_cu_dst, const double* d_cu_src, int length);
template void kernel_memset<double>(double* d_cu_dst, double valeur, int length);
template void kernel_memset_cplx<double2>(double2* d_cu_dst, double2 valeur, int length);
template void kernel_sparse2full<double>(double* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const double* dev_src_values, const int nnz, const int src_dim1, const int src_dim2);
template void kernel_add_sparse2full<double>(double* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const double* dev_src_values, const int nnz, const int src_dim1);
template void kernel_sub_sparse2full<double>(double* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const double* dev_src_values, const int nnz, const int src_dim1);
......@@ -95,6 +102,7 @@ template void kernel_add_diag_const<double>(double* d_cu1, double val, int dim1)
template void kernel_copy_diag<double>(double* d_cu_dst, double* d_cu_src, int dim1);
template void kernel_set_submatrix<double>(double* mat_dst, double* mat_src, int src_dim1, int r1, int c1, int nb_rows, int nb_col);
template void kernel_relative_error<double>(double* data_dst, const double* data_src_th, const double* data_src_mes, const int length);
template void kernel_relative_error_cplx<double2>(double2* data_dst, const double2* data_src_th, const double2* data_src_mes, const int length);
template void kernel_add<int>(int* d_cu1, const int* d_cu2, int length);
template void kernel_sub<int>(int* d_cu1, const int* d_cu2, int length);
......@@ -115,6 +123,8 @@ template void kernel_set_submatrix<int>(int* mat_dst, int* mat_src, int src_dim1
template void kernel_memcpy<double,float>(double* d_cu_dst, const float* d_cu_src, int length);
template void kernel_memcpy<float,double>(float* d_cu_dst, const double* d_cu_src, int length);
template void kernel_add_cplx<float2>(float2* d_cu1, const float2* d_cu2, int length);
template void kernel_add_cplx<double2>(double2* d_cu1, const double2* d_cu2, int length);
......@@ -133,11 +143,13 @@ template __global__ void Inv_inria<float>(float* A, int numElements);
template __global__ void Abs_inria<float>(float* A, int numElements);
template __global__ void Memcpy_inria<float,float>(float* d_cu_dst, const float* d_cu_src, int length);
template __global__ void Memset_inria<float>(float* dev_dst, float valeur, int numElements);
template __global__ void Memset_inria_cplx<float2>(float2* dev_dst, float2 valeur, int numElements);
template __global__ void Sparse2full_inria<float>(float* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const float* dev_src_values, const int nnz, const int src_dim1);
template __global__ void AddDiagConst_inria<float>(float* dev_dst, float val, int dim1);
template __global__ void GetDiag_inria<float>(float* dst_diag, const float* src_M, int dim1, int dlen);
template __global__ void SetSubmatrix_inria<float>(float* mat_dst, float* mat_src, int src_dim1, int r1, int c1, int nb_rows, int nb_elements);
template __global__ void RelativeError_inria<float>(float* data_dst, const float* data_src_th, const float* data_src_mes, const int length);
template __global__ void RelativeError_inria_cplx<float2>(float2* data_dst, const float2* data_src_th, const float2* data_src_mes, const int length);
template __global__ void Add_inria<double>(double* A, const double *B, int numElements);
template __global__ void Sub_inria<double>(double* A, const double *B, int numElements);
......@@ -152,11 +164,13 @@ template __global__ void Inv_inria<double>(double* A, int numElements);
template __global__ void Abs_inria<double>(double* A, int numElements);
template __global__ void Memcpy_inria<double,double>(double* d_cu_dst, const double* d_cu_src, int length);
template __global__ void Memset_inria<double>(double* dev_dst, double valeur, int numElements);
template __global__ void Memset_inria_cplx<double2>(double2* dev_dst, double2 valeur, int numElements);
template __global__ void Sparse2full_inria<double>(double* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const double* dev_src_values, const int nnz, const int src_dim1);
template __global__ void AddDiagConst_inria<double>(double* dev_dst, double val, int dim1);
template __global__ void GetDiag_inria<double>(double* dst_diag, const double* src_M, int dim1, int dlen);
template __global__ void SetSubmatrix_inria<double>(double* mat_dst, double* mat_src, int src_dim1, int r1, int c1, int nb_rows, int nb_elements);
template __global__ void RelativeError_inria<double>(double* data_dst, const double* data_src_th, const double* data_src_mes, const int length);
template __global__ void RelativeError_inria_cplx<double2>(double2* data_dst, const double2* data_src_th, const double2* data_src_mes, const int length);
template __global__ void Add_inria<int>(int* A, const int *B, int numElements);
template __global__ void Sub_inria<int>(int* A, const int *B, int numElements);
......@@ -184,3 +198,10 @@ template __global__ void finish_min_max_reduce<double2>(double2 *g_idata, double
template __global__ void Abs_inria_cplx<double2>(double2* A, int numElements);
template __global__ void Abs_inria_cplx<float2>(float2* A, int numElements);
template __global__ void Add_inria_cplx<float2>(float2* A, const float2 *B, int numElements);
template __global__ void Add_inria_cplx<double2>(double2* A, const double2 *B, int numElements);
template __global__ void Mult_inria_cplx<float2>(float2* A, const float2 *B, int numElements);
template __global__ void Mult_inria_cplx<double2>(double2* A, const double2 *B, int numElements);
template __global__ void Div_inria_cplx<float2>(float2* A, const float2 *B, int numElements);
template __global__ void Div_inria_cplx<double2>(double2* A, const double2 *B, int numElements);
......@@ -43,9 +43,12 @@
//prototypes
template<typename T> __global__ void Add_inria(T *A, const T *B, int numElements);
template<typename T> __global__ void Add_inria_cplx(T *A, const T *B, int numElements);
template<typename T> __global__ void Sub_inria(T *A, const T *B, int numElements);
template<typename T> __global__ void Mult_inria(T *A, const T *B, int numElements);
template<typename T> __global__ void Mult_inria_cplx(T *A, const T *B, int numElements);
template<typename T> __global__ void Div_inria(T *A, const T *B, int numElements);
template<typename T> __global__ void Div_inria_cplx(T *A, const T *B, int numElements);
template<typename T> __global__ void AddConst_inria(T *A, T val, int numElements);
template<typename T> __global__ void AddConst_inria_cplx(T *A, T val, int numElements);
......@@ -65,6 +68,7 @@ template<typename T, typename U> __global__ void Memcpy_inria(T* d_cu_dst, const
template<typename T> __global__ void Memset_inria(T* dev_dst, T valeur, int numElements);
template<typename T> __global__ void Memset_inria_cplx(T* dev_dst, T valeur, int numElements);
template<typename T> __global__ void Sparse2full_inria(T *dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const T* dev_src_values, const int nnz, const int src_dim1);
template<typename T> __global__ void AddSparse2full_inria(T *dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const T* dev_src_values, const int nnz, const int src_dim1);
......@@ -77,6 +81,7 @@ template<typename T> __global__ void SetSubmatrix_inria(T* mat_dst, T* mat_src,
template<typename T> __global__ void RelativeError_inria(T* data_dst, const T* data_src_th, const T* data_src_mes, const int length);
template<typename T> __global__ void RelativeError_inria_cplx(T* data_dst, const T* data_src_th, const T* data_src_mes, const int length);
template <typename T> __global__ void sum_reduce(T *g_idata, T *g_odata, unsigned int n);
......@@ -92,6 +97,15 @@ template<typename T> void kernel_add(T* d_cu1, const T* d_cu2, int length)
Add_inria<T><<<blocksPerGrid,threadsPerBlock>>>(d_cu1, d_cu2, length);
faust_kernelSafe();
}
template<typename T> void kernel_add_cplx(T* d_cu1, const T* d_cu2, int length)
{
int threadsPerBlock = 256;
int blocksPerGrid = (length + threadsPerBlock - 1) / threadsPerBlock;
Add_inria_cplx<T><<<blocksPerGrid,threadsPerBlock>>>(d_cu1, d_cu2, length);
faust_kernelSafe();
}
template<typename T> void kernel_sub(T* d_cu1, const T* d_cu2, int length)
{
int threadsPerBlock = 256;
......@@ -106,6 +120,13 @@ template<typename T> void kernel_mult(T* d_cu1, const T* d_cu2, int length)
Mult_inria<T><<<blocksPerGrid,threadsPerBlock>>>(d_cu1, d_cu2, length);
faust_kernelSafe();
}
template<typename T> void kernel_mult_cplx(T* d_cu1, const T* d_cu2, int length)
{
int threadsPerBlock = 256;
int blocksPerGrid = (length + threadsPerBlock - 1) / threadsPerBlock;
Mult_inria_cplx<T><<<blocksPerGrid,threadsPerBlock>>>(d_cu1, d_cu2, length);
faust_kernelSafe();
}
template<typename T> void kernel_div(T* d_cu1, const T* d_cu2, int length)
{
int threadsPerBlock = 256;
......@@ -113,7 +134,13 @@ template<typename T> void kernel_div(T* d_cu1, const T* d_cu2, int length)
Div_inria<T><<<blocksPerGrid,threadsPerBlock>>>(d_cu1, d_cu2, length);
faust_kernelSafe();
}
template<typename T> void kernel_div_cplx(T* d_cu1, const T* d_cu2, int length)
{
int threadsPerBlock = 256;
int blocksPerGrid = (length + threadsPerBlock - 1) / threadsPerBlock;
Div_inria_cplx<T><<<blocksPerGrid,threadsPerBlock>>>(d_cu1, d_cu2, length);
faust_kernelSafe();
}
template<typename T> void kernel_add_const(T* d_cu1, T valeur, int length)
{
int threadsPerBlock = 256;
......@@ -223,6 +250,14 @@ template<typename T> void kernel_memset(T* d_cu_dst, T valeur, int length)
faust_kernelSafe();
}
template<typename T> void kernel_memset_cplx(T* d_cu_dst, T valeur, int length)
{
int threadsPerBlock = 256;
int blocksPerGrid = (length + threadsPerBlock - 1) / threadsPerBlock;
Memset_inria_cplx<T><<<blocksPerGrid,threadsPerBlock>>>(d_cu_dst, valeur, length);
faust_kernelSafe();
}
template<typename T> void kernel_sparse2full(T *dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const T* dev_src_values, const int nnz, const int src_dim1, const int src_dim2)
{
int threadsPerBlock = 256;
......@@ -284,6 +319,14 @@ template<typename T> void kernel_relative_error(T* data_dst, const T* data_src_t
faust_kernelSafe();
}
template<typename T> void kernel_relative_error_cplx(T* data_dst, const T* data_src_th, const T* data_src_mes, const int nb_elements)
{
int threadsPerBlock = 256;
int blocksPerGrid = (nb_elements + threadsPerBlock - 1) / threadsPerBlock;
RelativeError_inria_cplx<T><<<blocksPerGrid,threadsPerBlock>>>(data_dst, data_src_th, data_src_mes, nb_elements);
faust_kernelSafe();
}
template <typename T> void kernel_sum_reduce(T *g_idata, T *g_odata, unsigned int n)
{
int threadsPerBlock = 256;
......@@ -324,6 +367,19 @@ template<typename T> __global__ void Add_inria(T* A, const T* B, int numElements
i += gridDim.x * blockDim.x;
}
}
template<typename T> __global__ void Add_inria_cplx(T* A, const T* B, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
while (i < numElements)
{
A[i].x = A[i].x + B[i].x;
A[i].y = A[i].y + B[i].y;
i += gridDim.x * blockDim.x;
}
}
template<typename T> __global__ void Sub_inria(T* A, const T* B, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -341,9 +397,22 @@ template<typename T> __global__ void Mult_inria(T *A, const T *B, int numElement
while (i < numElements)
{
A[i] = A[i] * B[i];
i += gridDim.x * blockDim.x;
i += gridDim.x * blockDim.x;
}
}
template<typename T> __global__ void Mult_inria_cplx(T *A, const T *B, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
while (i < numElements)
{
A[i].x = A[i].x * B[i].x;
A[i].y = A[i].y * B[i].y;
i += gridDim.x * blockDim.x;
}
}
template<typename T> __global__ void Div_inria(T *A, const T *B, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -355,6 +424,18 @@ template<typename T> __global__ void Div_inria(T *A, const T *B, int numElements
}
}
template<typename T> __global__ void Div_inria_cplx(T *A, const T *B, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
while (i < numElements)
{
A[i].x = A[i].x / B[i].x;
A[i].y = A[i].y / B[i].y;
i += gridDim.x * blockDim.x;
}
}
template<typename T> __global__ void AddConst_inria(T *A, T val, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -508,6 +589,18 @@ template<typename T> __global__ void Memset_inria(T* dev_dst, T valeur, int numE
}
}
template<typename T> __global__ void Memset_inria_cplx(T* dev_dst, T valeur, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
while (i < numElements)
{
dev_dst[i].x = valeur.x ;
dev_dst[i].y = valeur.y ;
i += gridDim.x * blockDim.x;
}
}
template<typename T> __global__ void Sparse2full_inria(T *dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const T* dev_src_values, const int nnz, const int src_dim1)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -603,6 +696,17 @@ template<typename T> __global__ void RelativeError_inria(T* data_dst, const T* d
}
template<typename T> __global__ void RelativeError_inria_cplx(T* data_dst, const T* data_src_th, const T* data_src_mes, const int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
while(i<numElements)
{
data_dst[i].x = fabs((data_src_th[i].x-data_src_mes[i].x)/data_src_th[i].x);
data_dst[i].y = fabs((data_src_th[i].y-data_src_mes[i].y)/data_src_th[i].y);
i += gridDim.x * blockDim.x;
}
}
/*template<typename T> __global__ void SetLinfAR1(T* mat_Linf, T* mat_Hinf, T* atur, int nb_n, int nb_elements)
{
int ind = blockDim.x*blockIdx.x+threadIdx.x;
......
......@@ -49,12 +49,15 @@
// for k=0 to (length-1), d_cu1[k] += d_cu2[k]
template<typename T> void kernel_add(T* d_cu1, const T* d_cu2, int length);
template<typename T> void kernel_add_cplx(T* d_cu1, const T* d_cu2, int length);
// for k=0 to (length-1), d_cu1[k] -= d_cu2[k]
template<typename T> void kernel_sub(T* d_cu1, const T* d_cu2, int length);
// for k=0 to (length-1), d_cu1[k] *= d_cu2[k]
template<typename T> void kernel_mult(T* d_cu1, const T* d_cu2, int length);
template<typename T> void kernel_mult_cplx(T* d_cu1, const T* d_cu2, int length);
// for k=0 to (length-1), d_cu1[k] /= d_cu2[k]
template<typename T> void kernel_div(T* d_cu1, const T* d_cu2, int length);
template<typename T> void kernel_div_cplx(T* d_cu1, const T* d_cu2, int length);
// for k=0 to (length-1), d_cu1[k] += valeur
template<typename T> void kernel_add_const(T* d_cu1, T valeur, int length);
......@@ -83,6 +86,7 @@ template<typename T> void kernel_abs_cplx(T* d_cu1, const int length);
template<typename T, typename U> void kernel_memcpy(T* d_cu_dst, const U* d_cu_src, int length);
// for k=0 to (length-1), d_cu_dst[k] = valeur
template<typename T> void kernel_memset(T* d_cu_dst, T valeur, int length);
template<typename T> void kernel_memset_cplx(T* d_cu_dst, T valeur, int length);
// convert sparse matrix (nnz, dev_src_rowind, dev_src_colind, dev_src_values) in 1-based indexing to full matrix dev_dst
// dev_dst=0; for k=0 to (nnz-1), dev_dst[(dev_src_colind[k]-1)*src_dim1+(dev_src_rowind[k]-1)] = dev_src_values[k]
......@@ -107,6 +111,7 @@ template<typename T> void kernel_set_submatrix(T* mat_dst, T* mat_src, int src_d
// for k=0 to (length-1 )data_dst[i] = abs((data_src_th[i]-data_src_mes[i])/data_src_th[i]);
template<typename T> void kernel_relative_error(T* data_dst, const T* data_src_th, const T* data_src_mes, const int length);
template<typename T> void kernel_relative_error_cplx(T* data_dst, const T* data_src_th, const T* data_src_mes, const int length);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment