From 8ca251cc858c3bb6a746a8cf6965e5657d92f39f Mon Sep 17 00:00:00 2001
From: hhadjdji <hakim.hadj-djilani@inria.fr>
Date: Mon, 28 Sep 2020 08:52:58 +0200
Subject: [PATCH] Add new operations to cuMatDs and link them into the C
 interface.

- Matrix element-wise multiplication and division (elt_wise_mul/div).
- Matrix initialization of all coefficients to a constant (set_val).
- Matrix sum of all coefficients (sum).
- Matrix mean relative error (mean_relerr) against a reference matrix.
- Corresponding kernel specializations for the CUDA complex types (float2/double2).
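
A minimal sketch of the intended C-interface usage, assuming the
@GM_SCALAR@ instantiation for double and two hypothetical pre-existing
gm_DenseMat_t handles a and b of matching dimensions:

    double v = 2.0, s, mre;
    gm_DenseMat_setval_double(a, &v);           // a(i,j) = 2.0 for all i,j
    gm_DenseMat_elt_wise_mul_double(a, b);      // a = a .* b (Hadamard product)
    gm_DenseMat_elt_wise_div_double(a, b);      // a = a ./ b (entry-wise)
    gm_DenseMat_sum_double(a, &s);              // s = sum of all coefficients
    gm_DenseMat_mean_relerr_double(a, b, &mre); // mre = mean of |(b - a) ./ b|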
---
 src/cuMatDs.cpp.in       |  51 ++++++++++++++++++
 src/cuMatDs.h            |   6 ++-
 src/cuMatDs.hpp          |  11 +++-
 src/gm.cpp.in            |  25 +++++++++
 src/gm_interf_gen.h.in   |  18 +++++--
 src/gm_interf_gen.hpp.in |   5 ++
 src/kernel_def.hu        |  21 ++++++++
 src/kernels.cu           | 108 ++++++++++++++++++++++++++++++++++++++-
 src/kernels.h            |   5 ++
 9 files changed, 242 insertions(+), 8 deletions(-)

diff --git a/src/cuMatDs.cpp.in b/src/cuMatDs.cpp.in
index 79f1d6d..69d0d70 100644
--- a/src/cuMatDs.cpp.in
+++ b/src/cuMatDs.cpp.in
@@ -26,3 +26,54 @@ void cuMatDs<@GM_SCALAR@>::abs()
 #endif
 	switch_back();
 }
+
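+// in-place entry-wise division: this(i,j) /= M(i,j)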
+template<>
+void cuMatDs<@GM_SCALAR@>::elt_wise_div(const cuMatDs<@GM_SCALAR@>& M)
+{
+	if(nrows != M.nrows || ncols != M.ncols)
+		throw std::runtime_error("Dimensions must agree.");
+	auto switch_back = switch_dev(this->dev_id);
+#if(IS_COMPLEX==1)
+	kernel_div_cplx(data, M.data, numel());
+#else
+	kernel_div(data, M.data, numel());
+#endif
+	switch_back();
+}
+
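+// in-place entry-wise (Hadamard) multiplication: this(i,j) *= M(i,j)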
+template<>
+void cuMatDs<@GM_SCALAR@>::elt_wise_mul(const cuMatDs<@GM_SCALAR@>& M)
+{
+	if(nrows != M.nrows || ncols != M.ncols)
+		throw std::runtime_error("Dimensions must agree.");
+	auto switch_back = switch_dev(this->dev_id);
+#if(IS_COMPLEX==1)
+	kernel_mult_cplx(data, M.data, numel());
+#else
+	kernel_mult(data, M.data, numel());
+#endif
+	switch_back();
+}
+
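+// sets every coefficient of the matrix to val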
+template<>
+void cuMatDs<@GM_SCALAR@>::set_val(const @GM_SCALAR@& val)
+{
+	auto switch_back = switch_dev(this->dev_id);
+#if(IS_COMPLEX==1)
+	kernel_memset_cplx(data, val, numel());
+#else
+	kernel_memset(data, val, numel());
+#endif
+	switch_back();
+}
+
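+// mean over all the coefficients of the entry-wise relative error |(ref - this)/ref|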
+template<>
+@GM_SCALAR@ cuMatDs<@GM_SCALAR@>::mean_relerr(const cuMatDs<@GM_SCALAR@>& ref)
+{
+	if(ref.nrows != nrows || ref.ncols != ncols)
+		throw std::runtime_error("Dimensions must agree.");
+	auto switch_back = switch_dev(this->dev_id);
+	cuMatDs<@GM_SCALAR@>* rel_err = cuMatDs<@GM_SCALAR@>::create(nrows, ncols);
+#if(IS_COMPLEX==1)
+	kernel_relative_error_cplx(rel_err->data, ref.data, data, numel());
+#else
+	kernel_relative_error(rel_err->data, ref.data, data, numel());
+#endif
+	auto s = rel_err->sum();
+	auto err = s/Real<@GM_SCALAR@>(numel());
+	delete rel_err;
+	switch_back();
+	return err;
+}
diff --git a/src/cuMatDs.h b/src/cuMatDs.h
index 093ac9a..ebde195 100644
--- a/src/cuMatDs.h
+++ b/src/cuMatDs.h
@@ -46,9 +46,10 @@ struct cuMatDs : cuMat<T>
 	cuMatDs<T>* mul(cuMatDs<T>&, cuMatDs<T>* output = nullptr, gm_Op op_this = OP_NOTRANSP, gm_Op op_other = OP_NOTRANSP);
 	void mul(cuMatDs<T>&, hMatDs<T>& output, gm_Op op_this = OP_NOTRANSP, gm_Op op_other = OP_NOTRANSP);
 	void mul(cuMatDs<T>&, T* output, gm_Op op_this = OP_NOTRANSP, gm_Op op_other = OP_NOTRANSP);
-
 	cuMatDs<T>* mul(cuMatSp<T>&, cuMatDs<T>* output = nullptr, gm_Op op_this = OP_NOTRANSP, gm_Op op_other = OP_NOTRANSP);
 	void mul(const T& scalar);
+	void elt_wise_mul(const cuMatDs<T>&);
+	void elt_wise_div(const cuMatDs<T>&);
 	void apply_op(const gm_Op op);
 	static cuMatDs<T>* apply_op(const cuMatDs<T>* in, gm_Op op, cuMatDs<T>* out = nullptr);
 	void transpose();
@@ -78,12 +79,15 @@ struct cuMatDs : cuMat<T>
 	void setOnes();
 	void set_zeros();
 	void set_eyes();
+	void set_val(const T& val);
 	cuMatDs<T>* clone(int32_t dev=-1);
 	void copy(cuMatDs<T>*) const;
 	void resize(const int32_t nrows, const int32_t ncols);
 	T trace();
 	T coeff_max();
 	T coeff_min();
+	T sum();
+	T mean_relerr(const cuMatDs<T>&);
 	void abs();
 	void destroy();
 	void set_stream(const void* stream);
diff --git a/src/cuMatDs.hpp b/src/cuMatDs.hpp
index 169d96b..43895f0 100644
--- a/src/cuMatDs.hpp
+++ b/src/cuMatDs.hpp
@@ -305,7 +305,6 @@ cuMatDs<T>* cuMatDs<T>::mul(cuMatSp<T>& other, cuMatDs<T>* output /* = nullptr*/
 	k = other.ncols;
 	set_one(&alpha);
 	bzero(&beta, sizeof(T));
-	//TODO: refactor to avoid calling cusparseTcsrmm2 for each different cases (set parameters)
 	if(op_this == op_other && op_this == OP_NOTRANSP)
 	{
 		n = this->nrows;
@@ -703,6 +702,16 @@ T cuMatDs<T>::coeff_min()
 	return min;
 }
 
+template<typename T>
+T cuMatDs<T>::sum()
+{
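+	// reduce all the coefficients on the matrix's device, then restore the previous device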
+	auto switch_back = switch_dev(this->dev_id);
+	auto len = numel();
+	auto s = faust_cu_sum(data, len);
+	switch_back();
+	return s;
+}
+
 template<typename T>
 T cuMatDs<T>::trace()
 {
diff --git a/src/gm.cpp.in b/src/gm.cpp.in
index 5fc628e..1ec9879 100644
--- a/src/gm.cpp.in
+++ b/src/gm.cpp.in
@@ -4,6 +4,26 @@ extern "C"
 {
 	//TODO: separate in multiple files (one at least per type gm_DenseMat_t, gm_SparseMat_t, gm_MatArray_t)
 
+	__DYNLIB_ATTR__ void gm_DenseMat_elt_wise_div_@GM_SCALAR@(gm_DenseMat_t a, gm_DenseMat_t b)
+	{
+		static_cast<cuMatDs<@GM_SCALAR@>*>(a)->elt_wise_div(*static_cast<cuMatDs<@GM_SCALAR@>*>(b));
+	}
+
+	__DYNLIB_ATTR__ void gm_DenseMat_setval_@GM_SCALAR@(gm_DenseMat_t a, @GM_SCALAR@* v)
+	{
+		static_cast<cuMatDs<@GM_SCALAR@>*>(a)->set_val(*v);
+	}
+
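+	// computes the mean relative error of a against the reference b and writes it into *mre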
+	__DYNLIB_ATTR__ void gm_DenseMat_mean_relerr_@GM_SCALAR@(gm_DenseMat_t a, gm_DenseMat_t b, @GM_SCALAR@* mre)
+	{
+		*mre = static_cast<cuMatDs<@GM_SCALAR@>*>(a)->mean_relerr(*static_cast<cuMatDs<@GM_SCALAR@>*>(b));
+	}
+
+	__DYNLIB_ATTR__ void gm_DenseMat_elt_wise_mul_@GM_SCALAR@(gm_DenseMat_t a, gm_DenseMat_t b)
+	{
+		static_cast<cuMatDs<@GM_SCALAR@>*>(a)->elt_wise_mul(*static_cast<cuMatDs<@GM_SCALAR@>*>(b));
+	}
+
 	__DYNLIB_ATTR__ void gm_DenseMat_setzeros_@GM_SCALAR@(gm_DenseMat_t src_mat)
 	{
 		static_cast<cuMatDs<@GM_SCALAR@>*>(src_mat)->set_zeros();
@@ -220,6 +240,11 @@ extern "C"
 		return static_cast<cuMatDs<@GM_SCALAR@>*>(mat)->norm_l1();
 	}
 
+	__DYNLIB_ATTR__ void gm_DenseMat_sum_@GM_SCALAR@(gm_DenseMat_t mat, @GM_SCALAR@* sum)
+	{
+		*sum = static_cast<cuMatDs<@GM_SCALAR@>*>(mat)->sum();
+	}
+
 	__DYNLIB_ATTR__ @GM_SCALAR@ gm_DenseMat_power_iteration_@GM_SCALAR@(gm_DenseMat_t mat, float threshold, int32_t max_iter)
 	{
 		return static_cast<cuMatDs<@GM_SCALAR@>*>(mat)->power_iteration(threshold, max_iter);
diff --git a/src/gm_interf_gen.h.in b/src/gm_interf_gen.h.in
index 7c500b3..dce59c0 100644
--- a/src/gm_interf_gen.h.in
+++ b/src/gm_interf_gen.h.in
@@ -90,13 +90,13 @@ typedef @GM_SCALAR_REAL@ (*gm_DenseMat_norm_l1_@GM_SCALAR@_ptr)(gm_DenseMat_t);
 typedef @GM_SCALAR_REAL@ (*gm_DenseMat_norm_frob_@GM_SCALAR@_ptr)(gm_DenseMat_t);
 /** Computes the spectral norm */
 typedef @GM_SCALAR_REAL@ (*gm_DenseMat_norm_spectral_@GM_SCALAR@_ptr)(gm_DenseMat_t, float threshold, int32_t max_iter);
-/** Multiplies the first argument GPU matrix to the second argument GPU matrix (both are dense).*/
+/** Multiplies the first argument GPU matrix by the second argument GPU matrix (both dense).*/
 typedef gm_DenseMat_t (*gm_DenseMat_mul_gpu_dsm_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_DenseMat_t);
-/** Multiplies the first argument GPU matrix to the second argument GPU matrix (both are dense) -- extended version (ops and output matrix).*/
+/** Multiplies the first argument GPU matrix by the second argument GPU matrix (both dense) -- extended version (ops and output matrix).*/
 typedef gm_DenseMat_t (*gm_DenseMat_mul_gpu_dsm_ext_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_DenseMat_t, gm_DenseMat_t output, gm_Op, gm_Op);
-/** Multiplies the first argument GPU matrix to the second argument GPU matrix (both are dense) and outputs the result in a CPU buffer.*/
+/** Multiplies the first argument GPU matrix by the second argument GPU matrix (both dense) and outputs the result in a CPU buffer.*/
 typedef gm_DenseMat_t (*gm_DenseMat_mul_gpu_dsm_tocpu_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_DenseMat_t, @GM_SCALAR@* out);
-/** Multiplies the first argument GPU matrix to the second argument GPU matrix (both are dense) and outputs the result in a CPU buffer -- extended version (gm_Op applied to matrices).*/
+/** Multiplies the first argument GPU matrix by the second argument GPU matrix (both dense) and outputs the result in a CPU buffer -- extended version (gm_Op applied to matrices).*/
 typedef gm_DenseMat_t (*gm_DenseMat_mul_gpu_dsm_tocpu_ext_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_DenseMat_t, @GM_SCALAR@* out, gm_Op, gm_Op);
 /** Multiplies the first argument GPU dense matrix to the second argument GPU sparse matrix. */
 typedef gm_DenseMat_t (*gm_DenseMat_mul_gpu_spm_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_SparseMat_t);
@@ -126,6 +126,11 @@ typedef void (*gm_DenseMat_mv_to_gpu_@GM_SCALAR@_ptr)(gm_DenseMat_t, int32_t dev
 typedef void (*gm_DenseMat_gemm_@GM_SCALAR@_ptr)(const gm_DenseMat_t, const gm_DenseMat_t, gm_DenseMat_t, const @GM_SCALAR@ *alpha, const @GM_SCALAR@ *beta, const gm_Op, const gm_Op);
 typedef int32_t (*gm_DenseMat_get_dev_@GM_SCALAR@_ptr)(gm_DenseMat_t);
 typedef size_t (*gm_DenseMat_get_nnz_@GM_SCALAR@_ptr)(gm_DenseMat_t);
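+/** Computes the sum of all the coefficients of the matrix and writes it into *sum. */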
+typedef void (*gm_DenseMat_sum_@GM_SCALAR@_ptr)(gm_DenseMat_t, @GM_SCALAR@* sum);
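+/** In-place element-wise (Hadamard) multiplication of the first matrix by the second. */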
+typedef void (*gm_DenseMat_elt_wise_mul_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_DenseMat_t);
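+/** In-place element-wise division of the first matrix by the second. */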
+typedef void (*gm_DenseMat_elt_wise_div_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_DenseMat_t);
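+/** Sets all the coefficients of the matrix to the value pointed to by v. */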
+typedef void (*gm_DenseMat_setval_@GM_SCALAR@_ptr)(gm_DenseMat_t, @GM_SCALAR@* v);
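+/** Computes the mean relative error of the first matrix against the second (reference) matrix; the result is written into *mre. */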
+typedef void (*gm_DenseMat_mean_relerr_@GM_SCALAR@_ptr)(gm_DenseMat_t, gm_DenseMat_t, @GM_SCALAR@* mre);
 
 struct gm_DenseMatFunc_@GM_SCALAR@
 {
@@ -168,6 +173,7 @@ struct gm_DenseMatFunc_@GM_SCALAR@
 	gm_DenseMat_setones_@GM_SCALAR@_ptr setones;
 	gm_DenseMat_setzeros_@GM_SCALAR@_ptr setzeros;
 	gm_DenseMat_seteyes_@GM_SCALAR@_ptr seteyes;
+	gm_DenseMat_setval_@GM_SCALAR@_ptr setval;
 	gm_DenseMat_transpose_@GM_SCALAR@_ptr transpose;
 	gm_DenseMat_conjugate_@GM_SCALAR@_ptr conjugate;
 	gm_DenseMat_abs_@GM_SCALAR@_ptr abs;
@@ -176,6 +182,10 @@ struct gm_DenseMatFunc_@GM_SCALAR@
 	gm_DenseMat_gemm_@GM_SCALAR@_ptr gemm;
 	gm_DenseMat_get_dev_@GM_SCALAR@_ptr get_dev;
 	gm_DenseMat_get_nnz_@GM_SCALAR@_ptr get_nnz;
+	gm_DenseMat_sum_@GM_SCALAR@_ptr sum;
+	gm_DenseMat_elt_wise_mul_@GM_SCALAR@_ptr elt_wise_mul;
+	gm_DenseMat_elt_wise_div_@GM_SCALAR@_ptr elt_wise_div;
+	gm_DenseMat_mean_relerr_@GM_SCALAR@_ptr mean_relerr;
 };
 
 /** MatArray function pointers ********************************************************************/
diff --git a/src/gm_interf_gen.hpp.in b/src/gm_interf_gen.hpp.in
index 1c76aff..5895680 100644
--- a/src/gm_interf_gen.hpp.in
+++ b/src/gm_interf_gen.hpp.in
@@ -78,6 +78,11 @@ void load_dsm_funcs_@GM_SCALAR@(void* gm_handle, gm_DenseMatFunc_@GM_SCALAR@ *ds
 	 dsm_funcs->gemm = (gm_DenseMat_gemm_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle,"gm_DenseMat_gemm_@GM_SCALAR@");
 	 dsm_funcs->get_dev = (gm_DenseMat_get_dev_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle, "gm_DenseMat_get_dev_@GM_SCALAR@");
 	 dsm_funcs->get_nnz = (gm_DenseMat_get_nnz_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle, "gm_DenseMat_get_nnz_@GM_SCALAR@");
+	 dsm_funcs->sum = (gm_DenseMat_sum_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle, "gm_DenseMat_sum_@GM_SCALAR@");
+	 dsm_funcs->elt_wise_mul = (gm_DenseMat_elt_wise_mul_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle, "gm_DenseMat_elt_wise_mul_@GM_SCALAR@");
+	 dsm_funcs->elt_wise_div = (gm_DenseMat_elt_wise_div_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle, "gm_DenseMat_elt_wise_div_@GM_SCALAR@");
+	 dsm_funcs->setval = (gm_DenseMat_setval_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle, "gm_DenseMat_setval_@GM_SCALAR@");
+	 dsm_funcs->mean_relerr = (gm_DenseMat_mean_relerr_@GM_SCALAR@_ptr) gm_load_func_check_err(gm_handle, "gm_DenseMat_mean_relerr_@GM_SCALAR@");
 }
 
 void load_marr_funcs_@GM_SCALAR@(void* gm_handle, gm_MatArrayFunc_@GM_SCALAR@ *marr_funcs)
diff --git a/src/kernel_def.hu b/src/kernel_def.hu
index 8521d17..ab4e90c 100644
--- a/src/kernel_def.hu
+++ b/src/kernel_def.hu
@@ -43,7 +43,9 @@
 template void kernel_add<float>(float* d_cu1, const float* d_cu2, int length);
 template void kernel_sub<float>(float* d_cu1, const float* d_cu2, int length);
 template void kernel_mult<float>(float* d_cu1, const float* d_cu2, int length);
+template void kernel_mult_cplx<float2>(float2* d_cu1, const float2* d_cu2, int length);
 template void kernel_div<float>(float* d_cu1, const float* d_cu2, int length);
+template void kernel_div_cplx<float2>(float2* d_cu1, const float2* d_cu2, int length);
 template void kernel_add_const<float>(float* d_cu1, float valeur, int length);
 template void kernel_add_const_cplx<float2>(float2* d_cu1, float2 valeur, int length);
 template void kernel_sub_const<float>(float* d_cu1, float valeur, int length);
@@ -58,6 +60,7 @@ template void kernel_abs<float>(float* d_cu1, const int length);
 template void kernel_abs_cplx<float2>(float2* d_cu1, const int length);
 template void kernel_memcpy<float,float>(float* d_cu_dst, const float* d_cu_src, int length);
 template void kernel_memset<float>(float* d_cu_dst, float valeur, int length);
+template void kernel_memset_cplx<float2>(float2* d_cu_dst, float2 valeur, int length);
 template void kernel_sparse2full<float>(float* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const float* dev_src_values, const int nnz, const int src_dim1, const int src_dim2);
 template void kernel_add_sparse2full<float>(float* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const float* dev_src_values, const int nnz, const int src_dim1);
 template void kernel_sub_sparse2full<float>(float* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const float* dev_src_values, const int nnz, const int src_dim1);
@@ -67,11 +70,14 @@ template void kernel_add_diag_const<float>(float* d_cu1, float val, int dim1);
 template void kernel_copy_diag<float>(float* d_cu_dst, float* d_cu_src, int dim1);
 template void kernel_set_submatrix<float>(float* mat_dst, float* mat_src, int src_dim1, int r1, int c1, int nb_rows, int nb_col);
 template void kernel_relative_error<float>(float* data_dst, const float* data_src_th, const float* data_src_mes, const int length);
+template void kernel_relative_error_cplx<float2>(float2* data_dst, const float2* data_src_th, const float2* data_src_mes, const int length);
 
 template void kernel_add<double>(double* d_cu1, const double* d_cu2, int length);
 template void kernel_sub<double>(double* d_cu1, const double* d_cu2, int length);
 template void kernel_mult<double>(double* d_cu1, const double* d_cu2, int length);
+template void kernel_mult_cplx<double2>(double2* d_cu1, const double2* d_cu2, int length);
 template void kernel_div<double>(double* d_cu1, const double* d_cu2, int length);
+template void kernel_div_cplx<double2>(double2* d_cu1, const double2* d_cu2, int length);
 template void kernel_add_const<double>(double* d_cu1, double valeur, int length);
 template void kernel_add_const_cplx<double2>(double2* d_cu1, double2 valeur, int length);
 template void kernel_sub_const<double>(double* d_cu1, double valeur, int length);
@@ -86,6 +92,7 @@ template void kernel_abs<double>(double* d_cu1, const int length);
 template void kernel_abs_cplx<double2>(double2* d_cu1, const int length);
 template void kernel_memcpy<double,double>(double* d_cu_dst, const double* d_cu_src, int length);
 template void kernel_memset<double>(double* d_cu_dst, double valeur, int length);
+template void kernel_memset_cplx<double2>(double2* d_cu_dst, double2 valeur, int length);
 template void kernel_sparse2full<double>(double* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const double* dev_src_values, const int nnz, const int src_dim1, const int src_dim2);
 template void kernel_add_sparse2full<double>(double* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const double* dev_src_values, const int nnz, const int src_dim1);
 template void kernel_sub_sparse2full<double>(double* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const double* dev_src_values, const int nnz, const int src_dim1);
@@ -95,6 +102,7 @@ template void kernel_add_diag_const<double>(double* d_cu1, double val, int dim1)
 template void kernel_copy_diag<double>(double* d_cu_dst, double* d_cu_src, int dim1);
 template void kernel_set_submatrix<double>(double* mat_dst, double* mat_src, int src_dim1, int r1, int c1, int nb_rows, int nb_col);
 template void kernel_relative_error<double>(double* data_dst, const double* data_src_th, const double* data_src_mes, const int length);
+template void kernel_relative_error_cplx<double2>(double2* data_dst, const double2* data_src_th, const double2* data_src_mes, const int length);
 
 template void kernel_add<int>(int* d_cu1, const int* d_cu2, int length);
 template void kernel_sub<int>(int* d_cu1, const int* d_cu2, int length);
@@ -115,6 +123,8 @@ template void kernel_set_submatrix<int>(int* mat_dst, int* mat_src, int src_dim1
 template void kernel_memcpy<double,float>(double* d_cu_dst, const float* d_cu_src, int length);
 template void kernel_memcpy<float,double>(float* d_cu_dst, const double* d_cu_src, int length);
 
+template void kernel_add_cplx<float2>(float2* d_cu1, const float2* d_cu2, int length);
+template void kernel_add_cplx<double2>(double2* d_cu1, const double2* d_cu2, int length);
 
 
 
@@ -133,11 +143,13 @@ template __global__ void Inv_inria<float>(float* A, int numElements);
 template __global__ void Abs_inria<float>(float* A, int numElements);
 template __global__ void Memcpy_inria<float,float>(float* d_cu_dst, const float* d_cu_src, int length);
 template __global__ void Memset_inria<float>(float* dev_dst, float valeur, int numElements);
+template __global__ void Memset_inria_cplx<float2>(float2* dev_dst, float2 valeur, int numElements);
 template __global__ void Sparse2full_inria<float>(float* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const float* dev_src_values, const int nnz, const int src_dim1);
 template __global__ void AddDiagConst_inria<float>(float* dev_dst, float val, int dim1);
 template __global__ void GetDiag_inria<float>(float* dst_diag, const float* src_M,  int dim1, int dlen);
 template __global__ void SetSubmatrix_inria<float>(float* mat_dst, float* mat_src, int src_dim1, int r1, int c1, int nb_rows, int nb_elements);
 template __global__ void RelativeError_inria<float>(float* data_dst, const float* data_src_th, const float* data_src_mes, const int length);
+template __global__ void RelativeError_inria_cplx<float2>(float2* data_dst, const float2* data_src_th, const float2* data_src_mes, const int length);
 
 template __global__ void Add_inria<double>(double* A, const double *B, int numElements);
 template __global__ void Sub_inria<double>(double* A, const double *B, int numElements);
@@ -152,11 +164,13 @@ template __global__ void Inv_inria<double>(double* A, int numElements);
 template __global__ void Abs_inria<double>(double* A, int numElements);
 template __global__ void Memcpy_inria<double,double>(double* d_cu_dst, const double* d_cu_src, int length);
 template __global__ void Memset_inria<double>(double* dev_dst, double valeur, int numElements);
+template __global__ void Memset_inria_cplx<double2>(double2* dev_dst, double2 valeur, int numElements);
 template __global__ void Sparse2full_inria<double>(double* dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const double* dev_src_values, const int nnz, const int src_dim1);
 template __global__ void AddDiagConst_inria<double>(double* dev_dst, double val, int dim1);
 template __global__ void GetDiag_inria<double>(double* dst_diag, const double* src_M,  int dim1, int dlen);
 template __global__ void SetSubmatrix_inria<double>(double* mat_dst, double* mat_src, int src_dim1, int r1, int c1, int nb_rows, int nb_elements);
 template __global__ void RelativeError_inria<double>(double* data_dst, const double* data_src_th, const double* data_src_mes, const int length);
+template __global__ void RelativeError_inria_cplx<double2>(double2* data_dst, const double2* data_src_th, const double2* data_src_mes, const int length);
 
 template __global__ void Add_inria<int>(int* A, const int *B, int numElements);
 template __global__ void Sub_inria<int>(int* A, const int *B, int numElements);
@@ -184,3 +198,10 @@ template __global__ void finish_min_max_reduce<double2>(double2 *g_idata, double
 
 template __global__ void Abs_inria_cplx<double2>(double2* A, int numElements);
 template __global__ void Abs_inria_cplx<float2>(float2* A, int numElements);
+
+template __global__ void Add_inria_cplx<float2>(float2* A, const float2 *B, int numElements);
+template __global__ void Add_inria_cplx<double2>(double2* A, const double2 *B, int numElements);
+template __global__ void Mult_inria_cplx<float2>(float2* A, const float2 *B, int numElements);
+template __global__ void Mult_inria_cplx<double2>(double2* A, const double2 *B, int numElements);
+template __global__ void Div_inria_cplx<float2>(float2* A, const float2 *B, int numElements);
+template __global__ void Div_inria_cplx<double2>(double2* A, const double2 *B, int numElements);
diff --git a/src/kernels.cu b/src/kernels.cu
index 2d66254..56de63e 100644
--- a/src/kernels.cu
+++ b/src/kernels.cu
@@ -43,9 +43,12 @@
 
 //prototypes
 template<typename T> __global__ void Add_inria(T *A, const T *B, int numElements);
+template<typename T> __global__ void Add_inria_cplx(T *A, const T *B, int numElements);
 template<typename T> __global__ void Sub_inria(T *A, const T *B, int numElements);
 template<typename T> __global__ void Mult_inria(T *A, const T *B, int numElements);
+template<typename T> __global__ void Mult_inria_cplx(T *A, const T *B, int numElements);
 template<typename T> __global__ void Div_inria(T *A, const T *B, int numElements);
+template<typename T> __global__ void Div_inria_cplx(T *A, const T *B, int numElements);
 
 template<typename T> __global__ void AddConst_inria(T *A, T val, int numElements);
 template<typename T> __global__ void AddConst_inria_cplx(T *A, T val, int numElements);
@@ -65,6 +68,7 @@ template<typename T, typename U> __global__ void Memcpy_inria(T* d_cu_dst, const
 
 
 template<typename T> __global__ void Memset_inria(T* dev_dst, T valeur, int numElements);
+template<typename T> __global__ void Memset_inria_cplx(T* dev_dst, T valeur, int numElements);
 
 template<typename T> __global__ void Sparse2full_inria(T *dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const T* dev_src_values, const int nnz, const int src_dim1);
 template<typename T> __global__ void AddSparse2full_inria(T *dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const T* dev_src_values, const int nnz, const int src_dim1);
@@ -77,6 +81,7 @@ template<typename T> __global__ void SetSubmatrix_inria(T* mat_dst, T* mat_src,
 
 
 template<typename T> __global__ void RelativeError_inria(T* data_dst, const T* data_src_th, const T* data_src_mes, const int length);
+template<typename T> __global__ void RelativeError_inria_cplx(T* data_dst, const T* data_src_th, const T* data_src_mes, const int length);
 
 template <typename T> __global__ void sum_reduce(T *g_idata, T *g_odata, unsigned int n);
 
@@ -92,6 +97,15 @@ template<typename T> void kernel_add(T* d_cu1, const T* d_cu2, int length)
 	Add_inria<T><<<blocksPerGrid,threadsPerBlock>>>(d_cu1, d_cu2, length);
    faust_kernelSafe();
 }
+
+template<typename T> void kernel_add_cplx(T* d_cu1, const T* d_cu2, int length)
+{
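+	// grid-stride launch: 256 threads per block and enough blocks to cover all 'length' entries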
+	int threadsPerBlock = 256;
+	int blocksPerGrid = (length + threadsPerBlock - 1) / threadsPerBlock;
+	Add_inria_cplx<T><<<blocksPerGrid,threadsPerBlock>>>(d_cu1, d_cu2, length);
+   faust_kernelSafe();
+}
+
 template<typename T> void kernel_sub(T* d_cu1, const T* d_cu2, int length)
 {
 	int threadsPerBlock = 256;
@@ -106,6 +120,13 @@ template<typename T> void kernel_mult(T* d_cu1, const T* d_cu2, int length)
 	Mult_inria<T><<<blocksPerGrid,threadsPerBlock>>>(d_cu1, d_cu2, length);
    faust_kernelSafe();
 }
+template<typename T> void kernel_mult_cplx(T* d_cu1, const T* d_cu2, int length)
+{
+	int threadsPerBlock = 256;
+	int blocksPerGrid = (length + threadsPerBlock - 1) / threadsPerBlock;
+	Mult_inria_cplx<T><<<blocksPerGrid,threadsPerBlock>>>(d_cu1, d_cu2, length);
+   faust_kernelSafe();
+}
 template<typename T>  void kernel_div(T* d_cu1, const T* d_cu2, int length)
 {
 	int threadsPerBlock = 256;
@@ -113,7 +134,13 @@ template<typename T>  void kernel_div(T* d_cu1, const T* d_cu2, int length)
 	Div_inria<T><<<blocksPerGrid,threadsPerBlock>>>(d_cu1, d_cu2, length);
    faust_kernelSafe();
 }
-
+template<typename T>  void kernel_div_cplx(T* d_cu1, const T* d_cu2, int length)
+{
+	int threadsPerBlock = 256;
+	int blocksPerGrid = (length + threadsPerBlock - 1) / threadsPerBlock;
+	Div_inria_cplx<T><<<blocksPerGrid,threadsPerBlock>>>(d_cu1, d_cu2, length);
+   faust_kernelSafe();
+}
 template<typename T> void kernel_add_const(T* d_cu1, T valeur, int length)
 {
 	int threadsPerBlock = 256;
@@ -223,6 +250,14 @@ template<typename T> void kernel_memset(T* d_cu_dst, T valeur, int length)
    faust_kernelSafe();
 }
 
+template<typename T> void kernel_memset_cplx(T* d_cu_dst, T valeur, int length)
+{
+	int threadsPerBlock = 256;
+	int blocksPerGrid = (length + threadsPerBlock - 1) / threadsPerBlock;
+	Memset_inria_cplx<T><<<blocksPerGrid,threadsPerBlock>>>(d_cu_dst, valeur, length);
+   faust_kernelSafe();
+}
+
 template<typename T> void kernel_sparse2full(T *dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const T* dev_src_values, const int nnz, const int src_dim1, const int src_dim2)
 {
 	int threadsPerBlock = 256;
@@ -284,6 +319,14 @@ template<typename T> void kernel_relative_error(T* data_dst, const T* data_src_t
    faust_kernelSafe();
 }
 
+template<typename T> void kernel_relative_error_cplx(T* data_dst, const T* data_src_th, const T* data_src_mes, const int nb_elements)
+{
+	int threadsPerBlock = 256;
+	int blocksPerGrid = (nb_elements + threadsPerBlock - 1) / threadsPerBlock;
+	RelativeError_inria_cplx<T><<<blocksPerGrid,threadsPerBlock>>>(data_dst, data_src_th, data_src_mes, nb_elements);
+   faust_kernelSafe();
+}
+
 template <typename T> void kernel_sum_reduce(T *g_idata, T *g_odata, unsigned int n)
 {
 	int threadsPerBlock = 256;
@@ -324,6 +367,19 @@ template<typename T> __global__ void Add_inria(T* A, const T* B, int numElements
 	i += gridDim.x * blockDim.x;
     }
 }
+
+template<typename T> __global__ void Add_inria_cplx(T* A, const T* B, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while (i < numElements)
+	{
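+		// complex addition is component-wise: real and imaginary parts add independently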
+		A[i].x = A[i].x + B[i].x;
+		A[i].y = A[i].y + B[i].y;
+		i += gridDim.x * blockDim.x;
+	}
+}
+
 template<typename T> __global__ void Sub_inria(T* A, const T* B, int numElements)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -341,9 +397,22 @@ template<typename T> __global__ void Mult_inria(T *A, const T *B, int numElement
     while (i < numElements)
     {
         A[i] = A[i] * B[i];
-	i += gridDim.x * blockDim.x;
+		i += gridDim.x * blockDim.x;
     }
 }
+
+template<typename T> __global__ void Mult_inria_cplx(T *A, const T *B, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while (i < numElements)
+	{
+		// complex product of the i-th entries: (a+bi)(c+di) = (ac-bd) + (ad+bc)i
+		const auto re = A[i].x * B[i].x - A[i].y * B[i].y;
+		const auto im = A[i].x * B[i].y + A[i].y * B[i].x;
+		A[i].x = re;
+		A[i].y = im;
+		i += gridDim.x * blockDim.x;
+	}
+}
+
 template<typename T> __global__ void Div_inria(T *A, const T *B, int numElements)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -355,6 +424,18 @@ template<typename T> __global__ void Div_inria(T *A, const T *B, int numElements
     }
 }
 
+template<typename T> __global__ void Div_inria_cplx(T *A, const T *B, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while (i < numElements)
+    {
+        // complex quotient of the i-th entries: (a+bi)/(c+di) = ((ac+bd) + (bc-ad)i) / (c*c+d*d)
+        const auto den = B[i].x * B[i].x + B[i].y * B[i].y;
+        const auto re = (A[i].x * B[i].x + A[i].y * B[i].y) / den;
+        const auto im = (A[i].y * B[i].x - A[i].x * B[i].y) / den;
+        A[i].x = re;
+        A[i].y = im;
+        i += gridDim.x * blockDim.x;
+    }
+}
+
 template<typename T> __global__ void AddConst_inria(T *A, T val, int numElements)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -508,6 +589,18 @@ template<typename T> __global__ void Memset_inria(T* dev_dst, T valeur, int numE
     }
 }
 
+template<typename T> __global__ void Memset_inria_cplx(T* dev_dst, T valeur, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    while (i < numElements)
+	{
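+		// copy both components of the constant into every entry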
+		dev_dst[i].x = valeur.x;
+		dev_dst[i].y = valeur.y;
+		i += gridDim.x * blockDim.x;
+	}
+}
+
 template<typename T> __global__ void Sparse2full_inria(T *dev_dst, const int *dev_src_rowind, const int *dev_src_colind, const T* dev_src_values, const int nnz, const int src_dim1)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -603,6 +696,17 @@ template<typename T> __global__ void RelativeError_inria(T* data_dst, const T* d
 }
 
 
+template<typename T> __global__ void RelativeError_inria_cplx(T* data_dst, const T* data_src_th, const T* data_src_mes, const int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    while(i<numElements)
+	{
+		// relative error of the i-th complex entry: |th - mes| / |th| (complex modulus),
+		// stored in the real part; the imaginary part is zeroed so sums stay real
+		const auto d_re = data_src_th[i].x - data_src_mes[i].x;
+		const auto d_im = data_src_th[i].y - data_src_mes[i].y;
+		const auto mod_th = sqrt(data_src_th[i].x * data_src_th[i].x + data_src_th[i].y * data_src_th[i].y);
+		data_dst[i].x = sqrt(d_re * d_re + d_im * d_im) / mod_th;
+		data_dst[i].y = 0;
+		i += gridDim.x * blockDim.x;
+	}
+}
+
 /*template<typename T> __global__ void SetLinfAR1(T* mat_Linf, T* mat_Hinf, T* atur, int nb_n, int nb_elements)
 {
     int ind = blockDim.x*blockIdx.x+threadIdx.x;
diff --git a/src/kernels.h b/src/kernels.h
index e27108d..702a085 100644
--- a/src/kernels.h
+++ b/src/kernels.h
@@ -49,12 +49,15 @@
 
 // for k=0 to (length-1), d_cu1[k] += d_cu2[k]
 template<typename T> void kernel_add(T* d_cu1, const T* d_cu2, int length);
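+// complex variant: for k=0 to (length-1), d_cu1[k] += d_cu2[k] (component-wise complex addition)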
+template<typename T> void kernel_add_cplx(T* d_cu1, const T* d_cu2, int length);
 // for k=0 to (length-1), d_cu1[k] -= d_cu2[k]
 template<typename T> void kernel_sub(T* d_cu1, const T* d_cu2, int length);
 // for k=0 to (length-1), d_cu1[k] *= d_cu2[k]
 template<typename T> void kernel_mult(T* d_cu1, const T* d_cu2, int length);
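+// complex variant: for k=0 to (length-1), d_cu1[k] *= d_cu2[k] (complex product)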
+template<typename T> void kernel_mult_cplx(T* d_cu1, const T* d_cu2, int length);
 // for k=0 to (length-1), d_cu1[k] /= d_cu2[k]
 template<typename T> void kernel_div(T* d_cu1, const T* d_cu2, int length);
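+// complex variant: for k=0 to (length-1), d_cu1[k] /= d_cu2[k] (complex quotient)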
+template<typename T> void kernel_div_cplx(T* d_cu1, const T* d_cu2, int length);
 
 // for k=0 to (length-1), d_cu1[k] += valeur
 template<typename T> void kernel_add_const(T* d_cu1, T valeur, int length);
@@ -83,6 +86,7 @@ template<typename T> void kernel_abs_cplx(T* d_cu1, const int length);
 template<typename T, typename U> void kernel_memcpy(T* d_cu_dst, const U* d_cu_src, int length);
 // for k=0 to (length-1), d_cu_dst[k] = valeur
 template<typename T> void kernel_memset(T* d_cu_dst, T valeur, int length);
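+// complex variant: for k=0 to (length-1), d_cu_dst[k] = valeur (both components)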
+template<typename T> void kernel_memset_cplx(T* d_cu_dst, T valeur, int length);
 
 // convert sparse matrix (nnz, dev_src_rowind, dev_src_colind, dev_src_values) in 1-based indexing to full matrix dev_dst
 // dev_dst=0; for k=0 to (nnz-1), dev_dst[(dev_src_colind[k]-1)*src_dim1+(dev_src_rowind[k]-1)] = dev_src_values[k]
@@ -107,6 +111,7 @@ template<typename T> void kernel_set_submatrix(T* mat_dst, T* mat_src, int src_d
 
 // for k=0 to (length-1 )data_dst[i] = abs((data_src_th[i]-data_src_mes[i])/data_src_th[i]);
 template<typename T> void kernel_relative_error(T* data_dst, const T* data_src_th, const T* data_src_mes, const int length);
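+// complex variant: data_dst[i].x = |(data_src_th[i]-data_src_mes[i])/data_src_th[i]| (complex modulus), data_dst[i].y = 0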
+template<typename T> void kernel_relative_error_cplx(T* data_dst, const T* data_src_th, const T* data_src_mes, const int length);
 
 
 
-- 
GitLab