Mentions légales du service

Skip to content
Snippets Groups Projects
Commit cf8679a7 authored by hhakim's avatar hhakim
Browse files

Implement MatDense<GPU2,FPP>::multiply(MatDense<GPU2,FPP>).

parent be554a03
Branches
Tags
No related merge requests found
......@@ -20,6 +20,7 @@ namespace Faust
static RefManager ref_man;
static void *gm_handle;
static int *gm_users;
public:
static void* marr_funcs; //void because we don't know FPP yet and templates aren't available through shared lib interface (extern C, no name mangling)
static void* dsm_funcs;
#ifdef _MSC_VER
......@@ -85,6 +86,7 @@ namespace Faust
static bool are_cpu_mat_all_known(const std::vector<MatGeneric<FPP,Cpu>*> mats);
static void* init_gpu_mod(const std::string& libpath = "libgm.so", const bool silent = false, void* gm_handle = nullptr);
static void load_gm_functions();
static void check_gpu_mod_loaded();
};
......
......@@ -29,7 +29,7 @@ void* Faust::FaustGPU<FPP>::init_gpu_mod(const std::string& libpath, const bool
{
if(Faust::FaustGPU<FPP>::gm_handle == nullptr)
if(gm_handle == nullptr)
Faust::FaustGPU<FPP>::gm_handle = gm_load_lib(libpath.c_str(), silent);
Faust::FaustGPU<FPP>::gm_handle = gm_load_lib(libpath.c_str());
else
Faust::FaustGPU<FPP>::gm_handle = gm_handle;
else
......
#include "faust_TransformHelper.h"
namespace Faust
{
template <>
void FaustGPU<complex<double>>::load_gm_functions()
{
// Lazily loads the gpu_mod function tables (matrix-array, dense-matrix and
// general-purpose) from the already-opened shared library handle gm_handle,
// and caches them in the class statics. No-op when already loaded
// (marr_funcs != nullptr is the "loaded" sentinel for all three tables).
gm_DenseMatFunc_cuDoubleComplex* dsm_funcs;
gm_MatArrayFunc_cuDoubleComplex* marr_funcs;
gm_GenPurposeFunc_cuDoubleComplex* gp_funcs;
if(FaustGPU<complex<double>>::marr_funcs == nullptr)
{
marr_funcs = new gm_MatArrayFunc_cuDoubleComplex(); // on the heap because it cannot be shared among FaustGPU instances if on the stack
dsm_funcs = new gm_DenseMatFunc_cuDoubleComplex();
gp_funcs = new gm_GenPurposeFunc_cuDoubleComplex();
load_marr_funcs_cuDoubleComplex(gm_handle, marr_funcs);
load_dsm_funcs_cuDoubleComplex(gm_handle, dsm_funcs);
load_gp_funcs_cuDoubleComplex(gm_handle, gp_funcs);
// publish into the statics (typed void* because templates can't cross the extern "C" lib interface)
FaustGPU<complex<double>>::marr_funcs = marr_funcs;
FaustGPU<complex<double>>::dsm_funcs = dsm_funcs;
FaustGPU<complex<double>>::gp_funcs = gp_funcs;
}
}
template <>
MatDense<complex<double>,Cpu> FaustGPU<complex<double>>::get_product(const bool transpose /* = false */, const bool conjugate /* = false */)
{
// Multiplies the whole GPU factor chain together and copies the product back
// into a cpu dense matrix. Note: conjugate is only honored together with
// transpose (OP_CONJTRANSP); conjugate alone keeps OP_NOTRANSP (the else
// binds to the inner if).
gm_Op op = OP_NOTRANSP;
if(transpose)
if(conjugate)
op = OP_CONJTRANSP;
else
op = OP_TRANSP;
gm_DenseMatFunc_cuDoubleComplex* dsm_funcs = (gm_DenseMatFunc_cuDoubleComplex*) this->dsm_funcs;
gm_MatArrayFunc_cuDoubleComplex* marr_funcs = (gm_MatArrayFunc_cuDoubleComplex*) this->marr_funcs;
cuDoubleComplex one;
set_one<cuDoubleComplex>(&one);
// chain_matmul returns a GPU-side dense matrix; query its dims to size the cpu copy
auto gpu_prod_mat_dense = marr_funcs->chain_matmul(gpu_mat_arr, one, op);
int32_t nrows, ncols;
dsm_funcs->info(gpu_prod_mat_dense, &nrows, &ncols);
MatDense<complex<double>, Cpu> gpu2cpu_mat(nrows, ncols);
// std::complex<double> and cuDoubleComplex are punned through double* (same layout assumed)
dsm_funcs->tocpu(gpu_prod_mat_dense, (cuDoubleComplex*) reinterpret_cast<double*>(gpu2cpu_mat.getData()));
dsm_funcs->free(gpu_prod_mat_dense); // release the temporary GPU product
return gpu2cpu_mat;
}
template <>
Vect<complex<double>, Cpu> FaustGPU<complex<double>>::multiply(const Vect<complex<double>,Cpu>& v, const bool transpose, const bool conjugate)
{
// Computes op(F) * v where F is the GPU factor chain and op is selected by
// transpose/conjugate. The result is copied back into a cpu vector.
std::cout << "FaustGPU::multiply(Vect)" << std::endl;
cuDoubleComplex one;
set_one<cuDoubleComplex>(&one);
// op(F) has ncols rows when transposed, nrows otherwise
int32_t out_size = this->ncols; // default is transpose here
gm_Op op;
if(transpose && conjugate)
op = OP_CONJTRANSP;
else if(transpose)
op = OP_TRANSP;
else
{
op = OP_NOTRANSP;
out_size = this->nrows;
}
Vect<complex<double>, Cpu> out_vec(out_size);
gm_MatArrayFunc_cuDoubleComplex* marr_funcs = (gm_MatArrayFunc_cuDoubleComplex*) this->marr_funcs;
// v is treated as a size x 1 dense matrix; const_cast is needed by the C API
// (the data is not modified); complex<double> is punned to cuDoubleComplex
marr_funcs->chain_matmul_by_cpu_dsm_tocpu(gpu_mat_arr, one, op, (cuDoubleComplex*) reinterpret_cast<double*>(const_cast<complex<double>*>(v.getData())), v.size(), 1, (cuDoubleComplex*) reinterpret_cast<double*>(out_vec.getData()));
return out_vec;
}
template <>
MatDense<complex<double>, Cpu> FaustGPU<complex<double>>::multiply(const MatGeneric<complex<double>,Cpu>* A, const bool transpose, const bool conjugate)
{
	// Computes op(F) * A where F is the GPU factor chain and op is selected by
	// transpose/conjugate (OP_NOTRANSP / OP_TRANSP / OP_CONJTRANSP).
	// Returns the product as a cpu dense matrix.
	// Fixes vs. original: removed the unreachable togpu_spm call after the
	// sparse-operand throw, removed the stray debug print, and an unknown
	// MatGeneric subtype now throws instead of silently returning a matrix
	// whose content was never computed.
	const MatDense<complex<double>, Cpu>* ds_mat;
	int32_t out_nrows;
	cuDoubleComplex one;
	set_one<cuDoubleComplex>(&one);
	gm_Op op;
	if(transpose && conjugate)
		op = OP_CONJTRANSP;
	else if(transpose)
		op = OP_TRANSP;
	else
		op = OP_NOTRANSP;
	// op(F) has ncols rows when transposed, nrows otherwise
	if(transpose)
		out_nrows = this->ncols;
	else
		out_nrows = this->nrows;
	MatDense<complex<double>, Cpu> out_mat(out_nrows, A->getNbCol());
	gm_MatArrayFunc_cuDoubleComplex* marr_funcs = (gm_MatArrayFunc_cuDoubleComplex*) this->marr_funcs;
	if(dynamic_cast<const MatSparse<complex<double>,Cpu>*>(A))
	{
		// TODO: sparse right-hand side (togpu_spm + chain matmul) not implemented yet
		throw std::runtime_error("FaustGPU::multiply() by MatSparse isn't yet impl.");
	}
	else if(ds_mat = dynamic_cast<const MatDense<complex<double>,Cpu>*>(A))
	{
		// const_cast required by the C API (data is not modified);
		// complex<double> is punned to cuDoubleComplex through double*
		marr_funcs->chain_matmul_by_cpu_dsm_tocpu(gpu_mat_arr, one, op, (cuDoubleComplex*) reinterpret_cast<double*>(const_cast<complex<double>*>(ds_mat->getData())), ds_mat->getNbRow(), ds_mat->getNbCol(), (cuDoubleComplex*) reinterpret_cast<double*>(out_mat.getData()));
	}
	else
		throw std::runtime_error("FaustGPU::multiply() supports only MatSparse and MatDense operands.");
	return out_mat;
}
template <>
void FaustGPU<complex<double>>::pop_front()
{
	// Removes the first factor from the GPU chain; no-op on an empty chain.
	if(cpu_mat_ptrs.empty())
		return;
	if(use_ref_man)
		ref_man.release(cpu_mat_ptrs.front());
	auto funcs = (gm_MatArrayFunc_cuDoubleComplex*) this->marr_funcs;
	// the GPU matrix itself is freed here only when the ref manager doesn't own it
	funcs->erase_at(gpu_mat_arr, 0, !use_ref_man);
	cpu_mat_ptrs.erase(cpu_mat_ptrs.begin());
}
template <>
void FaustGPU<complex<double>>::pop_back()
{
	// Removes the last factor from the GPU chain; no-op on an empty chain.
	if(cpu_mat_ptrs.empty())
		return;
	if(use_ref_man)
		ref_man.release(cpu_mat_ptrs.back());
	auto funcs = (gm_MatArrayFunc_cuDoubleComplex*) this->marr_funcs;
	// the GPU matrix itself is freed here only when the ref manager doesn't own it
	funcs->erase_at(gpu_mat_arr, cpu_mat_ptrs.size() - 1, !use_ref_man);
	cpu_mat_ptrs.erase(cpu_mat_ptrs.end() - 1);
}
template <>
void FaustGPU<complex<double>>::push_back(MatGeneric<complex<double>,Cpu>* M)
{
	// Appends M at the end of the GPU factor chain, either by referencing an
	// already-uploaded GPU copy (when the ref manager knows M) or by uploading
	// M to the GPU.
	// Fixes vs. original: removed the unused dsm_funcs/gp_funcs locals, and an
	// unknown MatGeneric subtype now throws instead of leaving gpu_ref
	// uninitialized (it was then stored into cpu_gpu_map: undefined behavior).
	MatSparse<complex<double>, Cpu>* sp_mat;
	MatDense<complex<double>, Cpu>* ds_mat;
	void* gpu_ref; // handle of the GPU copy (sparse or dense)
	auto marr_funcs = (gm_MatArrayFunc_cuDoubleComplex*) this->marr_funcs;
	if(use_ref_man && cpu_gpu_map.find(M) != cpu_gpu_map.end())
	{
		// already known cpu/gpu pair: just reference the existing GPU matrix
		if(dynamic_cast<MatDense<complex<double>,Cpu>*>(M))
			marr_funcs->addgpu_dsm(gpu_mat_arr, cpu_gpu_map[M]);
		else
			// M is sparse
			marr_funcs->addgpu_spm(gpu_mat_arr, cpu_gpu_map[M]);
		cpu_mat_ptrs.push_back(M);
		ref_man.acquire(M);
		return;
	}
	if(sp_mat = dynamic_cast<MatSparse<complex<double>,Cpu>*>(M))
	{
		gpu_ref = marr_funcs->togpu_spm(gpu_mat_arr, sp_mat->getNbRow(), sp_mat->getNbCol(), sp_mat->getNonZeros(), sp_mat->getOuterIndexPtr(), sp_mat->getInnerIndexPtr(), (cuDoubleComplex*) reinterpret_cast<double*>(sp_mat->getValuePtr()));
	}
	else if(ds_mat = dynamic_cast<MatDense<complex<double>,Cpu>*>(M))
	{
		gpu_ref = marr_funcs->togpu_dsm(gpu_mat_arr, ds_mat->getNbRow(), ds_mat->getNbCol(), (cuDoubleComplex*) reinterpret_cast<double*>(ds_mat->getData()));
	}
	else
		throw std::runtime_error("FaustGPU::push_back() supports only MatSparse and MatDense factors.");
	cpu_mat_ptrs.push_back(M);
	if(use_ref_man)
	{
		cpu_gpu_map[M] = gpu_ref;
		ref_man.acquire(M);
	}
}
template <>
FaustGPU<complex<double>>::~FaustGPU()
{
	// Releases this Faust's GPU matrices and, when the last user goes away,
	// unloads the gpu_mod library and its function tables.
	// Fixes vs. original: gm_users is declared `static int *gm_users` in the
	// header, so `gm_users--` decremented the POINTER and `gm_users <= 0`
	// compared the pointer to 0 — the counter itself is decremented/tested now
	// (TODO confirm gm_users is set by init_gpu_mod before any FaustGPU dies).
	// Also: the function-table statics are now reset to nullptr after deletion
	// so load_gm_functions() can reload instead of reusing dangling pointers,
	// and gp_funcs is deleted too (it was leaked).
	if(gm_users != nullptr)
		(*gm_users)--;
	gm_MatArrayFunc_cuDoubleComplex* marr_funcs = (gm_MatArrayFunc_cuDoubleComplex*) this->marr_funcs;
	if(use_ref_man)
	{
		// release all gpu mats acquired by this Faust
		for(auto m: cpu_mat_ptrs)
			ref_man.release(m);
	}
	marr_funcs->free(gpu_mat_arr, ! use_ref_man); // delete used mats only if it doesn't use ref_man
	if(gm_users == nullptr || *gm_users <= 0)
	{
		gm_close_lib(gm_handle);
		delete marr_funcs;
		delete (gm_DenseMatFunc_cuDoubleComplex*) dsm_funcs;
		delete (gm_GenPurposeFunc_cuDoubleComplex*) gp_funcs;
		FaustGPU<complex<double>>::marr_funcs = nullptr;
		FaustGPU<complex<double>>::dsm_funcs = nullptr;
		FaustGPU<complex<double>>::gp_funcs = nullptr;
		FaustGPU<complex<double>>::gm_handle = nullptr;
	}
}
template<>
// Reference-manager deleter: called when the last reference to a cpu factor
// is released; frees its GPU counterpart and drops it from the cpu->gpu map.
Faust::RefManager FaustGPU<complex<double>>::ref_man([](void *fact)
{
gm_GenPurposeFunc_cuDoubleComplex* gp_funcs = (gm_GenPurposeFunc_cuDoubleComplex*) Faust::FaustGPU<complex<double>>::gp_funcs;
//normally cpu_gpu_map must contain the key fact if ref_man knew it (see ctor)
gp_funcs->free_mat(Faust::FaustGPU<complex<double>>::cpu_gpu_map[fact]);
Faust::FaustGPU<complex<double>>::cpu_gpu_map.erase(fact);
});
template<>
FaustGPU<complex<double>>::FaustGPU(const std::vector<MatGeneric<complex<double>,Cpu>*>& factors) : use_ref_man(true)
{
	// Builds a GPU factor chain mirroring the given cpu factors.
	// Fixes vs. original: removed the unused dsm_funcs/gp_funcs locals, and an
	// empty factor list now throws instead of reading factors[0] (UB).
	check_gpu_mod_loaded();
	this->load_gm_functions(); //lazy instantiation of the gpu_mod function tables
	auto marr_funcs = (gm_MatArrayFunc_cuDoubleComplex*) this->marr_funcs;
	gpu_mat_arr = marr_funcs->create();
	if(factors.empty())
		throw std::runtime_error("FaustGPU can't be built from an empty factor list.");
	// overall dims: rows of the first factor, cols of the last one
	nrows = factors[0]->getNbRow();
	ncols = factors.back()->getNbCol();
	for(auto m: factors)
		push_back(m);
}
template <>
void FaustGPU<complex<double>>::update(const Faust::MatGeneric<complex<double>,Cpu>* M, int32_t id)
{
// Re-uploads the data of factor id to its existing GPU matrix. M must be the
// very same cpu matrix the chain was built with (same address); only its
// content may have changed.
MatGeneric<complex<double>,Cpu>* M_ = const_cast<MatGeneric<complex<double>,Cpu>*>(M);
// I promise I won't touch M_ data! (const_cast only to satisfy the C API)
if(M != cpu_mat_ptrs[id])
{
throw std::runtime_error("It's not authorized to update from another cpu matrix than the original one.");
}
gm_MatArrayFunc_cuDoubleComplex* marr_funcs = (gm_MatArrayFunc_cuDoubleComplex*) this->marr_funcs;
MatSparse<complex<double>, Cpu>* sp_mat;
MatDense<complex<double>, Cpu>* ds_mat;
void* gpu_ref;
// if the dims are not equal between M_ and the gpu mat, an exception will be raised by gpu_mod
if(sp_mat = dynamic_cast<MatSparse<complex<double>,Cpu>*>(M_))
{
gpu_ref = marr_funcs->cpu_set_spm_at(gpu_mat_arr, sp_mat->getNbRow(), sp_mat->getNbCol(), sp_mat->getNonZeros(), sp_mat->getOuterIndexPtr(), sp_mat->getInnerIndexPtr(), (cuDoubleComplex*) reinterpret_cast<double*>(sp_mat->getValuePtr()), id);
}
else if(ds_mat = dynamic_cast<MatDense<complex<double>,Cpu>*>(M_))
{
gpu_ref = marr_funcs->cpu_set_dsm_at(gpu_mat_arr, ds_mat->getNbRow(), ds_mat->getNbCol(), (cuDoubleComplex*) reinterpret_cast<double*>(ds_mat->getData()), id);
}
// gpu_ref is not recorded because this is an assignment of data but the pointers don't change
}
template <>
void FaustGPU<complex<double>>::insert(const Faust::MatGeneric<complex<double>,Cpu>* M, int32_t id)
{
	// Inserts M at position id in the GPU factor chain, reusing an existing
	// GPU copy when the ref manager already knows M, uploading it otherwise.
	// Fix vs. original: an unknown MatGeneric subtype now throws instead of
	// leaving gpu_ref uninitialized (it was then stored into cpu_gpu_map).
	MatGeneric<complex<double>,Cpu>* M_ = const_cast<MatGeneric<complex<double>,Cpu>*>(M);
	gm_MatArrayFunc_cuDoubleComplex* marr_funcs = (gm_MatArrayFunc_cuDoubleComplex*) this->marr_funcs;
	MatSparse<complex<double>, Cpu>* sp_mat;
	MatDense<complex<double>, Cpu>* ds_mat;
	void* gpu_ref;
	if(use_ref_man && cpu_gpu_map.find(M_) != cpu_gpu_map.end())
	{
		// already known cpu/gpu pair: just reference the existing GPU matrix
		if(dynamic_cast<MatDense<complex<double>,Cpu>*>(M_))
			marr_funcs->insert_dsm(gpu_mat_arr, cpu_gpu_map[M_], id);
		else
			// M is sparse
			marr_funcs->insert_spm(gpu_mat_arr, cpu_gpu_map[M_], id);
		cpu_mat_ptrs.insert(cpu_mat_ptrs.begin()+id, M_);
		ref_man.acquire(M_);
		return;
	}
	// if the dims are not equal between M_ and the gpu mat, an exception will be raised by gpu_mod
	if(sp_mat = dynamic_cast<MatSparse<complex<double>,Cpu>*>(M_))
	{
		gpu_ref = marr_funcs->togpu_insert_spm(gpu_mat_arr, sp_mat->getNbRow(), sp_mat->getNbCol(), sp_mat->getNonZeros(), sp_mat->getOuterIndexPtr(), sp_mat->getInnerIndexPtr(), (cuDoubleComplex*) reinterpret_cast<double*>(sp_mat->getValuePtr()), id);
	}
	else if(ds_mat = dynamic_cast<MatDense<complex<double>,Cpu>*>(M_))
	{
		gpu_ref = marr_funcs->togpu_insert_dsm(gpu_mat_arr, ds_mat->getNbRow(), ds_mat->getNbCol(), (cuDoubleComplex*) reinterpret_cast<double*>(ds_mat->getData()), id);
	}
	else
		throw std::runtime_error("FaustGPU::insert() supports only MatSparse and MatDense factors.");
	cpu_mat_ptrs.insert(cpu_mat_ptrs.begin()+id, M_);
	if(use_ref_man)
	{
		cpu_gpu_map[M_] = gpu_ref;
		ref_man.acquire(M_);
	}
}
template<>
Real<complex<double>> FaustGPU<complex<double>>::spectral_norm(int32_t max_iter, Real<complex<double>> threshold)
{
// Spectral norm (largest singular value) of the factor chain, computed on the
// GPU — presumably by power iteration bounded by max_iter/threshold; confirm
// against gpu_mod. NOTE(review): the (float) cast narrows the threshold from
// double precision, and the std::cout line is debug noise in library code.
std::cout << "FaustGPU::spectral_norm" << std::endl;
gm_MatArrayFunc_cuDoubleComplex* marr_funcs = (gm_MatArrayFunc_cuDoubleComplex*) this->marr_funcs;
return marr_funcs->spectral_norm(gpu_mat_arr, (float) threshold, max_iter);
}
}
#include "faust_TransformHelper.h"
namespace Faust
{
template <>
void FaustGPU<double>::load_gm_functions()
{
// Lazily loads the gpu_mod function tables (matrix-array, dense-matrix and
// general-purpose) from the already-opened shared library handle gm_handle,
// and caches them in the class statics. No-op when already loaded
// (marr_funcs != nullptr is the "loaded" sentinel for all three tables).
gm_DenseMatFunc_double* dsm_funcs;
gm_MatArrayFunc_double* marr_funcs;
gm_GenPurposeFunc_double* gp_funcs;
if(FaustGPU<double>::marr_funcs == nullptr)
{
marr_funcs = new gm_MatArrayFunc_double(); // on the heap because it cannot be shared among FaustGPU instances if on the stack
dsm_funcs = new gm_DenseMatFunc_double();
gp_funcs = new gm_GenPurposeFunc_double();
load_marr_funcs_double(gm_handle, marr_funcs);
load_dsm_funcs_double(gm_handle, dsm_funcs);
load_gp_funcs_double(gm_handle, gp_funcs);
// publish into the statics (typed void* because templates can't cross the extern "C" lib interface)
FaustGPU<double>::marr_funcs = marr_funcs;
FaustGPU<double>::dsm_funcs = dsm_funcs;
FaustGPU<double>::gp_funcs = gp_funcs;
}
}
template <>
MatDense<double,Cpu> FaustGPU<double>::get_product(const bool transpose /* = false */, const bool conjugate /* = false */)
{
// Multiplies the whole GPU factor chain together and copies the product back
// into a cpu dense matrix. For a real scalar, conjugate only matters combined
// with transpose (OP_CONJTRANSP behaves as a transpose).
gm_Op op = OP_NOTRANSP;
if(transpose)
if(conjugate)
op = OP_CONJTRANSP;
else
op = OP_TRANSP;
gm_DenseMatFunc_double* dsm_funcs = (gm_DenseMatFunc_double*) this->dsm_funcs;
gm_MatArrayFunc_double* marr_funcs = (gm_MatArrayFunc_double*) this->marr_funcs;
double one;
set_one<double>(&one);
// chain_matmul returns a GPU-side dense matrix; query its dims to size the cpu copy
auto gpu_prod_mat_dense = marr_funcs->chain_matmul(gpu_mat_arr, one, op);
int32_t nrows, ncols;
dsm_funcs->info(gpu_prod_mat_dense, &nrows, &ncols);
MatDense<double, Cpu> gpu2cpu_mat(nrows, ncols);
dsm_funcs->tocpu(gpu_prod_mat_dense, (double*) reinterpret_cast<double*>(gpu2cpu_mat.getData()));
dsm_funcs->free(gpu_prod_mat_dense); // release the temporary GPU product
return gpu2cpu_mat;
}
template <>
Vect<double, Cpu> FaustGPU<double>::multiply(const Vect<double,Cpu>& v, const bool transpose, const bool conjugate)
{
// Computes op(F) * v where F is the GPU factor chain and op is selected by
// transpose/conjugate. The result is copied back into a cpu vector.
std::cout << "FaustGPU::multiply(Vect)" << std::endl;
double one;
set_one<double>(&one);
// op(F) has ncols rows when transposed, nrows otherwise
int32_t out_size = this->ncols; // default is transpose here
gm_Op op;
if(transpose && conjugate)
op = OP_CONJTRANSP;
else if(transpose)
op = OP_TRANSP;
else
{
op = OP_NOTRANSP;
out_size = this->nrows;
}
Vect<double, Cpu> out_vec(out_size);
gm_MatArrayFunc_double* marr_funcs = (gm_MatArrayFunc_double*) this->marr_funcs;
// v is treated as a size x 1 dense matrix; const_cast is needed by the C API
// (the data is not modified); the (double*) casts are redundant for this scalar
marr_funcs->chain_matmul_by_cpu_dsm_tocpu(gpu_mat_arr, one, op, (double*) reinterpret_cast<double*>(const_cast<double*>(v.getData())), v.size(), 1, (double*) reinterpret_cast<double*>(out_vec.getData()));
return out_vec;
}
template <>
MatDense<double, Cpu> FaustGPU<double>::multiply(const MatGeneric<double,Cpu>* A, const bool transpose, const bool conjugate)
{
	// Computes op(F) * A where F is the GPU factor chain and op is selected by
	// transpose/conjugate (OP_NOTRANSP / OP_TRANSP / OP_CONJTRANSP).
	// Returns the product as a cpu dense matrix.
	// Fixes vs. original: removed the unreachable togpu_spm call after the
	// sparse-operand throw, removed the stray debug print and the redundant
	// reinterpret_casts, and an unknown MatGeneric subtype now throws instead
	// of silently returning a matrix whose content was never computed.
	const MatDense<double, Cpu>* ds_mat;
	int32_t out_nrows;
	double one;
	set_one<double>(&one);
	gm_Op op;
	if(transpose && conjugate)
		op = OP_CONJTRANSP;
	else if(transpose)
		op = OP_TRANSP;
	else
		op = OP_NOTRANSP;
	// op(F) has ncols rows when transposed, nrows otherwise
	if(transpose)
		out_nrows = this->ncols;
	else
		out_nrows = this->nrows;
	MatDense<double, Cpu> out_mat(out_nrows, A->getNbCol());
	gm_MatArrayFunc_double* marr_funcs = (gm_MatArrayFunc_double*) this->marr_funcs;
	if(dynamic_cast<const MatSparse<double,Cpu>*>(A))
	{
		// TODO: sparse right-hand side (togpu_spm + chain matmul) not implemented yet
		throw std::runtime_error("FaustGPU::multiply() by MatSparse isn't yet impl.");
	}
	else if(ds_mat = dynamic_cast<const MatDense<double,Cpu>*>(A))
	{
		// const_cast required by the C API (data is not modified)
		marr_funcs->chain_matmul_by_cpu_dsm_tocpu(gpu_mat_arr, one, op, const_cast<double*>(ds_mat->getData()), ds_mat->getNbRow(), ds_mat->getNbCol(), out_mat.getData());
	}
	else
		throw std::runtime_error("FaustGPU::multiply() supports only MatSparse and MatDense operands.");
	return out_mat;
}
template <>
void FaustGPU<double>::pop_front()
{
	// Removes the first factor from the GPU chain; no-op on an empty chain.
	if(cpu_mat_ptrs.empty())
		return;
	if(use_ref_man)
		ref_man.release(cpu_mat_ptrs.front());
	auto funcs = (gm_MatArrayFunc_double*) this->marr_funcs;
	// the GPU matrix itself is freed here only when the ref manager doesn't own it
	funcs->erase_at(gpu_mat_arr, 0, !use_ref_man);
	cpu_mat_ptrs.erase(cpu_mat_ptrs.begin());
}
template <>
void FaustGPU<double>::pop_back()
{
	// Removes the last factor from the GPU chain; no-op on an empty chain.
	if(cpu_mat_ptrs.empty())
		return;
	if(use_ref_man)
		ref_man.release(cpu_mat_ptrs.back());
	auto funcs = (gm_MatArrayFunc_double*) this->marr_funcs;
	// the GPU matrix itself is freed here only when the ref manager doesn't own it
	funcs->erase_at(gpu_mat_arr, cpu_mat_ptrs.size() - 1, !use_ref_man);
	cpu_mat_ptrs.erase(cpu_mat_ptrs.end() - 1);
}
template <>
void FaustGPU<double>::push_back(MatGeneric<double,Cpu>* M)
{
	// Appends M at the end of the GPU factor chain, either by referencing an
	// already-uploaded GPU copy (when the ref manager knows M) or by uploading
	// M to the GPU.
	// Fixes vs. original: removed the unused dsm_funcs/gp_funcs locals, and an
	// unknown MatGeneric subtype now throws instead of leaving gpu_ref
	// uninitialized (it was then stored into cpu_gpu_map: undefined behavior).
	MatSparse<double, Cpu>* sp_mat;
	MatDense<double, Cpu>* ds_mat;
	void* gpu_ref; // handle of the GPU copy (sparse or dense)
	auto marr_funcs = (gm_MatArrayFunc_double*) this->marr_funcs;
	if(use_ref_man && cpu_gpu_map.find(M) != cpu_gpu_map.end())
	{
		// already known cpu/gpu pair: just reference the existing GPU matrix
		if(dynamic_cast<MatDense<double,Cpu>*>(M))
			marr_funcs->addgpu_dsm(gpu_mat_arr, cpu_gpu_map[M]);
		else
			// M is sparse
			marr_funcs->addgpu_spm(gpu_mat_arr, cpu_gpu_map[M]);
		cpu_mat_ptrs.push_back(M);
		ref_man.acquire(M);
		return;
	}
	if(sp_mat = dynamic_cast<MatSparse<double,Cpu>*>(M))
	{
		gpu_ref = marr_funcs->togpu_spm(gpu_mat_arr, sp_mat->getNbRow(), sp_mat->getNbCol(), sp_mat->getNonZeros(), sp_mat->getOuterIndexPtr(), sp_mat->getInnerIndexPtr(), sp_mat->getValuePtr());
	}
	else if(ds_mat = dynamic_cast<MatDense<double,Cpu>*>(M))
	{
		gpu_ref = marr_funcs->togpu_dsm(gpu_mat_arr, ds_mat->getNbRow(), ds_mat->getNbCol(), ds_mat->getData());
	}
	else
		throw std::runtime_error("FaustGPU::push_back() supports only MatSparse and MatDense factors.");
	cpu_mat_ptrs.push_back(M);
	if(use_ref_man)
	{
		cpu_gpu_map[M] = gpu_ref;
		ref_man.acquire(M);
	}
}
template <>
FaustGPU<double>::~FaustGPU()
{
	// Releases this Faust's GPU matrices and, when the last user goes away,
	// unloads the gpu_mod library and its function tables.
	// Fixes vs. original: gm_users is declared `static int *gm_users` in the
	// header, so `gm_users--` decremented the POINTER and `gm_users <= 0`
	// compared the pointer to 0 — the counter itself is decremented/tested now
	// (TODO confirm gm_users is set by init_gpu_mod before any FaustGPU dies).
	// Also: the function-table statics are now reset to nullptr after deletion
	// so load_gm_functions() can reload instead of reusing dangling pointers,
	// and gp_funcs is deleted too (it was leaked).
	if(gm_users != nullptr)
		(*gm_users)--;
	gm_MatArrayFunc_double* marr_funcs = (gm_MatArrayFunc_double*) this->marr_funcs;
	if(use_ref_man)
	{
		// release all gpu mats acquired by this Faust
		for(auto m: cpu_mat_ptrs)
			ref_man.release(m);
	}
	marr_funcs->free(gpu_mat_arr, ! use_ref_man); // delete used mats only if it doesn't use ref_man
	if(gm_users == nullptr || *gm_users <= 0)
	{
		gm_close_lib(gm_handle);
		delete marr_funcs;
		delete (gm_DenseMatFunc_double*) dsm_funcs;
		delete (gm_GenPurposeFunc_double*) gp_funcs;
		FaustGPU<double>::marr_funcs = nullptr;
		FaustGPU<double>::dsm_funcs = nullptr;
		FaustGPU<double>::gp_funcs = nullptr;
		FaustGPU<double>::gm_handle = nullptr;
	}
}
template<>
// Reference-manager deleter: called when the last reference to a cpu factor
// is released; frees its GPU counterpart and drops it from the cpu->gpu map.
Faust::RefManager FaustGPU<double>::ref_man([](void *fact)
{
gm_GenPurposeFunc_double* gp_funcs = (gm_GenPurposeFunc_double*) Faust::FaustGPU<double>::gp_funcs;
//normally cpu_gpu_map must contain the key fact if ref_man knew it (see ctor)
gp_funcs->free_mat(Faust::FaustGPU<double>::cpu_gpu_map[fact]);
Faust::FaustGPU<double>::cpu_gpu_map.erase(fact);
});
template<>
FaustGPU<double>::FaustGPU(const std::vector<MatGeneric<double,Cpu>*>& factors) : use_ref_man(true)
{
	// Builds a GPU factor chain mirroring the given cpu factors.
	// Fixes vs. original: removed the unused dsm_funcs/gp_funcs locals, and an
	// empty factor list now throws instead of reading factors[0] (UB).
	check_gpu_mod_loaded();
	this->load_gm_functions(); //lazy instantiation of the gpu_mod function tables
	auto marr_funcs = (gm_MatArrayFunc_double*) this->marr_funcs;
	gpu_mat_arr = marr_funcs->create();
	if(factors.empty())
		throw std::runtime_error("FaustGPU can't be built from an empty factor list.");
	// overall dims: rows of the first factor, cols of the last one
	nrows = factors[0]->getNbRow();
	ncols = factors.back()->getNbCol();
	for(auto m: factors)
		push_back(m);
}
template <>
void FaustGPU<double>::update(const Faust::MatGeneric<double,Cpu>* M, int32_t id)
{
// Re-uploads the data of factor id to its existing GPU matrix. M must be the
// very same cpu matrix the chain was built with (same address); only its
// content may have changed.
MatGeneric<double,Cpu>* M_ = const_cast<MatGeneric<double,Cpu>*>(M);
// I promise I won't touch M_ data! (const_cast only to satisfy the C API)
if(M != cpu_mat_ptrs[id])
{
throw std::runtime_error("It's not authorized to update from another cpu matrix than the original one.");
}
gm_MatArrayFunc_double* marr_funcs = (gm_MatArrayFunc_double*) this->marr_funcs;
MatSparse<double, Cpu>* sp_mat;
MatDense<double, Cpu>* ds_mat;
void* gpu_ref;
// if the dims are not equal between M_ and the gpu mat, an exception will be raised by gpu_mod
if(sp_mat = dynamic_cast<MatSparse<double,Cpu>*>(M_))
{
gpu_ref = marr_funcs->cpu_set_spm_at(gpu_mat_arr, sp_mat->getNbRow(), sp_mat->getNbCol(), sp_mat->getNonZeros(), sp_mat->getOuterIndexPtr(), sp_mat->getInnerIndexPtr(), (double*) reinterpret_cast<double*>(sp_mat->getValuePtr()), id);
}
else if(ds_mat = dynamic_cast<MatDense<double,Cpu>*>(M_))
{
gpu_ref = marr_funcs->cpu_set_dsm_at(gpu_mat_arr, ds_mat->getNbRow(), ds_mat->getNbCol(), (double*) reinterpret_cast<double*>(ds_mat->getData()), id);
}
// gpu_ref is not recorded because this is an assignment of data but the pointers don't change
}
template <>
void FaustGPU<double>::insert(const Faust::MatGeneric<double,Cpu>* M, int32_t id)
{
	// Inserts M at position id in the GPU factor chain, reusing an existing
	// GPU copy when the ref manager already knows M, uploading it otherwise.
	// Fix vs. original: an unknown MatGeneric subtype now throws instead of
	// leaving gpu_ref uninitialized (it was then stored into cpu_gpu_map).
	MatGeneric<double,Cpu>* M_ = const_cast<MatGeneric<double,Cpu>*>(M);
	gm_MatArrayFunc_double* marr_funcs = (gm_MatArrayFunc_double*) this->marr_funcs;
	MatSparse<double, Cpu>* sp_mat;
	MatDense<double, Cpu>* ds_mat;
	void* gpu_ref;
	if(use_ref_man && cpu_gpu_map.find(M_) != cpu_gpu_map.end())
	{
		// already known cpu/gpu pair: just reference the existing GPU matrix
		if(dynamic_cast<MatDense<double,Cpu>*>(M_))
			marr_funcs->insert_dsm(gpu_mat_arr, cpu_gpu_map[M_], id);
		else
			// M is sparse
			marr_funcs->insert_spm(gpu_mat_arr, cpu_gpu_map[M_], id);
		cpu_mat_ptrs.insert(cpu_mat_ptrs.begin()+id, M_);
		ref_man.acquire(M_);
		return;
	}
	// if the dims are not equal between M_ and the gpu mat, an exception will be raised by gpu_mod
	if(sp_mat = dynamic_cast<MatSparse<double,Cpu>*>(M_))
	{
		gpu_ref = marr_funcs->togpu_insert_spm(gpu_mat_arr, sp_mat->getNbRow(), sp_mat->getNbCol(), sp_mat->getNonZeros(), sp_mat->getOuterIndexPtr(), sp_mat->getInnerIndexPtr(), sp_mat->getValuePtr(), id);
	}
	else if(ds_mat = dynamic_cast<MatDense<double,Cpu>*>(M_))
	{
		gpu_ref = marr_funcs->togpu_insert_dsm(gpu_mat_arr, ds_mat->getNbRow(), ds_mat->getNbCol(), ds_mat->getData(), id);
	}
	else
		throw std::runtime_error("FaustGPU::insert() supports only MatSparse and MatDense factors.");
	cpu_mat_ptrs.insert(cpu_mat_ptrs.begin()+id, M_);
	if(use_ref_man)
	{
		cpu_gpu_map[M_] = gpu_ref;
		ref_man.acquire(M_);
	}
}
template<>
Real<double> FaustGPU<double>::spectral_norm(int32_t max_iter, Real<double> threshold)
{
// Spectral norm (largest singular value) of the factor chain, computed on the
// GPU — presumably by power iteration bounded by max_iter/threshold; confirm
// against gpu_mod. NOTE(review): the (float) cast narrows the threshold from
// double precision, and the std::cout line is debug noise in library code.
std::cout << "FaustGPU::spectral_norm" << std::endl;
gm_MatArrayFunc_double* marr_funcs = (gm_MatArrayFunc_double*) this->marr_funcs;
return marr_funcs->spectral_norm(gpu_mat_arr, (float) threshold, max_iter);
}
}
#include "faust_TransformHelper.h"
namespace Faust
{
template <>
// CMake-configured template: @FAUST_SCALAR_FOR_GM@/@GM_SCALAR@ are substituted
// at generation time (this file is the .in source of the per-scalar .cpp).
void FaustGPU<@FAUST_SCALAR_FOR_GM@>::load_gm_functions()
{
// Lazily loads the gpu_mod function tables from the already-opened shared
// library handle gm_handle and caches them in the class statics. No-op when
// already loaded (marr_funcs != nullptr is the "loaded" sentinel).
gm_DenseMatFunc_@GM_SCALAR@* dsm_funcs;
gm_MatArrayFunc_@GM_SCALAR@* marr_funcs;
gm_GenPurposeFunc_@GM_SCALAR@* gp_funcs;
if(FaustGPU<@FAUST_SCALAR_FOR_GM@>::marr_funcs == nullptr)
{
marr_funcs = new gm_MatArrayFunc_@GM_SCALAR@(); // on the heap because it cannot be shared among FaustGPU instances if on the stack
dsm_funcs = new gm_DenseMatFunc_@GM_SCALAR@();
gp_funcs = new gm_GenPurposeFunc_@GM_SCALAR@();
load_marr_funcs_@GM_SCALAR@(gm_handle, marr_funcs);
load_dsm_funcs_@GM_SCALAR@(gm_handle, dsm_funcs);
load_gp_funcs_@GM_SCALAR@(gm_handle, gp_funcs);
// publish into the statics (typed void* because templates can't cross the extern "C" lib interface)
FaustGPU<@FAUST_SCALAR_FOR_GM@>::marr_funcs = marr_funcs;
FaustGPU<@FAUST_SCALAR_FOR_GM@>::dsm_funcs = dsm_funcs;
FaustGPU<@FAUST_SCALAR_FOR_GM@>::gp_funcs = gp_funcs;
}
}
template <>
......@@ -28,6 +49,7 @@ namespace Faust
template <>
Vect<@FAUST_SCALAR_FOR_GM@, Cpu> FaustGPU<@FAUST_SCALAR_FOR_GM@>::multiply(const Vect<@FAUST_SCALAR_FOR_GM@,Cpu>& v, const bool transpose, const bool conjugate)
{
std::cout << "FaustGPU::multiply(Vect)" << std::endl;
@GM_SCALAR@ one;
set_one<@GM_SCALAR@>(&one);
int32_t out_size = this->ncols; // default is transpose here
......@@ -54,6 +76,7 @@ namespace Faust
template <>
MatDense<@FAUST_SCALAR_FOR_GM@, Cpu> FaustGPU<@FAUST_SCALAR_FOR_GM@>::multiply(const MatGeneric<@FAUST_SCALAR_FOR_GM@,Cpu>* A, const bool transpose, const bool conjugate)
{
std::cout << "FaustGPU::multiply(MatGeneric)" << std::endl;
const MatSparse<@FAUST_SCALAR_FOR_GM@, Cpu>* sp_mat;
const MatDense<@FAUST_SCALAR_FOR_GM@, Cpu>* ds_mat;
int32_t out_nrows;
......@@ -213,44 +236,27 @@ namespace Faust
template<>
FaustGPU<@FAUST_SCALAR_FOR_GM@>::FaustGPU(const std::vector<MatGeneric<@FAUST_SCALAR_FOR_GM@,Cpu>*>& factors) : use_ref_man(true)
{
// std::cout << "FaustGPU<@FAUST_SCALAR_FOR_GM@>::FaustGPU()" << " marr_funcs:" << marr_funcs << std::endl;
check_gpu_mod_loaded();
gm_DenseMatFunc_@GM_SCALAR@* dsm_funcs;
gm_MatArrayFunc_@GM_SCALAR@* marr_funcs;
gm_GenPurposeFunc_@GM_SCALAR@* gp_funcs;
if(this->marr_funcs == nullptr)
{
marr_funcs = new gm_MatArrayFunc_@GM_SCALAR@(); // on the heap because it cannot be shared among FaustGPU instances if on the stack
dsm_funcs = new gm_DenseMatFunc_@GM_SCALAR@();
gp_funcs = new gm_GenPurposeFunc_@GM_SCALAR@();
load_marr_funcs_@GM_SCALAR@(gm_handle, marr_funcs);
load_dsm_funcs_@GM_SCALAR@(gm_handle, dsm_funcs);
load_gp_funcs_@GM_SCALAR@(gm_handle, gp_funcs);
this->marr_funcs = marr_funcs;
this->dsm_funcs = dsm_funcs;
this->gp_funcs = gp_funcs;
}
else
{
dsm_funcs = (gm_DenseMatFunc_@GM_SCALAR@*) this->dsm_funcs;
marr_funcs = (gm_MatArrayFunc_@GM_SCALAR@*) this->marr_funcs;
gp_funcs = (gm_GenPurposeFunc_@GM_SCALAR@*) this->gp_funcs;
}
{
// std::cout << "FaustGPU<@FAUST_SCALAR_FOR_GM@>::FaustGPU()" << " marr_funcs:" << marr_funcs << std::endl;
check_gpu_mod_loaded();
this->load_gm_functions(); //lazy instantiation
// std::cout << "FaustGPU<@FAUST_SCALAR_FOR_GM@>::FaustGPU() factors size: " << factors.size() << " marr_funcs:" << marr_funcs << std::endl;
gpu_mat_arr = marr_funcs->create();
// std::cout << "FaustGPU<@FAUST_SCALAR_FOR_GM@>::FaustGPU() factors size: " << factors.size() << std::endl;
nrows = factors[0]->getNbRow();
ncols = (*(factors.end()-1))->getNbCol();
for(auto m: factors)
{
push_back(m);
}
// std::cout << "FaustGPU() factors size, matarray size: " << factors.size() << " " << marr_funcs->size(gpu_mat_arr) << std::endl;
gm_DenseMatFunc_@GM_SCALAR@* dsm_funcs = (gm_DenseMatFunc_@GM_SCALAR@*) this->dsm_funcs;
gm_MatArrayFunc_@GM_SCALAR@* marr_funcs = (gm_MatArrayFunc_@GM_SCALAR@*) this->marr_funcs;
gm_GenPurposeFunc_@GM_SCALAR@* gp_funcs = (gm_GenPurposeFunc_@GM_SCALAR@*) this->gp_funcs;
// std::cout << "FaustGPU<@FAUST_SCALAR_FOR_GM@>::FaustGPU() factors size: " << factors.size() << " marr_funcs:" << marr_funcs << std::endl;
gpu_mat_arr = marr_funcs->create();
// std::cout << "FaustGPU<@FAUST_SCALAR_FOR_GM@>::FaustGPU() factors size: " << factors.size() << std::endl;
nrows = factors[0]->getNbRow();
ncols = (*(factors.end()-1))->getNbCol();
for(auto m: factors)
{
push_back(m);
}
// std::cout << "FaustGPU() factors size, matarray size: " << factors.size() << " " << marr_funcs->size(gpu_mat_arr) << std::endl;
}
template <>
void FaustGPU<@FAUST_SCALAR_FOR_GM@>::update(const Faust::MatGeneric<@FAUST_SCALAR_FOR_GM@,Cpu>* M, int32_t id)
......@@ -294,7 +300,6 @@ namespace Faust
MatSparse<@FAUST_SCALAR_FOR_GM@, Cpu>* sp_mat;
MatDense<@FAUST_SCALAR_FOR_GM@, Cpu>* ds_mat;
void* gpu_ref;
if(use_ref_man && cpu_gpu_map.find(M_) != cpu_gpu_map.end())
{
// already known cpu, gpu mats
......@@ -341,6 +346,7 @@ namespace Faust
template<>
Real<@FAUST_SCALAR_FOR_GM@> FaustGPU<@FAUST_SCALAR_FOR_GM@>::spectral_norm(int32_t max_iter, Real<@FAUST_SCALAR_FOR_GM@> threshold)
{
std::cout << "FaustGPU::spectral_norm" << std::endl;
gm_MatArrayFunc_@GM_SCALAR@* marr_funcs = (gm_MatArrayFunc_@GM_SCALAR@*) this->marr_funcs;
return marr_funcs->spectral_norm(gpu_mat_arr, (float) threshold, max_iter);
}
......
......@@ -19,7 +19,8 @@ namespace Faust
// multiply(const MatDense<Cpu,FPP> &other);
// multiply(const MatSparse<Cpu,FPP> &other);
// multiply(const Vect<Cpu,FPP> &vec);
// multiply(const MatDense<GPU2, FPP> &other);
void multiply(MatDense<FPP, GPU2> &other, const char op_this='N');
MatDense<FPP, Cpu> tocpu();
private:
static void* dsm_funcs;
gm_DenseMat_t gpu_mat;
......
//TODO: move to CPP
template<>
Faust::MatDense<@FAUST_SCALAR_FOR_GM@,GPU2>::MatDense(const @FAUST_SCALAR_FOR_GM@* data, const faust_unsigned_int nbRow, const faust_unsigned_int nbCol)
Faust::MatDense<@FAUST_SCALAR_FOR_GM@,GPU2>::MatDense(const @FAUST_SCALAR_FOR_GM@* data,
const faust_unsigned_int nbRow,
const faust_unsigned_int nbCol)
{
// MatDense from GPU
this->dim1 = nbRow;
this->dim2 = nbCol;
FaustGPU<@FAUST_SCALAR_FOR_GM@>::load_gm_functions();
if(this->dsm_funcs == nullptr)
this->dsm_funcs = new gm_DenseMatFunc_@GM_SCALAR@();
this->dsm_funcs = FaustGPU<@FAUST_SCALAR_FOR_GM@>::dsm_funcs;
gpu_mat = ((gm_DenseMatFunc_@GM_SCALAR@*)this->dsm_funcs)->togpu(nbRow, nbCol, const_cast<@FAUST_SCALAR_FOR_GM@*>(data));
}
template<>
void Faust::MatDense<@FAUST_SCALAR_FOR_GM@,GPU2>::multiply(MatDense<@FAUST_SCALAR_FOR_GM@, GPU2> &other, const char op_this)
{
	// other = op(this) * other, where op is selected by op_this:
	// 'N' (no-op), 'T' (transpose), 'H' (conjugate-transpose).
	// Fixes vs. original: op_this was ignored (always OP_NOTRANSP) and the
	// resulting row count was wrong in the transpose/adjoint case (the TODO).
	gm_Op op;
	switch(op_this)
	{
		case 'N': op = OP_NOTRANSP; break;
		case 'T': op = OP_TRANSP; break;
		case 'H': op = OP_CONJTRANSP; break;
		default: throw std::runtime_error("MatDense::multiply(): op_this must be 'N', 'T' or 'H'.");
	}
	auto dsm_funcs = ((gm_DenseMatFunc_@GM_SCALAR@*)this->dsm_funcs);
	dsm_funcs->mul_gpu_dsm_ext(this->gpu_mat, other.gpu_mat, other.gpu_mat, op, OP_NOTRANSP);
	// the product has as many rows as op(this)
	other.dim1 = op == OP_NOTRANSP ? dim1 : dim2;
}
gpu_mat = ((gm_DenseMatFunc_@GM_SCALAR@*)this->dsm_funcs)->togpu(nbRow, nbCol, const_cast<double*>(data));
template<>
Faust::MatDense<@FAUST_SCALAR_FOR_GM@, Cpu> Faust::MatDense<@FAUST_SCALAR_FOR_GM@,GPU2>::tocpu()
{
// Copies this GPU dense matrix back into a newly allocated cpu MatDense.
// NOTE(review): getData() is passed without the scalar cast used elsewhere —
// confirm this template expands/compiles for the complex @GM_SCALAR@ variant.
MatDense<@FAUST_SCALAR_FOR_GM@, Cpu> cpu_mat(dim1, dim2);
auto dsm_funcs = ((gm_DenseMatFunc_@GM_SCALAR@*)this->dsm_funcs);
dsm_funcs->tocpu(gpu_mat, cpu_mat.getData());
return cpu_mat; //TODO: move constructor for MatDense<FPP, Cpu>
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment