Implement the spectral norm (operator L2 norm) using gpu_mod@7d259afe.

Two methods have been implemented, only the more efficient is enabled (except for a special case which poses problem: see issue #140). Minor change: review member functions and arguments constness in TransformHelper. [skip ci]

Implement the spectral norm (operator L2 norm) using gpu_mod@7d259afe.
528ff0ec · hhakim · 4ba82adb · 528ff0ec · 528ff0ec · 528ff0ec
Commit 528ff0ec authored 4 years ago by hhakim
--- a/src/faust_linear_operator/CPU/faust_TransformHelper.h
+++ b/src/faust_linear_operator/CPU/faust_TransformHelper.h
@@ -106,11 +106,11 @@ namespace Faust {
 			public:
 			TransformHelper(const std::vector<MatGeneric<FPP,Cpu> *>& facts, const FPP lambda_ = (FPP)1.0, const bool optimizedCopy=false, const bool cloning_fact = true, const bool internal_call=false);
 			TransformHelper();
-			TransformHelper(TransformHelper<FPP,Cpu>* th_left, TransformHelper<FPP,Cpu>* th_right);
+			TransformHelper(const TransformHelper<FPP,Cpu>* th_left, const TransformHelper<FPP,Cpu>* th_right);
 			TransformHelper(TransformHelper<FPP,Cpu>* th);
 			TransformHelper(TransformHelper<FPP,Cpu>& th);
 			void operator=(TransformHelper<FPP,Cpu>& th);
-			TransformHelper(TransformHelper<FPP,Cpu>* th, bool transpose, bool conjugate);
+			TransformHelper(const TransformHelper<FPP,Cpu>* th, bool transpose, bool conjugate);
 			TransformHelper(TransformHelper<FPP,Cpu>* th, Slice s[2]);
 			TransformHelper(TransformHelper<FPP,Cpu>* th, faust_unsigned_int* row_ids, faust_unsigned_int num_rows, faust_unsigned_int* col_ids, faust_unsigned_int num_cols);
 			TransformHelper(Transform<FPP,Cpu> &t, const bool moving=false);
@@ -127,7 +127,7 @@ namespace Faust {
 			int get_mul_order_opt_mode() const;
 			MatDense<FPP, Cpu> multiply(const MatSparse<FPP,Cpu> A, const bool transpose=false, const bool conjugate=false);
-			TransformHelper<FPP, Cpu>* multiply(TransformHelper<FPP, Cpu>*);
+			TransformHelper<FPP, Cpu>* multiply(const TransformHelper<FPP, Cpu>*) const;
 			TransformHelper<FPP, Cpu>* multiply(FPP& scalar);
 			template<typename Head, typename ... Tail>
 				void push_back_(Head& h, Tail&... t);
@@ -186,7 +186,7 @@ namespace Faust {
 			double spectralNorm(const int nbr_iter_max, double threshold, int &flag) const;
 			TransformHelper<FPP,Cpu>* transpose();
 			TransformHelper<FPP,Cpu>* conjugate();
-			TransformHelper<FPP,Cpu>* adjoint();
+			TransformHelper<FPP,Cpu>* adjoint() const;
 			TransformHelper<FPP,Cpu>* vertcat(const TransformHelper<FPP,Cpu>*);
 			TransformHelper<FPP,Cpu>* horzcat(const TransformHelper<FPP,Cpu>*);
 			bool isTransposed() const;
@@ -242,7 +242,7 @@ namespace Faust {
 			MatGeneric<FPP,Cpu>* get_gen_fact_nonconst(const faust_unsigned_int id) const;
 			void update(const MatGeneric<FPP,Cpu>& M, const faust_unsigned_int fact_id);
 			private:
-			void copy_slices(TransformHelper<FPP, Cpu>* th, const bool transpose = false);
+			void copy_slices(const TransformHelper<FPP, Cpu>* th, const bool transpose = false);
 			const MatGeneric<FPP,Cpu>* get_gen_fact(const faust_unsigned_int id) const;
 		};
 }

--- a/src/faust_linear_operator/CPU/faust_TransformHelper.hpp
+++ b/src/faust_linear_operator/CPU/faust_TransformHelper.hpp
@@ -96,7 +96,7 @@ namespace Faust {
 	}
 	template<typename FPP>
-		TransformHelper<FPP,Cpu>::TransformHelper(TransformHelper<FPP,Cpu>* th_left, TransformHelper<FPP,Cpu>* th_right)
+		TransformHelper<FPP,Cpu>::TransformHelper(const TransformHelper<FPP,Cpu>* th_left, const TransformHelper<FPP,Cpu>* th_right)
 		: TransformHelper<FPP,Cpu>()
 		{
 			bool right_left_transposed = th_left->is_transposed && th_right->is_transposed;
@@ -115,7 +115,7 @@ namespace Faust {
 		}
 	template<typename FPP>
-		TransformHelper<FPP,Cpu>::TransformHelper(TransformHelper<FPP,Cpu>* th, bool transpose, bool conjugate) : TransformHelper<FPP,Cpu>()
+		TransformHelper<FPP,Cpu>::TransformHelper(const TransformHelper<FPP,Cpu>* th, bool transpose, bool conjugate) : TransformHelper<FPP,Cpu>()
 	{
 		this->transform = th->transform;
 		this->is_transposed = transpose?!th->is_transposed:th->is_transposed;
@@ -735,7 +735,7 @@ namespace Faust {
 		}
 	template<typename FPP>
-		TransformHelper<FPP, Cpu>* TransformHelper<FPP,Cpu>::multiply(TransformHelper<FPP, Cpu>* th_right)
+		TransformHelper<FPP, Cpu>* TransformHelper<FPP,Cpu>::multiply(const TransformHelper<FPP, Cpu>* th_right) const
 		{
 			return new TransformHelper<FPP,Cpu>(this, th_right);
 		}
@@ -1092,7 +1092,7 @@ namespace Faust {
 		}
 	template<typename FPP>
-		void TransformHelper<FPP, Cpu>::copy_slices(TransformHelper<FPP, Cpu> *th, const bool transpose /* default to false */)
+		void TransformHelper<FPP, Cpu>::copy_slices(const TransformHelper<FPP, Cpu> *th, const bool transpose /* default to false */)
 		{
 			this->slices[0].copy(th->slices[0]);
 			this->slices[1].copy(th->slices[1]);
@@ -1252,6 +1252,44 @@ namespace Faust {
 	template<typename FPP>
 		double TransformHelper<FPP,Cpu>::spectralNorm(const int nbr_iter_max, double threshold, int &flag) const
 		{
+#ifdef USE_GPU_MOD
+			if(gpu_faust != nullptr)
+			{
+				bool purely_sparse = true;
+				bool all_squares = true; // it's about factors
+				for(auto f: this->transform->data)
+				{
+					if(dynamic_cast<MatSparse<FPP,Cpu>*>(f) == nullptr)
+						purely_sparse = false;
+					all_squares = all_squares && f->getNbCol() == f->getNbRow();
+				}
+//				std::cout << "purely_sparse=" << purely_sparse << " all_squares=" << all_squares << std::endl;
+				if(purely_sparse && ! all_squares)
+				{
+//					std::cout << "method 1 used for gpu spectral norm" << std::endl;
+					// this method is slower because of the copy on CPU side of the transposed factors to construct FF'
+					TransformHelper<FPP,Cpu> *AtA;
+					TransformHelper<FPP,Cpu>* At = this->adjoint();
+					if(this->getNbCol() < this->getNbRow())
+					{
+						AtA = At->multiply(this);
+					}
+					else
+					{
+						AtA = this->multiply(At);
+					}
+					AtA->enable_gpu_meth_for_mul(); //TODO: it should not be needed if A is already GPU enabled
+					FPP maxAbsValue = std::sqrt(AtA->gpu_faust->power_iteration(nbr_iter_max, threshold, flag));
+					delete AtA;
+				}
+				else
+				{
+//					std::cout << "method 2 used for gpu spectral norm" << std::endl;
+					FPP maxAbsValue = std::sqrt(this->gpu_faust->power_iteration2(nbr_iter_max, threshold, flag));
+					return absValue(maxAbsValue);
+				}
+			}
+#endif
 			vector <MatGeneric<FPP, Cpu>*>& orig_facts = this->transform->data;
 			int start_id, end_id;
 			this->transform->get_nonortho_interior_prod_ids(start_id, end_id);
@@ -1472,7 +1510,7 @@ namespace Faust {
 		}
 	template<typename FPP>
-		TransformHelper<FPP,Cpu>* TransformHelper<FPP,Cpu>::adjoint()
+		TransformHelper<FPP,Cpu>* TransformHelper<FPP,Cpu>::adjoint() const
 		{
 			return new TransformHelper<FPP,Cpu>(this, true, true);
 		}

--- a/src/faust_linear_operator/CPU/faust_gpu_mod.h
+++ b/src/faust_linear_operator/CPU/faust_gpu_mod.h
@@ -53,6 +53,14 @@ namespace Faust
 		void insert(const Faust::MatGeneric<FPP,Cpu>* M, int32_t id);
+		/** TODO: this function should be deleted because it definitely slower than the version 2 */
+		FPP power_iteration(int32_t max_iter, Real<FPP> threshold, int& flag);
+		/** This version differs from power_iteration in the way it computes F'Fv in this order F'(Fv)
+		 * so that it doesn't need to load all factors of F'F into GPU memory but only F.
+		 */
+		FPP power_iteration2(int32_t max_iter, Real<FPP> threshold, int& flag);
 		/* Update on gpu the copy matrix of M (M must have already been loaded otherwise an exception is raised) */
 		void update(const Faust::MatGeneric<FPP,Cpu>* M, int32_t id);

--- a/src/faust_linear_operator/CPU/faust_gpu_mod_gen.hpp.in
+++ b/src/faust_linear_operator/CPU/faust_gpu_mod_gen.hpp.in
@@ -334,4 +334,100 @@ namespace Faust
 		}
 	}
+	template<>
+		@FAUST_SCALAR_FOR_GM@ FaustGPU<@FAUST_SCALAR_FOR_GM@>::power_iteration(int32_t max_iter, Real<@FAUST_SCALAR_FOR_GM@> threshold, int& flag)
+		{
+			int i = 0;
+			int32_t nrows, ncols, xk_bufsz=0, xk2_bufsz=0; //TODO: remove (just here fore test info of xk)
+			gm_MatArrayFunc_@GM_SCALAR@* marr_funcs = (gm_MatArrayFunc_@GM_SCALAR@*) this->marr_funcs;
+			gm_DenseMatFunc_@GM_SCALAR@* dsm_funcs = (gm_DenseMatFunc_@GM_SCALAR@*) this->dsm_funcs;
+			Faust::Vect<@FAUST_SCALAR_FOR_GM@,Cpu> xk_cpu(this->ncols);
+			xk_cpu.setOnes();
+			for(auto p: cpu_mat_ptrs)
+			{
+				auto mat = static_cast<MatGeneric<@FAUST_SCALAR_FOR_GM@,Cpu>*>(p);
+				if(xk_bufsz < mat->getNbCol()) // xk buf receives the result of F'*xk2
+					xk_bufsz = mat->getNbCol();
+				if(xk2_bufsz < mat->getNbRow()) // xk2 buf receives the result of F*xk
+					xk2_bufsz = mat->getNbRow();
+			}
+			auto xk = dsm_funcs->togpu_bufsz(this->ncols, 1, (@GM_SCALAR@*) reinterpret_cast<@GM_REINTERPRET_CAST_SCALAR@*>(xk_cpu.getData()), xk_bufsz, 1);
+			auto xk_norm = dsm_funcs->togpu_bufsz(this->ncols, 1, (@GM_SCALAR@*) reinterpret_cast<@GM_REINTERPRET_CAST_SCALAR@*>(xk_cpu.getData()), xk_bufsz, 1); // the CPU data is irrelevant for xk_norm but it doesn't matter
+			@FAUST_SCALAR_FOR_GM@ lambda_old = 1.0;
+			@FAUST_SCALAR_FOR_GM@ lambda = 0.0;
+			@FAUST_SCALAR_FOR_GM@ alpha = 1.0;
+			@FAUST_SCALAR_FOR_GM@ beta = 0.0;
+			gm_Op op = OP_NOTRANSP;
+//			if(transpose)
+//				if(conjugate)
+//					op = OP_CONJTRANSP;
+//				else
+//					op = OP_TRANSP;
+			@GM_SCALAR@ one;
+			set_one<@GM_SCALAR@>(&one);
+			while((Faust::fabs(lambda_old-lambda)> Faust::fabs(threshold) || Faust::fabs(lambda) <= Faust::fabs(threshold)) && i < max_iter)
+			{
+				i++;
+				lambda_old = lambda;
+				dsm_funcs->copy(xk, xk_norm);
+				dsm_funcs->normalize(xk_norm);
+				/*xk = */marr_funcs->chain_matmul_by_dsm_into(gpu_mat_arr, one, op, xk_norm, xk);
+				dsm_funcs->info(xk, &nrows, &ncols);
+				dsm_funcs->dot(xk, xk_norm, (@GM_SCALAR@*) reinterpret_cast<@GM_REINTERPRET_CAST_SCALAR@*>(&lambda));
+			}
+			flag = (i<max_iter)?i:-1;
+			return lambda;
+		}
+	template<>
+		@FAUST_SCALAR_FOR_GM@ FaustGPU<@FAUST_SCALAR_FOR_GM@>::power_iteration2(int32_t max_iter, Real<@FAUST_SCALAR_FOR_GM@> threshold, int& flag)
+		{
+			int i = 0;
+			int32_t nrows, ncols, xk_bufsz=0, xk2_bufsz=0; //TODO: remove (just here fore test info of xk)
+			gm_MatArrayFunc_@GM_SCALAR@* marr_funcs = (gm_MatArrayFunc_@GM_SCALAR@*) this->marr_funcs;
+			gm_DenseMatFunc_@GM_SCALAR@* dsm_funcs = (gm_DenseMatFunc_@GM_SCALAR@*) this->dsm_funcs;
+			//TODO: allow to work with F'F or FF' // up to this we compute only F'F
+			Faust::Vect<@FAUST_SCALAR_FOR_GM@,Cpu> xk_cpu(this->nrows);
+			Faust::Vect<@FAUST_SCALAR_FOR_GM@,Cpu> xk2_cpu(this->ncols);
+			xk_cpu.setOnes();
+			for(auto p: cpu_mat_ptrs)
+			{
+				auto mat = static_cast<MatGeneric<@FAUST_SCALAR_FOR_GM@,Cpu>*>(p);
+				if(xk_bufsz < mat->getNbCol()) // xk buf receives the result of F'*xk2
+					xk_bufsz = mat->getNbCol();
+				if(xk2_bufsz < mat->getNbRow()) // xk2 buf receives the result of F*xk
+					xk2_bufsz = mat->getNbRow();
+			}
+			auto xk = dsm_funcs->togpu_bufsz(this->ncols, 1, (@GM_SCALAR@*) reinterpret_cast<@GM_REINTERPRET_CAST_SCALAR@*>(xk_cpu.getData()), xk_bufsz, 1);
+			auto xk_norm = dsm_funcs->togpu_bufsz(this->ncols, 1, (@GM_SCALAR@*) reinterpret_cast<@GM_REINTERPRET_CAST_SCALAR@*>(xk_cpu.getData()), xk_bufsz, 1); // the CPU data is irrelevant for xk_norm but it doesn't matter
+			auto xk2 = dsm_funcs->togpu_bufsz(this->nrows, 1, (@GM_SCALAR@*) reinterpret_cast<@GM_REINTERPRET_CAST_SCALAR@*>(xk2_cpu.getData()), xk2_bufsz, 1);
+			@FAUST_SCALAR_FOR_GM@ lambda_old = 1.0;
+			@FAUST_SCALAR_FOR_GM@ lambda = 0.0;
+			@FAUST_SCALAR_FOR_GM@ alpha = 1.0;
+			@FAUST_SCALAR_FOR_GM@ beta = 0.0;
+			//TODO: consider the optimal order: FF'*xk or F'F*xk
+			gm_Op op = OP_NOTRANSP;
+			gm_Op op2 = OP_TRANSP; //TODO: complex case //verify if OP_CONJTRANSP is more costful
+//			if(transpose)
+//				if(conjugate)
+//					op = OP_CONJTRANSP;
+//				else
+//					op = OP_TRANSP;
+			@GM_SCALAR@ one;
+			set_one<@GM_SCALAR@>(&one);
+			while((Faust::fabs(lambda_old-lambda)> Faust::fabs(threshold) || Faust::fabs(lambda) <= Faust::fabs(threshold)) && i < max_iter)
+			{
+				i++;
+				lambda_old = lambda;
+				dsm_funcs->copy(xk, xk_norm);
+				dsm_funcs->normalize(xk_norm);
+				marr_funcs->chain_matmul_by_dsm_into(gpu_mat_arr, one, op, xk_norm, xk2);
+				marr_funcs->chain_matmul_by_dsm_into(gpu_mat_arr, one, op2, xk2, xk);
+				dsm_funcs->dot(xk, xk_norm, (@GM_SCALAR@*) reinterpret_cast<@GM_REINTERPRET_CAST_SCALAR@*>(&lambda));
+			}
+			flag = (i<max_iter)?i:-1;
+			return lambda;
+		}
 }