Optimize cuMatArray mul by scalar for GPU2 -- smallest factor copied (issue #159).

Update to gpu_mod@8a7a155f

Optimize cuMatArray mul by scalar for GPU2 -- smallest factor copied (issue #159).
f6e5b689 · hhakim · d521ca10 · 8a7a155f · 7920c9af · 8a7a155f
Commit f6e5b689 authored 4 years ago by hhakim
--- a/gpu_mod @ 8a7a155f
+++ b/gpu_mod @ 8a7a155f
-Subproject commit 7920c9afc73247d37e3a14dfd4dd458814ba9f01
+Subproject commit 8a7a155f54d37605b381b8586b994dc4ceea45e9
--- a/src/faust_linear_operator/GPU2/faust_TransformHelper_gpu.hpp
+++ b/src/faust_linear_operator/GPU2/faust_TransformHelper_gpu.hpp
@@ -16,13 +16,25 @@ namespace Faust
 		TransformHelper<FPP,GPU2>::TransformHelper(const std::vector<MatGeneric<FPP,GPU2> *>& facts, const FPP lambda_/*= (FPP)1.0*/, const bool optimizedCopy/*=false*/, const bool cloning_fact /*= true*/, const bool internal_call/*=false*/) : TransformHelper<FPP,GPU2>()
 	{
-		//if lambda is not 1.0 the first factor will be multiplied and so it needs to be copied to preserved the original that could be used elsewhere
+		//if lambda is not 1.0 a factor will be multiplied and so it needs to be copied to preserve the original that could be used elsewhere
-		this->push_back(facts[0], false, cloning_fact || lambda_ != (FPP) 1.0);
+		// as an optimization, the factor with the smallest getNBytes() is the one copied
-		for(int i=1; i < facts.size(); i++)
+		int min_size_id = 0;
+		if(lambda_ != FPP(1.0))
 		{
-			this->push_back(facts[i], false, cloning_fact);
+			std::vector<int> fact_ids(facts.size());
+			int i = -1;
+			std::generate(fact_ids.begin(), fact_ids.end(), [&i](){return ++i;});
+			std::vector<int>::iterator result = std::min_element(fact_ids.begin(), fact_ids.end(), [&facts](const int &a, const int &b){return facts[a]->getNBytes() < facts[b]->getNBytes();});
+			min_size_id = std::distance(fact_ids.begin(), result);
 		}
-		this->transform->multiply(lambda_);
+		for(int i=0; i < facts.size(); i++)
+		{
+			if(i == min_size_id)
+				this->push_back(facts[min_size_id], false, cloning_fact || lambda_ != (FPP) 1.0);
+			else
+				this->push_back(facts[i], false, cloning_fact);
+		}
+		this->transform->multiply(lambda_, min_size_id);
 	}
 	template<typename FPP>

--- a/src/faust_linear_operator/GPU2/faust_Transform_gpu.cpp.in
+++ b/src/faust_linear_operator/GPU2/faust_Transform_gpu.cpp.in
@@ -371,10 +371,10 @@ namespace Faust
 		}
 	template<>
-		void Transform<@FAUST_SCALAR_FOR_GM@,GPU2>::multiply(const @FAUST_SCALAR_FOR_GM@& a)
+		void Transform<@FAUST_SCALAR_FOR_GM@,GPU2>::multiply(const @FAUST_SCALAR_FOR_GM@& a, const int32_t id/*=-1*/)
 		{
 			auto marr_funcs = GPUModHandler::get_singleton()->marr_funcs((@FAUST_SCALAR_FOR_GM@)(0));
-			marr_funcs->scalar_mul(this->gpu_mat_arr, reinterpret_cast<const @GM_SCALAR@*>(&a));
+			marr_funcs->scalar_mul_id(this->gpu_mat_arr, reinterpret_cast<const @GM_SCALAR@*>(&a), id);
 		}
 	template<>

--- a/src/faust_linear_operator/GPU2/faust_Transform_gpu.h
+++ b/src/faust_linear_operator/GPU2/faust_Transform_gpu.h
@@ -65,7 +65,7 @@ namespace Faust
 			MatDense<FPP,GPU2> multiply(const MatDense<FPP,GPU2> &A, const char opThis);
 			void multiply(const Transform<FPP,GPU2> & A);
 			void multiplyLeft(const Transform<FPP,GPU2> & A);
-			void multiply(const FPP& a);
+			void multiply(const FPP& a, const int32_t id=-1);
 			Vect<FPP,GPU2> multiply(const Vect<FPP,GPU2>& x, const char opThis='N');
 			Real<FPP> spectralNorm(int32_t nb_iter_max, float threshold, int& flag);
 			Real<FPP> normL1(const bool transpose = false) const;