Update faust_torch module.

- Handle the mismatch between torch's row-major storage and Faust's column-major storage by computing the transposed product instead of reordering the data, which avoids useless but costly copies. - New function to display tensors. - faust_torch code documentation. - Minor change in faust_torch test.

Update faust_torch module.
73a57855 · hhakim · 4e6a1efb · 73a57855 · 73a57855 · 73a57855
Commit 73a57855 authored 5 years ago by hhakim
--- a/misc/test/src/C++/faust_torch.cpp.in
+++ b/misc/test/src/C++/faust_torch.cpp.in
@@ -32,10 +32,10 @@ vector<double> count(vector<std::chrono::duration<double>> durs)
 double calc_relerr(Tensor t, MatDense<FPP,Cpu> m)
 {
 	MatDense<FPP,Cpu> aux;
-	torch_Tensor_to_faust_MatDense(t, aux);
+	convTensorToMatDense(t, aux);
 	aux -= m;
 //	Tensor taux;
-//	faust_MatDense_to_torch_Tensor(m, taux);
+//	convMatDenseToTensor(m, taux);
 //	taux -= t;
 //	cout << "tens reldiff:" << norm(taux.flatten())/norm(t) << endl;
 	return aux.norm()/m.norm();
@@ -51,7 +51,7 @@ double calc_relerr(MatDense<FPP,Cpu> m1, MatDense<FPP,Cpu> m2)
 double tensor_norm(Tensor t)
 {
 	MatDense<FPP,Cpu> aux;
-	torch_Tensor_to_faust_MatDense(t, aux);
+	convTensorToMatDense(t, aux);
 //	cout << torch::norm(t.flatten())<< endl; //can't static_ast c10::Scalar to double
 	return aux.norm(); 
 }
@@ -79,7 +79,7 @@ double sum_count(vector<double> counts)
 int main(int argc, char** argv)
 {
-	int min_size = 100;
+	int min_size = 128;
 	int max_size = 1024;
 	unsigned int min_nfacts = 10;
 	unsigned int max_nfacts = 20;
@@ -149,10 +149,12 @@ int main(int argc, char** argv)
 		facs.push_back(*it);
 	}
 	/** test Faust to TensorList */
-	faust_matvec_to_torch_TensorList(facs, tl, at::kCPU, /*clone */ false);
+	convMatGenListToTensorList(facs, tl, at::kCPU, /*clone */ false);
+	cout << "Faust to TensorList:" << endl;
+	display_TensorList(tl);
 	t = tensor_chain_mul(tl);
 //	t = torch::chain_matmul(tl);
-	torch_Tensor_to_faust_MatDense(t, aux);
+	convTensorToMatDense(t, aux);
 	cout << "torch toarray norm: " << setprecision(15) << norm(t.flatten()) << endl;
 	cout << "faust toarray norm: " << setprecision(15) << F->normFro() << endl;
 	cout << "toarray faust-torch reldiff: " << calc_relerr(t,F->get_product()) << endl; 
@@ -165,7 +167,7 @@ int main(int argc, char** argv)
 	/** test MatDense to torch::Tensor conv. */
 	auto M = Faust::MatDense<FPP,Cpu>::randMat(F->getNbCol(), F->getNbRow());
 	cout << "norm(M): " << M->norm() << endl;
-	faust_MatDense_to_torch_Tensor(*M, taux);
+	convMatDenseToTensor(*M, taux);
 	cout << "M to tensor error: " << calc_relerr(taux, *M) << endl;
 	int i=0;
 //	for(auto ts : tl)
@@ -178,7 +180,7 @@ int main(int argc, char** argv)
 //	}
 	/** prepare product refs (MatDense and Tensor)*/
 	ref = F->multiply(*M);
-	faust_MatDense_to_torch_Tensor(ref, tref);
+	convMatDenseToTensor(ref, tref);
 	/** Measure time of nsamples F*M (pure Faust) */
 	for(int i=0;i<nsamples;i++) 
 	{
@@ -189,6 +191,7 @@ int main(int argc, char** argv)
 	}
 	errors[PURE_FAUST] = 0;
 	pnorms[PURE_FAUST] = ref.norm();
+	cout << "(1) done." << endl;
 	/** Measure time of nsamples F*M (Faust-torch) */
 	for(int i=0;i<nsamples;i++) 
 	{
@@ -197,6 +200,7 @@ int main(int argc, char** argv)
 		auto end = std::chrono::system_clock::now();
 		times[FAUST_TORCH].push_back(end - start);
 	}
+	cout << "(2) done." << endl;
 	errors[FAUST_TORCH] = calc_relerr(ref, out);
 	pnorms[FAUST_TORCH] = out.norm();
 	for(int i=0;i<nsamples;i++) 
@@ -206,6 +210,7 @@ int main(int argc, char** argv)
 		auto end = std::chrono::system_clock::now();
 		times[FAUST_TORCH_CHAIN_OPT].push_back(end - start);
 	}
+	cout << "(3) done." << endl;
 	errors[FAUST_TORCH_CHAIN_OPT] = calc_relerr(ref, out);
 	pnorms[FAUST_TORCH_CHAIN_OPT] = out.norm();
 	for(int i=0;i<nsamples;i++) 
@@ -215,11 +220,12 @@ int main(int argc, char** argv)
 		auto end = std::chrono::system_clock::now();
 		times[FAUST_TORCH_CONTIGUOUS_DENSE_TO_TORCH].push_back(end - start);
 	}
+	cout << "(4) done." << endl;
 	errors[FAUST_TORCH_CONTIGUOUS_DENSE_TO_TORCH] = calc_relerr(ref, out);
 	pnorms[FAUST_TORCH_CONTIGUOUS_DENSE_TO_TORCH] = out.norm();
 	/** Measure time of nsamples F*M (Faust-torch without accounting matrix-to-tensor conversion time) */
 	Tensor tM;
-	faust_MatDense_to_torch_Tensor(*M, tM);
+	convMatDenseToTensor(*M, tM);
 	for(int i=0;i<nsamples;i++) 
 	{
 		auto start = std::chrono::system_clock::now();
@@ -227,6 +233,7 @@ int main(int argc, char** argv)
 		auto end = std::chrono::system_clock::now();
 		times[PURE_TORCH].push_back(end - start);
 	}
+	cout << "(5) done." << endl;
 	errors[PURE_TORCH] = calc_relerr(t, ref);
 	pnorms[PURE_TORCH] = tensor_norm(t);
 	for(int i=0;i<nsamples;i++)
@@ -236,6 +243,7 @@ int main(int argc, char** argv)
 		auto end = std::chrono::system_clock::now();
 		times[PURE_TORCH_CHAIN_OPT].push_back(end - start);
 	}
+	cout << "(6) done." << endl;
 	errors[PURE_TORCH_CHAIN_OPT] = calc_relerr(t, ref);
 	pnorms[PURE_TORCH_CHAIN_OPT] = tensor_norm(t);
 	for(int i=0;i<nsamples;i++)
@@ -245,6 +253,7 @@ int main(int argc, char** argv)
 		auto end = std::chrono::system_clock::now();
 		times[PURE_TORCH_CONTIGUOUS_DENSE_TO_TORCH].push_back(end - start);
 	}
+	cout << "(7) done." << endl;
 	errors[PURE_TORCH_CONTIGUOUS_DENSE_TO_TORCH] = calc_relerr(t, ref);
 	pnorms[PURE_TORCH_CONTIGUOUS_DENSE_TO_TORCH] = tensor_norm(t);
 	for(int i=0; i < median_times.size(); i ++)
@@ -266,6 +275,6 @@ int main(int argc, char** argv)
 	cout << "stats: median time (secs) / speedup / cumutime / errVSFaust / pnorms" << endl;
 	for(int i=0;i<nmeths;i++)
 	{
-		cout << "("<<i+1<<"): " <</* count(time_pure_faust) <<*/ median_times[i] << " / " << speedups[i] << " / " << cumu_times[i] << " / " << errors[i] <<  " / " << pnorms[i] << endl;
+		cout << "("<<i+1<<"): " <</* count(time_pure_faust) <<*/ median_times[i] << " / " << setprecision(3) << speedups[i] << " / " << cumu_times[i] << " / " << errors[i] <<  " / " << pnorms[i] << endl;
 	}
 }
--- a/src/faust_linear_operator/CPU/faust_torch.h
+++ b/src/faust_linear_operator/CPU/faust_torch.h
@@ -13,12 +13,19 @@ namespace Faust
 	 *
 	 * \param dev the device at::kCPU or at::kCUDA.
 	 * \param clone true to copy the Faust matrix data to create the tensor, false to use the same data without copying (false by default).
+	 * \param transpose if true (the default), the column-major Faust buffer is reinterpreted as a row-major tensor, i.e. t is the transpose of dm, which avoids any reordering of the data; if false, t has the same shape and entries as dm.
 	 */
 	template<typename FPP, FDevice D>
-		void faust_MatDense_to_torch_Tensor(const Faust::MatDense<FPP,D> & dm, torch::Tensor & t, at::DeviceType dev = at::kCPU, const bool clone = false);
+		void convMatDenseToTensor(const Faust::MatDense<FPP,D> & dm, torch::Tensor & t, at::DeviceType dev = at::kCPU, const bool clone = false, const bool transpose = true);
+	/**
+	 * Converts a torch::Tensor (t) to Faust::MatDense (dm).
+	 *
+	 * \param transpose see convMatDenseToTensor.
+	 *
+	 */
 	template<typename FPP, FDevice D>
-		void torch_Tensor_to_faust_MatDense(const torch::Tensor & t, Faust::MatDense<FPP,D> & dm);
+		void convTensorToMatDense(const torch::Tensor & t, Faust::MatDense<FPP,D> & dm, const bool transpose = true);
 	/**
@@ -28,35 +35,54 @@ namespace Faust
 	 *
 	 *\param dev the device at::kCPU or at::kCUDA.
 	 *\param clone true to copy the Faust matrix data to create the tensor, false to use the same data without copying (false by default).
+	 *\param transpose if true (the default), t is built as the transpose of spm (row and column indices are swapped). For a MatSparse this brings no performance gain (the data has to be copied anyway), but it keeps the convention consistent with what is done for MatDense.
 	 */
 	template<typename FPP, FDevice D>
-		void faust_MatSparse_to_torch_Tensor(Faust::MatSparse<FPP,D> & spm, torch::Tensor & t, at::DeviceType dev = at::kCPU,  const bool clone = false);
+		void convMatSparseToTensor(const Faust::MatSparse<FPP,D> & spm, torch::Tensor & t, at::DeviceType dev = at::kCPU, const bool clone = false, const bool transpose = true); // same defaults as convMatDenseToTensor
 	/**
 	 * Converts a Faust::MatGeneric vector to a torch::TensorList (vector alias).
 	 *
 	 *\param dev the device at::kCPU or at::kCUDA.
 	 * \param clone true to copy the Faust matrices data to create the tensors, false to use the same data without copying (false by default).
+	 * \param transpose if true (the default), the factors of ml are converted and stored into tl in reverse order, so that the product of tl is the transpose of the product of ml (cf. tensor_chain_mul). Computing the transposed product is more efficient because it avoids reordering the column-major Faust data into torch's row-major layout.
 	 */
 	template<typename FPP, FDevice D>
-		void faust_matvec_to_torch_TensorList(const std::vector<Faust::MatGeneric<FPP,D>*> & ml, std::vector<torch::Tensor> &tl, at::DeviceType dev = at::kCPU, const bool clone = false);
+		void convMatGenListToTensorList(const std::vector<Faust::MatGeneric<FPP,D>*> & ml, std::vector<torch::Tensor> &tl, at::DeviceType dev = at::kCPU, const bool clone = false, const bool transpose = true);
 	/**
-	 * Computes the tensor chain product of ml and applies it optionally to the tensor op.
+	 * Computes the tensor chain product of tl and applies it optionally to the tensor op.
 	 *
+	 * \param tl the sequence of tensors to compute the product.
+	 * \param op if not nullptr, the function returns the product of tl and op (op is placed on the left or on the right of tl according to op_on_left).
 	 *\param dev the device at::kCPU or at::kCUDA.
+	 * \param chain_opt if true, the computation is delegated to tensor_chain_mul_opt (unless all the tensors are dense, in which case torch::chain_matmul is used directly).
+	 * \param contiguous_dense_to_torch if true, each run of consecutive dense factors is multiplied (as an intermediate product) through torch::chain_matmul(), as is always done when tl contains only dense factors. E.g.: if tl = {S1, S2, D3, D4, D5, S6}, where D denotes a dense factor and S a sparse one, D3*D4*D5 is computed in a single call to torch::chain_matmul while the remaining products are computed one by one. This option and chain_opt are mutually exclusive: if chain_opt is true, this boolean is forced to false. Note: as far as I've tested, torch::chain_matmul doesn't work with sparse Tensors (hence this option and this function).
+	 * \param op_on_left if true, op*tl is computed; otherwise tl*op is computed. It is true by default because this is the optimal scenario (the transposed product) given the column-major order of Faust::MatDense and the row-major order of torch. This default is consistent with the default value of transpose in the other functions.
+	 *
 	 * Returns the result as a Tensor.
 	 */
-	torch::Tensor tensor_chain_mul(const std::vector<torch::Tensor>& ml, const torch::Tensor* op= nullptr, at::DeviceType dev = at::kCPU, const bool chain_opt = false,  const bool contiguous_dense_to_torch = false);
+	torch::Tensor tensor_chain_mul(const std::vector<torch::Tensor>& tl, const torch::Tensor* op= nullptr, at::DeviceType dev = at::kCPU, const bool chain_opt = false,  const bool contiguous_dense_to_torch = false, const bool op_on_left = true);
-	torch::Tensor tensor_chain_mul_opt(const std::vector<torch::Tensor>& ml, const torch::Tensor* op, at::DeviceType dev = at::kCPU);
+	/**
+	 * Computes the tensor chain product of tl and applies it optionally to the tensor op.
+	 *
+	 * This function does the same as tensor_chain_mul except that it optimizes the matrix chain product choosing an order of computation that minimizes the cost.
+	 *
+	 * \param tl the sequence of tensors to compute the product.
+	 * \param op if not nullptr, the function returns the product of tl and op (op is placed on the left or on the right of tl according to op_on_left).
+	 *\param dev the device at::kCPU or at::kCUDA.
+	 * \param op_on_left if true, op*tl is computed; otherwise tl*op is computed. It is true by default because this is the optimal scenario (the transposed product) given the column-major order of Faust::MatDense and the row-major order of torch. This default is consistent with the default value of transpose in the other functions.
+	 *
+	 * Returns the result as a Tensor.
+	 */
+	torch::Tensor tensor_chain_mul_opt(const std::vector<torch::Tensor>& tl, const torch::Tensor* op, at::DeviceType dev = at::kCPU, const bool op_on_left = true);
 	/**
 	 * Computes the matrix chain product of ml and applies it optionally to the matrix op if provided.
 	 *
-	 * This function converts all the matrices to Tensors before and then computes the tensor product.
+	 * This function converts all the matrices to Tensors before and then computes the tensor product (using tensor_chain_mul(TensorList, Tensor) just above).
 	 *
-	 *\note Complex tensors are not available in libtorch, an exception is thrown when FPP is complex.
+	 * \note Complex tensors are not available in libtorch, an exception is thrown when FPP is complex.
 	 *
 	 * \param on_gpu true ot use the GPU backend, false for the CPU backend (false by default).
 	 * \param clone true to copy the Faust matrices data to create the tensors, false to use the same data without copying (false by default).
@@ -64,7 +90,7 @@ namespace Faust
 	 * Returns the result as a Faust::MatDense.
 	 */
 	template<typename FPP, FDevice D>
-		void tensor_chain_mul(const std::vector<Faust::MatGeneric<FPP,D>*>& ml, Faust::MatDense<FPP,Cpu> & out, const Faust::MatGeneric<FPP,D>* op = nullptr, const bool on_gpu = false, const bool clone = false, const bool chain_opt = false,  const bool contiguous_dense_to_torch = false);
+		void tensor_chain_mul(const std::vector<Faust::MatGeneric<FPP,D>*>& ml, Faust::MatDense<FPP,Cpu> & out, const Faust::MatGeneric<FPP,D>* op = nullptr, const bool on_gpu = false, const bool clone = false, const bool chain_opt = false,  const bool contiguous_dense_to_torch = false, const bool transpose = true);
 	/**
 	 * Computes the matrix chain product of tl and applies it optionally to the matrix op if provided.
@@ -76,11 +102,18 @@ namespace Faust
 	 * \param on_gpu true ot use the GPU backend, false for the CPU backend (false by default).
 	 * \param clone true to copy the Faust matrices data to create the tensors, false to use the same data without copying (false by default).
 	 * \param contiguous_dense_to_torch if true then torch::chain_matmul is used to computed intermediary product of dense contiguous factors. Note that if chain_opt is true, this option can't be true and is internally set to false silently.
+	 *
 	 * Returns the result as a Faust::MatDense.
 	 */
 	template<typename FPP, FDevice D>
-		void tensor_chain_mul(const std::vector<torch::Tensor>& tl, Faust::MatDense<FPP,Cpu> & out, const Faust::MatGeneric<FPP,D>* op, const bool on_gpu, const bool clone, const bool chain_opt, const bool contiguous_dense_to_torch);
+		void tensor_chain_mul(const std::vector<torch::Tensor>& tl, Faust::MatDense<FPP,Cpu> & out, const Faust::MatGeneric<FPP,D>* op, const bool on_gpu, const bool clone, const bool chain_opt, const bool contiguous_dense_to_torch, const bool transpose = true);
+	/**
+	 * Displays a Tensor list (size and storage format, DENSE or SPARSE, of each Tensor).
+	 *
+	 * \param transpose if true, tl is assumed to hold a transposed product: the tensors are listed in reverse order with their two dimensions swapped. It's true by default to be consistent with the other functions.
+	 */
+	void display_TensorList(std::vector<torch::Tensor>& tl, const bool transpose = true);
 }
 #include "faust_torch.hpp"
 #endif
--- a/src/faust_linear_operator/CPU/faust_torch.hpp
+++ b/src/faust_linear_operator/CPU/faust_torch.hpp
@@ -2,7 +2,7 @@ namespace Faust
 {
 	template<typename FPP, FDevice D>
-		void faust_MatSparse_to_torch_Tensor(const Faust::MatSparse<FPP,D> & spm, torch::Tensor & t, at::DeviceType dev, const bool clone)
+		void convMatSparseToTensor(const Faust::MatSparse<FPP,D> & spm, torch::Tensor & t, at::DeviceType dev, const bool clone, const bool transpose /* = true*/)
 		{
 			torch::Tensor values = torch::from_blob(const_cast<FPP*>(spm.getValuePtr()), {spm.getNonZeros()}, torch::TensorOptions().dtype(torch::kFloat64).device(dev));//.clone();
 			//		cout << "tensor values:" << values << endl;
@@ -28,71 +28,119 @@ namespace Faust
 			col = col.to(torch::kI64); // mandatory conversion because torch forces to use same size types for indices and values (even if indices are integers and values floats)
 			//		cout << "tensor row:" << row << endl;
 			delete [] rows;
-			torch::Tensor indices = at::stack({row, col}, /* dim */ 0);
+			torch::Tensor indices;
-			t = torch::sparse_coo_tensor(indices, values);
+			if(transpose)
+			{
+				//reverse row and col to take the matrix as a transpose mat
+				indices = at::stack({col, row}, /* dim */ 0);
+				t = torch::sparse_coo_tensor(indices, values);
+				t.sparse_resize_({spm.getNbCol(), spm.getNbRow()}, t.sparse_dim(), t.dense_dim());
+			}
+			else
+			{
+				indices = at::stack({row, col}, /* dim */ 0);
+				t = torch::sparse_coo_tensor(indices, values);
+				t.sparse_resize_({spm.getNbRow(), spm.getNbCol()}, t.sparse_dim(), t.dense_dim());
+			}
 			//		cout << "tensor size: " << t.size(0) << " x " << t.size(1) << " t is sparse:" << t.is_sparse() << endl;
+			assert(t._nnz() == spm.getNonZeros()); // every non-zero of spm must be present in t
+			assert(transpose ? (t.size(0) == spm.getNbCol() && t.size(1) == spm.getNbRow()) : (t.size(0) == spm.getNbRow() && t.size(1) == spm.getNbCol())); // the expected shape depends on transpose
 		}
 	template<typename FPP, FDevice D>
-		void faust_MatDense_to_torch_Tensor(const Faust::MatDense<FPP,D> & dm, torch::Tensor & t, at::DeviceType dev, const bool clone)
+		void convMatDenseToTensor(const Faust::MatDense<FPP,D> & dm, torch::Tensor & t, at::DeviceType dev, const bool clone, const bool transpose /* = true*/)
 		{
-			t = torch::from_blob(const_cast<FPP*>(dm.getData()), {dm.getNbCol(), dm.getNbRow()},torch::TensorOptions().dtype(torch::kFloat64).device(dev));//.clone();
+			int64_t nrows, ncols; // signed, as torch sizes are: avoids a narrowing conversion in the braced size list below
-			t = t.t();
-			if(clone)
+			// nrows and ncols are swapped on purpose: the column-major Faust buffer read in torch row-major order
+			// is exactly the transpose of dm, so the transposed tensor is obtained without touching the data
+			nrows = dm.getNbCol();
+			ncols = dm.getNbRow();
+			t = torch::from_blob(const_cast<FPP*>(dm.getData()), {nrows, ncols},torch::TensorOptions().dtype(torch::kFloat64).device(dev));//.clone();
+			if(! transpose)
+				// transpose == false: t must equal dm itself, so transpose back
+				// (Tensor::t() only returns a view with swapped strides, it does not copy the data)
+				t = t.t();
+			if(clone) // t.t() is a view on dm's buffer, so clone must be honoured whatever transpose is
 				t = t.clone();
+			// without clone, t aliases dm's buffer and must not be used after dm is destroyed or resized
 		}
 	template<typename FPP, FDevice D>
-		void torch_Tensor_to_faust_MatDense(const torch::Tensor & t, Faust::MatDense<FPP,D> & dm)
+		void convTensorToMatDense(const torch::Tensor & t, Faust::MatDense<FPP,D> & dm, const bool transpose /* = true*/)
 		{
-			dm = Faust::MatDense<FPP,Cpu>(t.data_ptr<FPP>(), t.size(1), t.size(0));
+			const torch::Tensor tc = t.contiguous(); // data_ptr() is read as a plain row-major buffer, but t may be a strided view (e.g. the output of Tensor::t())
-			dm.transpose();
+			// a row-major (size(0) x size(1)) buffer read in Faust column-major order as (size(1) x size(0)) is the transpose of t
+			dm = Faust::MatDense<FPP,Cpu>(tc.data_ptr<FPP>(), tc.size(1), tc.size(0));
+			if(! transpose)
+			{
+				// transpose == false: t holds the matrix itself, not its transpose,
+				// so undo the implicit transpose made by the layout reinterpretation above
+				dm.transpose();
+			}
+			// transpose == true: t holds the transpose of the wanted matrix, so dm is already correct
+			// NOTE(review): assumes the MatDense(ptr, nrows, ncols) constructor copies the buffer (tc is a local) -- confirm
 		}
 	template<typename FPP, FDevice D>
-		void faust_matvec_to_torch_TensorList(const std::vector<Faust::MatGeneric<FPP,D>*> & ml, std::vector<torch::Tensor> &tl, at::DeviceType dev, const bool clone)
+		void convMatGenListToTensorList(const std::vector<Faust::MatGeneric<FPP,D>*> & ml, std::vector<torch::Tensor> &tl, at::DeviceType dev, const bool clone /* = false*/, const bool transpose /* = true*/)
 		{
 			const Faust::MatSparse<FPP,D> *spm;
 			const Faust::MatDense<FPP,D> *dm;
-//			torch::Tensor t;
 			tl.resize(ml.size());
-			int i = 0;
+			int i; // index in tl of the next tensor to fill
-			for(auto m : ml)
+			if(transpose)
+			{
+				i = tl.size()-1; // transpose order == reverse order: (A*B)^T = B^T*A^T
+				for(auto m : ml)
+				{
+					if(spm = dynamic_cast<Faust::MatSparse<FPP,D>*>(m))
+						convMatSparseToTensor(*spm, tl[i--], dev, clone, transpose);
+					else if(dm = dynamic_cast<Faust::MatDense<FPP,D>*>(m))
+						convMatDenseToTensor(*dm, tl[i--], dev, clone, transpose);
+				}
+			}
+			else
 			{
-				if(spm = dynamic_cast<Faust::MatSparse<FPP,D>*>(m))
+				i = 0; // same order as ml, each factor converted as is
-					faust_MatSparse_to_torch_Tensor(*spm, tl[i++], dev, clone);
+				for(auto m : ml)
-//					faust_MatSparse_to_torch_Tensor(*spm, t, dev, clone);
+				{
-				else if(dm = dynamic_cast<Faust::MatDense<FPP,D>*>(m))
+					if(spm = dynamic_cast<Faust::MatSparse<FPP,D>*>(m))
-					faust_MatDense_to_torch_Tensor(*dm, tl[i++], dev, clone);
+						convMatSparseToTensor(*spm, tl[i++], dev, clone, transpose); // transpose must be forwarded: the callee defaults to true
-//					faust_MatDense_to_torch_Tensor(*dm, t, dev, clone);
+					else if(dm = dynamic_cast<Faust::MatDense<FPP,D>*>(m))
-//				tl.push_back(t);
+						convMatDenseToTensor(*dm, tl[i++], dev, clone, transpose); // idem
+				}
 			}
 		}
-	torch::Tensor tensor_chain_mul(const std::vector<torch::Tensor>& ml, const torch::Tensor* op, at::DeviceType dev, const bool chain_opt, const bool contiguous_dense_to_torch)
+	torch::Tensor tensor_chain_mul(const std::vector<torch::Tensor>& tl, const torch::Tensor* op, at::DeviceType dev, const bool chain_opt, const bool contiguous_dense_to_torch, const bool op_on_left /*=true*/)
 	{
 		bool all_dense = true;
-		std::vector<torch::Tensor> mlc;
+		std::vector<torch::Tensor> tlc;
-		for(auto t: ml)
+		for(auto t: tl)
 		{
 			all_dense &= !t.is_sparse();
-			mlc.push_back(t);
+			tlc.push_back(t);
 		}
 		if(op)
 		{
 			all_dense &= !op->is_sparse();
-			mlc.push_back(*op);
+			if(op_on_left)
+				tlc.insert(tlc.begin(), *op);
+			else
+				tlc.push_back(*op);
 		}
 		if(all_dense)
-			return torch::chain_matmul(mlc); //chain_opt is useless because I suppose torch does its own chain opt.
+			return torch::chain_matmul(tlc); //chain_opt is useless because I suppose torch does its own chain opt.
 		if(chain_opt)
-			return std::move(tensor_chain_mul_opt(mlc, nullptr, dev));
+			return std::move(tensor_chain_mul_opt(tlc, nullptr, dev));
-		auto it = mlc.end()-1;
+		auto it = tlc.end()-1;
 		auto res = *(it);
 		if(res.is_sparse())
 			res = res.to_dense();
 		std::vector<torch::Tensor> dense_contiguous_facts;
-		while(it != mlc.begin())
+		while(it != tlc.begin())
 		{
 			auto f = *(--it);
 			if(f.is_sparse())
@@ -112,12 +160,20 @@ namespace Faust
 				else
 					res = torch::matmul(f, res);
 		}
-		assert(res.size(0) == ml.size(0));
+		if(contiguous_dense_to_torch && dense_contiguous_facts.size() > 0)
-		assert(op == nullptr && res.size(1) == ml.size(1) || res.size(1) == op->size(1));
+		{
+			//multiply chain of dense tensors at the end/start of tlc
+			dense_contiguous_facts.push_back(res);
+			res = torch::chain_matmul(dense_contiguous_facts);
+			dense_contiguous_facts.erase(dense_contiguous_facts.begin(), dense_contiguous_facts.end());
+		}
+		// sanity checks on the shape of the result; assert is only active in debug builds (the standard macro is compiled out when NDEBUG is defined)
+		assert((op != nullptr && op_on_left && res.size(0) == op->size(0)) || ((! op_on_left || op == nullptr) && res.size(0) == tl[0].size(0)) || op == nullptr);
+		assert(((op == nullptr || op_on_left) && res.size(1) == (*(tl.end()-1)).size(1)) || (op != nullptr && res.size(1) == op->size(1)));
 		return std::move(res); //explicit move but should work auto because Tensor class overrides move operator= and ctor
 	}
-	torch::Tensor tensor_chain_mul_opt(const std::vector<torch::Tensor>& ml, const torch::Tensor* op, at::DeviceType dev)
+	torch::Tensor tensor_chain_mul_opt(const std::vector<torch::Tensor>& ml, const torch::Tensor* op, at::DeviceType dev, const bool op_on_left /* = true */)
 	{
 		// cost to apply a on b
 		auto cost = [](const torch::Tensor &a, const torch::Tensor &b)
@@ -125,7 +181,7 @@ namespace Faust
 			uint64_t a_cost = a.size(0)*a.size(1);
 			uint64_t b_cost;
 			if(b.is_sparse())
-				b_cost = b._nnz();	
+				b_cost = b._nnz();
 			else
 				b_cost = b.size(1);
 			return a_cost*b_cost;
@@ -149,7 +205,10 @@ namespace Faust
 		std::vector<const torch::Tensor*> mlc;
 		for(int i=0;i<ml.size();i++) mlc.push_back(&ml[i]);
 		if(op != nullptr)
-			mlc.push_back(op);
+			if(op_on_left)
+				mlc.insert(mlc.begin(), op);
+			else
+				mlc.push_back(op);
 		std::vector<uint64_t> costs(mlc.size()-1);
 		for(int i=0;i<costs.size();i++)
 			costs[i] = cost(*mlc[i], *mlc[i+1]);
@@ -158,7 +217,6 @@ namespace Faust
 		res_list.push_back(res);
 		while(mlc.size() > 1)
 		{
-//			for (int i = 0; i < mlc.size(); i++) cout << mlc[i] << "[" << mlc[i]->size(0) << "x" << mlc[i]->size(1) << "] "; cout << endl;
 			auto i = argmin(costs);
 			auto f1 = mlc[i];
 			auto f2 = mlc[i+1];
@@ -167,7 +225,6 @@ namespace Faust
 				res = new torch::Tensor();
 				res_list.push_back(res);
 			}
-//			cout << "argmin i:" << i << endl;
 			if(f2->is_sparse())
 				if(f1->is_sparse())
 					*res = at::_sparse_mm(*f1, f2->to_dense());
@@ -196,15 +253,15 @@ namespace Faust
 	}
 	template<typename FPP, FDevice D>
-		void tensor_chain_mul(const std::vector<Faust::MatGeneric<FPP,D>*>& ml, Faust::MatDense<FPP,Cpu> & out, const Faust::MatGeneric<FPP,D>* op, const bool on_gpu, const bool clone, const bool chain_opt, const bool contiguous_dense_to_torch)
+		void tensor_chain_mul(const std::vector<Faust::MatGeneric<FPP,D>*>& ml, Faust::MatDense<FPP,Cpu> & out, const Faust::MatGeneric<FPP,D>* op, const bool on_gpu, const bool clone, const bool chain_opt, const bool contiguous_dense_to_torch, const bool transpose /* = true */)
 		{
 			std::vector<torch::Tensor> tl;
-			faust_matvec_to_torch_TensorList(ml, tl, on_gpu?at::kCUDA:at::kCPU, clone);
+			convMatGenListToTensorList(ml, tl, on_gpu?at::kCUDA:at::kCPU, clone, transpose); // convert the Faust factors to tensors (stored in reverse order when transpose is true)
-			tensor_chain_mul(tl, out, op, on_gpu, clone, chain_opt, contiguous_dense_to_torch);
+			tensor_chain_mul(tl, out, op, on_gpu, clone, chain_opt, contiguous_dense_to_torch, transpose); // delegate to the TensorList overload, which receives op and out
 		}
 	template<typename FPP, FDevice D>
-		void tensor_chain_mul(const std::vector<torch::Tensor>& tl, Faust::MatDense<FPP,Cpu> & out, const Faust::MatGeneric<FPP,D>* op, const bool on_gpu, const bool clone, const bool chain_opt, const bool contiguous_dense_to_torch)
+		void tensor_chain_mul(const std::vector<torch::Tensor>& tl, Faust::MatDense<FPP,Cpu> & out, const Faust::MatGeneric<FPP,D>* op, const bool on_gpu, const bool clone, const bool chain_opt, const bool contiguous_dense_to_torch, const bool transpose /* = true */)
 	{
 		torch::Tensor top, tres;
 		const Faust::MatSparse<FPP,D> *spm;
@@ -212,15 +269,22 @@ namespace Faust
 		if(op)
 		{
 			if(spm = dynamic_cast<const Faust::MatSparse<FPP,D>*>(op))
-				faust_MatSparse_to_torch_Tensor(*spm, top, on_gpu?at::kCUDA:at::kCPU, clone);
+				convMatSparseToTensor(*spm, top, on_gpu?at::kCUDA:at::kCPU, clone, transpose);
 			else if(dm = dynamic_cast<const Faust::MatDense<FPP,D>*>(op))
-				faust_MatDense_to_torch_Tensor(*dm, top, on_gpu?at::kCUDA:at::kCPU, clone);
+				convMatDenseToTensor(*dm, top, on_gpu?at::kCUDA:at::kCPU, clone, transpose);
-			tres = tensor_chain_mul(tl, &top, on_gpu?at::kCUDA:at::kCPU, chain_opt, contiguous_dense_to_torch);
+			tres = tensor_chain_mul(tl, &top, on_gpu?at::kCUDA:at::kCPU, chain_opt, contiguous_dense_to_torch, transpose /* op_on_left if transpose */);
 		}
 		else
-			tres = tensor_chain_mul(tl, static_cast<torch::Tensor*>(nullptr), on_gpu?at::kCUDA:at::kCPU, chain_opt, contiguous_dense_to_torch);
+			tres = tensor_chain_mul(tl, static_cast<torch::Tensor*>(nullptr), on_gpu?at::kCUDA:at::kCPU, chain_opt, contiguous_dense_to_torch, transpose /* op_on_left if transpose */);
-		out = Faust::MatDense<FPP,Cpu>(tres.data_ptr<FPP>(), tres.size(1), tres.size(0));
+		convTensorToMatDense(tres, out, transpose);
-		out.transpose();
+	}
+	inline void display_TensorList(std::vector<torch::Tensor>& tl, const bool transpose /*= true*/) // inline: non-template function defined in a header
+	{
+		// Prints one line per tensor: its rank in the displayed list, its dimensions and its storage format.
+		// When transpose is true, tl holds a transposed product: list it in reverse order with rows/cols swapped.
+		const std::size_t n = tl.size(); // n-1-k below never wraps: the loop body does not run when n == 0
+		for (std::size_t k = 0; k < n; k++) { const torch::Tensor& f = transpose ? tl[n-1-k] : tl[k];
+			std::cout << "Tensor: " << k << " [" << f.size(transpose?1:0) << "x" << f.size(transpose?0:1) << "] " << (f.is_sparse()?"SPARSE":"DENSE"); std::cout << std::endl; }
+	}
 }