// faust_butterfly.hpp
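//
// Hierarchical butterfly factorization of dense matrices into sparse butterfly
// factors, built on fixed-support two-layer factorizations (rank-one "lifting"
// and the DTO solver). Part of the FAuST toolbox (namespace Faust).
//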
#include <Eigen/SVD>
#include "faust_MatDense.h"
#include "faust_TransformHelper.h"
#include <vector>
#include "faust_openmp.h"
#ifdef OMP_ENABLED
#include "omp.h"
#endif
#include <cmath>
#include <limits>
#include <algorithm>  // std::min
#include <cassert>    // assert
#include <complex>    // std::complex
#include <cstring>    // memcpy
#include <functional> // std::function

namespace Faust
{
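	// Debug helpers: print equivalence classes of indices (as built by retrieveCEC) to stdout.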
	inline void print_classes(std::vector<std::vector<faust_unsigned_int>>& classes)
	{
		for(auto c: classes)
		{
			std::cout << "{";
			for(auto k: c)
			{
				std::cout << k << ", ";
			}
			std::cout << "}" << std::endl;
		}
	}

	inline void print_classes(std::vector<std::vector<faust_unsigned_int>*>& classes)
	{
		for(auto c: classes)
		{
			std::cout << "{";
			for(auto k: *c)
			{
				std::cout << k << ", ";
			}
			std::cout << "}" << " ";
		}
		std::cout << std::endl;
	}
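
	// Group the r "middle" indices (columns of s1 / rows of s2) into equivalence
	// classes: two indices belong to the same class when the corresponding columns
	// of s1 and rows of s2 coincide up to eps. A class whose representative i
	// satisfies min(|s1.sum_row(i)|, |s1.sum_col(i)|) <= class size is pushed into
	// cec ("complete" equivalence classes), the others into noncec. The class
	// vectors are heap-allocated; the caller owns them.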
	template<typename FPP>
		void retrieveCEC(const Faust::MatDense<FPP, Cpu>& s1, const Faust::MatDense<FPP, Cpu>& s2, std::vector<std::vector<faust_unsigned_int>*> &cec, std::vector<std::vector<faust_unsigned_int>*> &noncec)
		{
			//TODO: this function should be deleted because lifting_two_layers_factorization is faster
			faust_unsigned_int r = s1.getNbCol(); // == s2.getNbRow()
			//	Vect<FPP, Cpu> c1(r), c2(r), r1(r), r2(r);
			std::vector<bool> processed_ids(r, false);
			auto eps = 1e-6;
#ifdef OMP_ENABLED
			int nthreads = 8;
#else
			int nthreads = 1;
#endif
			auto th_class = new std::vector<faust_unsigned_int>[nthreads];
			for(int i=0;i<r;i++)
			{
				if(! processed_ids[i])
				{
					// create a new equiv. class
					auto class_ = new std::vector<faust_unsigned_int>();
					class_->push_back(i);
#pragma omp parallel for num_threads(nthreads)
					for(int j=i+1;j<r;j++)
					{
						if(! processed_ids[j])
						{
							if(s1.eq_cols(s2, i, j, eps) && s1.eq_rows(s2, i, j, eps))
							{
#ifdef OMP_ENABLED
								th_class[omp_get_thread_num()].push_back(j);
#else
								th_class[0].push_back(j);
#endif
								processed_ids[j] = true;
							}
						}
					}
					for(int t=0;t<nthreads;t++)
					{
						for(auto k: th_class[t])
							class_->push_back(k);
						th_class[t].clear();
					}
					// decide whether the class is complete (cec) or not (noncec)
					if(std::min(Faust::fabs(s1.sum_row(i)), Faust::fabs(s1.sum_col(i))) <= class_->size())
					{
						cec.push_back(class_);
					}
					else
					{
						noncec.push_back(class_);
					}
					processed_ids[i] = true;
				}
			}
			delete[] th_class;
		}

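	// Two-layer "lifting" factorization: given A and the supports s1, s2 of the two
	// sought factors, compute dense X and Y such that X (resp. Y) is supported on s1
	// (resp. s2) and X*Y approximates A. For each middle index t, the submatrix of A
	// restricted to the nonzero rows of column t of s1 and the nonzero columns of
	// row t of s2 is replaced by its best rank-one approximation U*V (via SVD); U is
	// written into column t of X and V into row t of Y.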
	template<typename FPP>
		void lifting_two_layers_factorization(const Faust::MatDense<FPP, Cpu>& A, const Faust::MatSparse<FPP, Cpu>& s1, const Faust::MatSparse<FPP, Cpu>& s2, Faust::MatDense<FPP, Cpu>& X, Faust::MatDense<FPP, Cpu>& Y)
		{
			// TODO: this function should be deleted because the lifting_two_layers_factorization(MatGeneric, ...) overload is faster
			int r = s1.getNbCol();
			Faust::MatDense<FPP, Cpu> subA, u, v;
			X.resize(s1.getNbRow(), s1.getNbCol());
			Y.resize(s2.getNbRow(), s2.getNbCol());
			X.setZeros();
			Y.setZeros();
			std::vector<MatDense<FPP, Cpu>> U(r), V(r);
			std::vector<std::vector<int>> rows(r), cols(r);
			#pragma omp parallel for schedule(dynamic) private(subA)
			for(int t=0;t<r;t++)
			{
				rows[t] = s1.col_nonzero_inds(t);
				cols[t] = s2.row_nonzero_inds(t);
				A.submatrix(rows[t], cols[t], subA);
				//#define BUTTERFLY_APPROX_RANK1
#ifdef BUTTERFLY_APPROX_RANK1
				subA.approx_rank1(U[t], V[t]);
#else
				if(A.getNbRow() >= (1 << 14) && (std::is_same<FPP, double>::value || std::is_same<FPP, std::complex<double>>::value))
				{
					// use Jacobi SVD as a workaround for this Eigen bug: https://gitlab.com/libeigen/eigen/-/issues/1723
					// TODO: remove once the bug is fixed
					Eigen::JacobiSVD<Eigen::Matrix<FPP, Eigen::Dynamic, Eigen::Dynamic>> svd;
					subA.initJacobiSVD(svd);
					subA.best_low_rank(1, U[t], V[t], svd);
				}
				else
					subA.best_low_rank(1, U[t], V[t]);
#endif
			}
			#pragma omp parallel for schedule(dynamic)
			for(int t=0;t<r;t++)
			{
				X.set_col_coeffs(t, rows[t], U[t], 0);
				Y.set_row_coeffs(t, cols[t], V[t], 0);
			}
		}

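	// Same lifting factorization as above but with dense supports s1, s2 (the Jacobi
	// SVD workaround for large matrices is not applied in this overload).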
	template<typename FPP>
		void lifting_two_layers_factorization(const Faust::MatDense<FPP, Cpu>& A, const Faust::MatDense<FPP, Cpu>& s1, const Faust::MatDense<FPP, Cpu>& s2, Faust::MatDense<FPP, Cpu>& X, Faust::MatDense<FPP, Cpu>& Y)
		{
			// TODO: this function should be deleted because the lifting_two_layers_factorization(MatGeneric, ...) overload is faster
			int r = s1.getNbCol();
			Faust::MatDense<FPP, Cpu> subA, u, v;
			X.resize(s1.getNbRow(), s1.getNbCol());
			Y.resize(s2.getNbRow(), s2.getNbCol());
			X.setZeros();
			Y.setZeros();
			std::vector<MatDense<FPP, Cpu>> U(r), V(r);
			std::vector<std::vector<int>> rows(r), cols(r);
			#pragma omp parallel for schedule(dynamic) private(subA)
			for(int t=0;t<r;t++)
			{
				rows[t] = s1.col_nonzero_inds(t);
				cols[t] = s2.row_nonzero_inds(t);
				A.submatrix(rows[t], cols[t], subA);
				//#define BUTTERFLY_APPROX_RANK1
#ifdef BUTTERFLY_APPROX_RANK1
				subA.approx_rank1(U[t], V[t]);
#else
				subA.best_low_rank(1, U[t], V[t]);
#endif
			}
			#pragma omp parallel for schedule(dynamic)
			for(int t=0;t<r;t++)
			{
				X.set_col_coeffs(t, rows[t], U[t], 0);
				Y.set_row_coeffs(t, cols[t], V[t], 0);
			}
		}

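	// Preferred (faster) lifting overload: A may be a MatDense or a MatSparse (a
	// std::runtime_error is thrown otherwise), and X, Y are assembled directly as
	// MatSparse from Eigen triplets instead of dense buffers.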
	template<typename FPP>
		void lifting_two_layers_factorization(const Faust::MatGeneric<FPP, Cpu>& A, const Faust::MatSparse<FPP, Cpu>& s1, const Faust::MatSparse<FPP, Cpu>& s2, Faust::MatSparse<FPP, Cpu>& X, Faust::MatSparse<FPP, Cpu>& Y)
		{
			const Faust::MatDense<FPP, Cpu>* dsA = dynamic_cast<const Faust::MatDense<FPP, Cpu>*>(&A);
			const Faust::MatSparse<FPP, Cpu>* spA;
			if(dsA == nullptr)
			{
				spA = dynamic_cast<const Faust::MatSparse<FPP, Cpu>*>(&A);
				if(spA == nullptr)
					throw std::runtime_error("lifting_two_layers_factorization A must be MatDense or MatSparse.");
			}
			int r = s1.getNbCol();
			MatDense<FPP, Cpu> subA;
			X.resize(s1.getNbRow(), s1.getNbCol());
			Y.resize(s2.getNbRow(), s2.getNbCol());
			X.setZeros();
			Y.setZeros();

			std::vector<MatDense<FPP, Cpu>> U(r), V(r);
			std::vector<std::vector<int>> rows(r), cols(r);
			#pragma omp parallel for schedule(dynamic) private(subA)
			for(int t=0;t<r;t++)
			{
				rows[t] = s1.col_nonzero_inds(t);
				cols[t] = s2.row_nonzero_inds(t);
				if(dsA)
					dsA->submatrix(rows[t], cols[t], subA);
				else
					spA->submatrix(rows[t], cols[t], subA);
				//#define BUTTERFLY_APPROX_RANK1
#ifdef BUTTERFLY_APPROX_RANK1
				subA.approx_rank1(U[t], V[t]);
#else
				if(A.getNbRow() >= (1 << 14) && (std::is_same<FPP, double>::value || std::is_same<FPP, std::complex<double>>::value))
				{
					// use Jacobi SVD as a workaround for this Eigen bug: https://gitlab.com/libeigen/eigen/-/issues/1723
					// TODO: remove once the bug is fixed
					Eigen::JacobiSVD<Eigen::Matrix<FPP, Eigen::Dynamic, Eigen::Dynamic>> svd;
					subA.initJacobiSVD(svd);
					subA.best_low_rank(1, U[t], V[t], svd);
				}
				else
					subA.best_low_rank(1, U[t], V[t]);
#endif
			}
			std::vector<Eigen::Triplet<FPP>> XtripletList;
			std::vector<Eigen::Triplet<FPP>> YtripletList;
			for(int t=0;t<r;t++)
			{
				for(int i=0;i<rows[t].size();i++)
						XtripletList.push_back(Eigen::Triplet<FPP>(rows[t][i], t, U[t].getData()[i]));
				for(int i=0;i<cols[t].size();i++)
						YtripletList.push_back(Eigen::Triplet<FPP>(t, cols[t][i], V[t](0,i)));
			}
			X = MatSparse<FPP, Cpu>(XtripletList, s1.getNbRow(), s1.getNbCol());
			Y = MatSparse<FPP, Cpu>(YtripletList, s2.getNbRow(), s2.getNbCol());
		}

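	// Two-layer factorization with the DTO approach: the middle indices are first
	// grouped into equivalence classes by retrieveCEC; complete classes (cec) are
	// handled in closed form by writing identity columns and submatrix rows into X
	// and Y, while the remaining classes (noncec) are handled through a best
	// rank-|class| approximation of the corresponding submatrix of A.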
	template<typename FPP>
		void solveDTO(const Faust::MatDense<FPP, Cpu>& A, const Faust::MatDense<FPP, Cpu>& s1, const Faust::MatDense<FPP, Cpu>& s2, Faust::MatDense<FPP, Cpu>& X, Faust::MatDense<FPP, Cpu>& Y)
		{
			std::vector<std::vector<faust_unsigned_int>*> cec, noncec;
			Faust::MatDense<FPP, Cpu> submat, bestx, besty;
			X.resize(s1.getNbRow(), s1.getNbCol());
			Y.resize(s2.getNbRow(), s2.getNbCol());
			X.setZeros();
			Y.setZeros();
			retrieveCEC(s1, s2, cec, noncec);
			for(auto ce: cec)
			{
//				Faust::MatDense<FPP, Cpu> submat;
				auto r = (*ce)[0];
				auto RP = s1.col_nonzero_inds(r);
				auto CP = s2.row_nonzero_inds(r);
				if(RP.size() == 0 || CP.size() == 0)
					continue;
				//TODO: maybe some refactoring is possible between the for loops
				if(ce->size() == RP.size() || ce->size() == CP.size())
				{
					noncec.push_back(ce);
					continue;
				}
				A.submatrix(RP, CP, submat);
				if(ce->size() >= RP.size())
				{
					MatDense<FPP, Cpu> eye(RP.size(), ce->size());
					eye.setEyes();
					for(int i=0;i< ce->size();i++)
					{
						auto col_id = (*ce)[i];
						X.set_col_coeffs(col_id, RP, eye, i);
						if(i < RP.size())
						{
							auto row_id = (*ce)[i];
							Y.set_row_coeffs(row_id, CP, submat, i);
						}
					}
				}
				else
				{
					MatDense<FPP, Cpu> eye(ce->size(), CP.size());
					eye.setEyes();
					for(int i=0;i< ce->size();i++)
					{

						if(i < CP.size())
						{
							auto col_id = (*ce)[i];
							X.set_col_coeffs(col_id, RP, eye, i);
						}
						auto row_id = (*ce)[i];
						Y.set_row_coeffs(row_id, CP, submat, i);
					}

				}
			}
//			#pragma omp parallel for // Tested but not efficient because the solveDTO calls are already parallelized
//			for(int i=0;i<noncec.size();i++)
//			{
//				Faust::MatDense<FPP, Cpu> submat, bestx, besty;
//				auto ce = noncec[i];
			for(auto ce: noncec)
			{
				auto r = (*ce)[0];
				auto RP = s1.col_nonzero_inds(r);
				auto CP = s2.row_nonzero_inds(r);
				A.submatrix(RP, CP, submat);
#ifdef BUTTERFLY_APPROX_RANK1
				submat.approx_rank1(bestx, besty);
#else
				submat.best_low_rank(ce->size(), bestx, besty);
#endif
				for(int i=0;i< ce->size();i++)
				{

					auto col_id = (*ce)[i];
					X.set_col_coeffs(col_id, RP, bestx, i);
					auto row_id = (*ce)[i];
					Y.set_row_coeffs(row_id, CP, besty, i);
				}
			}
		}

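	// Elementary butterfly support of even order `size`: out = kron(ones(2,2), I_{size/2}).
	// For instance, size == 4 yields the 4x4 pattern
	//   1 0 1 0
	//   0 1 0 1
	//   1 0 1 0
	//   0 1 0 1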
	template<typename FPP>
		void butterfly_support(int size, Faust::MatDense<FPP, Cpu> & out)
		{
			assert(size % 2 == 0);
			Faust::MatDense<FPP, Cpu> ones(2, 2);
			auto s = size / 2;
			Faust::MatDense<FPP, Cpu> id(s, s);
			ones.setOnes();
			id.setEyes();
			Faust::kron(ones, id, out);
		}

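	// Butterfly supports of a 2^nfactors x 2^nfactors matrix: the i-th support
	// (i = 0..nfactors-1) is kron(I_{2^i}, kron(ones(2,2), I_{2^(nfactors-i-1)})).
	// If P is nullptr, a bit-reversal permutation factor is appended for complex
	// scalar types; otherwise P itself is appended as the last factor.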
	template<typename FPP>
		std::vector<Faust::MatSparse<FPP, Cpu>*> butterfly_support(int nfactors, Faust::MatSparse<FPP, Cpu>* P/*=nullptr*/)
		{
			// WARNING: the caller is responsible for freeing the returned MatSparse objects
			std::vector<Faust::MatSparse<FPP, Cpu>*> out;
			auto size = 1 << nfactors;
			Faust::MatDense<FPP, Cpu> kernel, id;
			Faust::MatDense<FPP, Cpu> support;
			Faust::MatSparse<FPP, Cpu> * sp_support;
			for(int i=0;i < nfactors; i++)
			{
				butterfly_support(1 << (nfactors-i), kernel);
				id.resize(1 << i, 1 << i);
				id.setEyes();
				Faust::kron(id, kernel, support);
				sp_support = new Faust::MatSparse<FPP, Cpu>(support);
				out.push_back(sp_support);
			}
			if(P == nullptr)
			{
				if(! std::is_same<FPP, Real<FPP>>::value)
					bit_reversal_factor(nfactors, out);
			}
			else
				out.push_back(P);
			return out;
		}

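	// bit_reversal_factor builds the bit-reversal permutation factor appended to the
	// butterfly supports; it is only needed for complex scalar types (e.g. for the
	// DFT), hence the real specializations below are no-ops.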
	// TODO: specializations should be in .cpp file
	template<>
		void bit_reversal_factor<double>(int nfactors, Faust::MatSparse<double, Cpu>*& out)
		{
			//nothing to do for double matrices
		}

	template<>
		void bit_reversal_factor<float>(int nfactors, Faust::MatSparse<float, Cpu>*& out)
		{
			//nothing to do for float matrices
		}

	template<>
		void bit_reversal_factor<std::complex<double>>(int nfactors, Faust::MatSparse<std::complex<double>, Cpu>*& out)
		{
			// bit reversal permutation factor
			unsigned int dim_size = 1u << nfactors, L, r, L_over_2, L_times_2;
			unsigned int* index = new unsigned int[dim_size];
			unsigned int* new_index = new unsigned int[dim_size];
			for(unsigned int i = 0; i < dim_size; i++)
				index[i] = i;
			memcpy(new_index, index, sizeof(unsigned int)*dim_size);
			bit_rev_permu(nfactors, new_index, false);
			std::vector<std::complex<Real<double>>> ones(dim_size);
			for(typename std::vector<std::complex<double>>::iterator it=ones.begin(); it != ones.end(); it++)
				*it = std::complex<double>(1.0);
			out = new MatSparse<std::complex<double>,Cpu>(index, new_index, ones, dim_size, dim_size);
		}

	template<>
		void bit_reversal_factor<double>(int nfactors, std::vector<Faust::MatSparse<double, Cpu>*> &out)
		{
			//nothing to do for double matrices
		}

	template<>
		void bit_reversal_factor<float>(int nfactors, std::vector<Faust::MatSparse<float, Cpu>*> &out)
		{
			//nothing to do for float matrices
		}

	template<>
		void bit_reversal_factor<std::complex<double>>(int nfactors, std::vector<Faust::MatSparse<std::complex<double>, Cpu>*>& out)
		{
			MatSparse<std::complex<double>,Cpu> *P;
			bit_reversal_factor(nfactors, P);
			out.push_back(P);
		}

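	// Unbalanced hierarchical factorization: butterfly factors are peeled off the
	// residual one at a time, left to right when dir == RIGHT and right to left when
	// dir == LEFT. s2_vec[i] holds the (binarized) support of the product of the
	// factors still to be extracted, and each step calls solveDTO on the current
	// residual; the remaining residual is pushed as the last (or first) factor.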
	template<typename FPP>
		Faust::TransformHelper<FPP, Cpu>* butterfly_hierarchical(const Faust::MatDense<FPP, Cpu>& A, const std::vector<Faust::MatSparse<FPP, Cpu>*> &supports, const ButterflyFactDir& dir/*=RIGHT*/)
		{
			using ID_FUNC = std::function<int(int)>;
			auto th = new TransformHelper<FPP, Cpu>();
			int i, j;
			Faust::MatDense<FPP, Cpu> s2;
			std::vector<Faust::MatDense<FPP, Cpu>> s2_vec(supports.size());
			Faust::MatDense<FPP, Cpu> X, Y;
			Faust::MatDense<FPP, Cpu> mat = A;
			assert(dir == RIGHT || dir == LEFT);
			ID_FUNC next_s1_id_left = [&supports](int i){if(i > 1) return i-1; else return -1;};
			ID_FUNC next_s1_id_right = [&supports](int i){if(i < supports.size()-2) return i+1; else return -1;};
			ID_FUNC next_s1_id = dir == RIGHT?next_s1_id_right:next_s1_id_left;
			ID_FUNC next_s2_id_left = [&supports, &i](int j){if(j<i-1) return j+1; else return -1;};
			ID_FUNC next_s2_id_right = [&supports, &i](int j){if(j>i+1) return j-1; else return -1;};
			ID_FUNC next_s2_id = RIGHT==dir?next_s2_id_right:next_s2_id_left;

			s2.resize(A.getNbRow(), A.getNbCol());
			s2.setEyes();
			i = dir == RIGHT?0:supports.size()-1;
			j = dir == RIGHT?supports.size()-1:0;
			do
			{
				supports[j]->multiply(s2, 'N');
				s2.setNZtoOne();
				s2_vec[dir == RIGHT?j-1:j+1] = s2;
			}
			while((j = next_s2_id(j)) > -1);
			do
			{
				auto s1 = Faust::MatDense<FPP, Cpu>(*supports[i]);
				if(dir == RIGHT)
				{
					solveDTO(mat, s1, s2_vec[i], X, Y);
					th->push_back(&X);
				}
				else
				{
					solveDTO(mat, s2_vec[i], s1, Y, X);
					th->push_first(&X);
				}
				mat = Y;
				std::cout << "factorization #" << (dir==RIGHT?i+1:supports.size()-i) << std::endl;
			}
			while((i = next_s1_id(i)) > -1);
			if(dir == RIGHT)
				th->push_back(&mat);
			else
				th->push_first(&mat);
			return th;
		}

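	// Balanced hierarchical factorization (binary factorization tree) using the
	// lifting two-layer solver: a tree of supports is built whose leaves are the
	// input supports and whose internal nodes are the products of their children,
	// then A is factorized level by level, each node being split into its two
	// children. Sketch of the tree for 4 supports S0..S3:
	//   level 0: supp(S0*S1), supp(S2*S3)   ->  A  ~ F0*F1
	//   level 1: S0, S1, S2, S3             ->  F0 ~ X0*X1, F1 ~ X2*X3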
	template<typename FPP>
		Faust::TransformHelper<FPP, Cpu>* butterfly_hierarchical_balanced(const Faust::MatDense<FPP, Cpu>& A, const std::vector<Faust::MatSparse<FPP, Cpu>*> &supports, const FactMeth meth/*=LIFTING*/)
		{
			//TODO: refactor with other prototypes
			if(meth != LIFTING)
				throw std::runtime_error("Only FactMeth::LIFTING is supported here, use the other prototype for DTO.");
			auto th = new TransformHelper<FPP, Cpu>();
			double l2_nfacts = std::log2((double)supports.size());
			size_t d = ceil(l2_nfacts); // depth of factorization tree
			/** initialize the support tree structure (one vector of supports per level of the tree) **/
			std::vector<std::vector<MatSparse<FPP, Cpu>>> tree_supports(d);
			// set supports for each tree level
			size_t i=0;
			while(i<d-1)
			{
				tree_supports[i] = std::vector<MatSparse<FPP, Cpu>>(1 << (i+1));
//				std::cout << "tree_supports["<<i<<"].size():" << tree_supports[i].size() << std::endl;
				i++;
			}
			// the number of leaves of the tree is not necessarily a power of two (remaining factorization(s))
			size_t n = supports.size();
			if(d > 1)
				tree_supports[d-1] = std::vector<MatSparse<FPP, Cpu>>(2*(n - tree_supports[d-2].size()));
			else // factorization in two factors only
				tree_supports[d-1] = std::vector<MatSparse<FPP, Cpu>>(2);
//			std::cout << "tree_supports[" << d-1 << "].size():" << tree_supports[d-1].size() << std::endl;
			/** initialize the supports in the tree, starting from the leaves, which are the supports passed as arguments **/
			for(int i=d-1;i>=0;i--)
			{
				#pragma omp parallel for schedule(dynamic)
				for(int j = 0;j < tree_supports[i].size(); j++)
				{
					if(i == d - 1)
					{
						// tree_supports[i][j] is a leaf on the last level
//						std::cout << "tree_supports[" << i << "," << j << "] received support[" << j << "]" << std::endl;
						tree_supports[i][j] = *supports[j];
					}
					else if (2*(j+1) > tree_supports[i+1].size())
					{
						// tree_supports[i][j] is a leaf
//						std::cout << "tree_supports[" << i << "," << j << "] received support[" << j+tree_supports[i+1].size()/2 << "]" << std::endl;
						tree_supports[i][j] = *supports[j+tree_supports[i+1].size()/2];
					}
					else
					{
						// tree_supports[i][j] is not a leaf, so it is computed from its child nodes:
						// tree_supports[i][j] = tree_supports[i+1][2*j] * tree_supports[i+1][2*j+1]
//						std::cout << "tree_supports[" << i << "," << j << "] received tree_supports[" << i+1 << "," << 2*j << "]*tree_supports[" << i+1 << "," << 2*j+1 << "]" << std::endl;
						MatDense<FPP, Cpu> s;
						gemm_gen(tree_supports[i+1][2*j], tree_supports[i+1][2*j+1], s, FPP(1.0), FPP(0), 'N', 'N');
						tree_supports[i][j] = s;
					}

				}
			}
			// the supports are now set for all factorization tree nodes
			// factorize according to the tree of supports, level by level
			std::vector<MatSparse<FPP, Cpu>> next_lvl_facs;
			std::vector<MatSparse<FPP, Cpu>> input_facs;
			// first factorization
			next_lvl_facs.resize(2);
			lifting_two_layers_factorization(A, tree_supports[0][0], tree_supports[0][1], next_lvl_facs[0], next_lvl_facs[1]);
#ifdef BUTTERFLY_BALANCED_CHECK_ERRORS
			std::cout << "computing error... lvl1" << std::endl;
			MatDense<FPP, Cpu> L = next_lvl_facs[0];
			MatDense<FPP, Cpu> R = next_lvl_facs[1];
			L.multiplyRight(R);
			L -= A;
			std::cout << "err:" << L.norm()/A.norm() << std::endl;
#endif
			input_facs = next_lvl_facs;

			for(size_t i=0;i<d-1;i++)
			{
				next_lvl_facs.resize(tree_supports[i+1].size());
				int j_bound = std::min(tree_supports[i].size(), tree_supports[i+1].size()/2);
				#pragma omp parallel for schedule(static)
				for(int j=0;j < j_bound;j++)
					lifting_two_layers_factorization(input_facs[j], tree_supports[i+1][2*j], tree_supports[i+1][2*j+1], next_lvl_facs[j*2], next_lvl_facs[j*2+1]);
#ifdef BUTTERFLY_BALANCED_CHECK_ERRORS
				for(int j=0;j < j_bound;j++)
				{
					std::cout << "computing error... lvl" << (i+2) << " factorization of index" << j << " factor" << std::endl;
					MatDense<FPP, Cpu> A = input_facs[j];
					MatDense<FPP, Cpu> L = next_lvl_facs[j*2];
					MatDense<FPP, Cpu> R = next_lvl_facs[j*2+1];
					L.multiplyRight(R);
					L -= A;
					std::cout << "err:" << L.norm()/A.norm() << std::endl;
				}
#endif

				if(i < d - 2)
				{
					input_facs = next_lvl_facs;
				}
			}
			// build the resulting Faust
			for(auto f: next_lvl_facs)
			{
				th->push_back(new MatSparse<FPP, Cpu>(f), false, false);
			}
			if(next_lvl_facs.size() < supports.size())
				for(int i=next_lvl_facs.size()/2;i<input_facs.size();i++)
				{
					th->push_back(new MatSparse<FPP, Cpu>(input_facs[i]), false, false);
				}
			return th;
		}


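	// Dense counterpart of butterfly_hierarchical_balanced (supports and intermediate
	// factors are MatDense) which also accepts FactMeth::DTO; kept for reference only
	// (see the TODO below about its inefficiency).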
	template<typename FPP>
		Faust::TransformHelper<FPP, Cpu>* butterfly_hierarchical_balanced_dense(const Faust::MatDense<FPP, Cpu>& A, const std::vector<Faust::MatSparse<FPP, Cpu>*> &supports, const FactMeth meth/*=LIFTING*/)
		{
			//TODO: this function should be deleted because it has proven very inefficient relative to its MatSparse counterpart
			auto th = new TransformHelper<FPP, Cpu>();
			double l2_nfacts = std::log2((double)supports.size());
			size_t d = ceil(l2_nfacts); // depth of factorization tree
			/** initialize the support tree structure (one vector of supports per level of the tree) **/
			std::vector<std::vector<MatDense<FPP, Cpu>>> tree_supports(d);
			// set supports for each tree level
			size_t i=0;
			while(i<d-1)
			{
				tree_supports[i] = std::vector<MatDense<FPP, Cpu>>(1 << (i+1));
//				std::cout << "tree_supports["<<i<<"].size():" << tree_supports[i].size() << std::endl;
				i++;
			}
			// the number of leaves of the tree is not necessarily a power of two (remaining factorization(s))
			size_t n = supports.size();
			if(d > 1)
				tree_supports[d-1] = std::vector<MatDense<FPP, Cpu>>(2*(n - tree_supports[d-2].size()));
			else // factorization in two factors only
				tree_supports[d-1] = std::vector<MatDense<FPP, Cpu>>(2);
//			std::cout << "tree_supports[" << d-1 << "].size():" << tree_supports[d-1].size() << std::endl;
			/** initialize the supports in the tree, starting from the leaves, which are the supports passed as arguments **/
			for(int i=d-1;i>=0;i--)
			{
				#pragma omp parallel for schedule(dynamic)
				for(int j = 0;j < tree_supports[i].size(); j++)
				{
					if(i == d - 1)
					{
						// tree_supports[i][j] is a leaf on the last level
//						std::cout << "tree_supports[" << i << "," << j << "] received support[" << j << "]" << std::endl;
						tree_supports[i][j] = *supports[j];
					}
					else if (2*(j+1) > tree_supports[i+1].size())
					{
						// tree_supports[i][j] is a leaf
//						std::cout << "tree_supports[" << i << "," << j << "] received support[" << j+tree_supports[i+1].size()/2 << "]" << std::endl;
						tree_supports[i][j] = *supports[j+tree_supports[i+1].size()/2];
					}
					else
					{
						// tree_supports[i][j] is not a leaf, so it is computed from its child nodes:
						// tree_supports[i][j] = tree_supports[i+1][2*j] * tree_supports[i+1][2*j+1]
//						std::cout << "tree_supports[" << i << "," << j << "] received tree_supports[" << i+1 << "," << 2*j << "]*tree_supports[" << i+1 << "," << 2*j+1 << "]" << std::endl;
						gemm(tree_supports[i+1][2*j], tree_supports[i+1][2*j+1], tree_supports[i][j], FPP(1.0), FPP(0), 'N', 'N');
					}

				}
			}
			// the supports are now set for all factorization tree nodes
			// factorize according to the tree of supports, level by level
			std::vector<MatDense<FPP, Cpu>> next_lvl_facs;
			std::vector<MatDense<FPP, Cpu>> input_facs;
			// first factorization
			next_lvl_facs.resize(2);
			if(meth == DTO)
				solveDTO(A, tree_supports[0][0], tree_supports[0][1], next_lvl_facs[0], next_lvl_facs[1]);
			else // meth == LIFTING
				lifting_two_layers_factorization(A, tree_supports[0][0], tree_supports[0][1], next_lvl_facs[0], next_lvl_facs[1]);
			input_facs = next_lvl_facs;

			for(size_t i=0;i<d-1;i++)
			{
				next_lvl_facs.resize(tree_supports[i+1].size());
				int j_bound = std::min(tree_supports[i].size(), tree_supports[i+1].size()/2);
				#pragma omp parallel for schedule(static)
				for(int j=0;j < j_bound;j++)
				{
					if(meth == DTO)
						solveDTO(input_facs[j], tree_supports[i+1][2*j], tree_supports[i+1][2*j+1], next_lvl_facs[j*2], next_lvl_facs[j*2+1]);
					else // meth == LIFTING
						lifting_two_layers_factorization(input_facs[j], tree_supports[i+1][2*j], tree_supports[i+1][2*j+1], next_lvl_facs[j*2], next_lvl_facs[j*2+1]);
				}
				if(i < d - 2)
				{
					input_facs = next_lvl_facs;
				}
			}
			// build the resulting Faust
			for(auto f: next_lvl_facs)
			{
				th->push_back(new MatSparse<FPP, Cpu>(f), false, false);
			}
			if(next_lvl_facs.size() < supports.size())
				for(int i=next_lvl_facs.size()/2;i<input_facs.size();i++)
				{
					th->push_back(new MatSparse<FPP, Cpu>(input_facs[i]), false, false);
				}
			return th;
		}

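	// Entry point: factorize a square 2^n x 2^n matrix into n butterfly factors (plus
	// a trailing bit-reversal permutation for complex scalars). The returned
	// TransformHelper is heap-allocated and must be deleted by the caller.
	//
	// Minimal usage sketch (hypothetical caller code, not part of this file):
	//
	//   Faust::MatDense<double, Cpu> H = /* some 2^n x 2^n matrix */;
	//   auto F = Faust::butterfly_hierarchical(H, Faust::ButterflyFactDir::BALANCED);
	//   // ... use F ...
	//   delete F;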
	template<typename FPP>
		Faust::TransformHelper<FPP, Cpu>* butterfly_hierarchical(const Faust::MatDense<FPP, Cpu>& A, const ButterflyFactDir &dir/*=RIGHT*/)
		{
			double log2_size = log2((double)A.getNbRow());
			if(A.getNbRow() != A.getNbCol())
				throw std::runtime_error("The matrix to factorize must be square.");
			if(log2_size - int(log2_size) > std::numeric_limits<Real<FPP>>::epsilon())
				throw std::runtime_error("The matrix to factorize must be of a size equal to a power of two.");
			auto support = butterfly_support<FPP>((int) log2_size);
			//	std::cout << "support norms" << std::endl;
			//	for(auto s: support)
			//		std::cout << s->norm() << std::endl;
			TransformHelper<FPP, Cpu>* th = nullptr;
			if(dir == BALANCED)
				th = butterfly_hierarchical_balanced(A, support);
			else
				th = butterfly_hierarchical(A, support, dir);
			for(auto s: support)
				delete s;
			return th;
		}

} // namespace Faust