Commit c477df16 authored by hhakim
Implement GPU2 spgemm prototypes as on CPU and use them to implement gemm_gen for GPU2 + update to gpu_mod@0f2c75e4.
parent 3ed7504a
Subproject commit 48c6b03a5b5e17b69ca21e4bc6c300f53e5e10f5
Subproject commit 0f2c75e43083e6bacc0fae9e1ce5ce23873b75ed
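For reference, the new GPU2 free-function prototypes mirror the existing CPU one; condensed from the hunks below:

// CPU (existing):
template<typename FPP>
void spgemm(const MatSparse<FPP,Cpu> & A, const MatDense<FPP,Cpu> & B, MatDense<FPP,Cpu> & C, const FPP & alpha, const FPP & beta, char typeA, char typeB);

// GPU2 (added by this commit):
template<typename FPP>
void spgemm(const MatSparse<FPP,GPU2> & A, const MatDense<FPP,GPU2> & B, MatDense<FPP,GPU2> & C, const FPP & alpha, const FPP & beta, char opA, char opB);

template<typename FPP>
void spgemm(const MatDense<FPP,GPU2> & A, const MatSparse<FPP,GPU2> & B, MatDense<FPP,GPU2> & C, const FPP & alpha, const FPP & beta, char opA, char opB, int impl_meth = 1);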
......@@ -262,9 +262,9 @@ template<typename FPP>
void Faust::spgemm(const Faust::MatSparse<FPP,Cpu> & A,const Faust::MatDense<FPP,Cpu> & B, Faust::MatDense<FPP,Cpu> & C,const FPP & alpha, const FPP & beta, char typeA, char typeB)
{
//TODO: refactoring should be done to avoid repeating similar block of code for different cases (typeA,typeB,alpha,beta)
//#ifdef __COMPILE_TIMERS__
// A.t_gemm.start();
//#endif
faust_unsigned_int nbRowOpA,nbRowOpB,nbColOpA,nbColOpB;
if (((&(C.mat)) == (&(B.mat))))
......@@ -309,7 +309,7 @@ void Faust::spgemm(const Faust::MatSparse<FPP,Cpu> & A,const Faust::MatDense<FPP
handleError("linear_algebra", "Faust::spgemm : invalid dimension for output matrix C");
}
C.resize(nbRowOpA,nbColOpB);
......@@ -453,9 +453,9 @@ void Faust::spgemm(const Faust::MatSparse<FPP,Cpu> & A,const Faust::MatDense<FPP
}
C.isZeros = false;
C.set_id(false);
//#ifdef __COMPILE_TIMERS__
//A.t_gemm.stop();
//#endif
}
template<typename FPP>
......@@ -905,6 +905,7 @@ namespace Faust
{
template<typename FPP> void gemm_gen(const MatGeneric<FPP, Cpu>& A, const MatGeneric<FPP, Cpu>& B, MatDense<FPP, Cpu>& out, const FPP alpha/*=FPP(1.0)*/, const FPP beta/*=(0.0)*/, const char opA/*='N'*/, const char opB/*='N'*/)
{
//TODO: refactor this function with at least 3 new functions gemm_spA, gemm_dsA, gemm_bsrA
std::runtime_error type_err("faust_linear_algebra mul function doesn't handle other type of factors than MatDense, MatSparse or MatBSR.");
if(opA != 'N' && opA != 'T' && opA != 'H')
throw std::runtime_error("opA must be among 'N', 'T', 'H'");
......
......@@ -485,6 +485,22 @@ namespace Faust
mat.gpu_mat = gpu_dmat;
}
template<>
void MatSparse<FSFG, GPU2>::spgemm(const MatSparse<FSFG,GPU2> & A, const MatDense<FSFG,GPU2> & B, MatDense<FSFG,GPU2> & C, const FSFG & alpha, const FSFG & beta, char opA, char opB)
{
gm_Op gop_A;
gm_Op gop_B;
char2gm_Op(opA, gop_A);
char2gm_Op(opB, gop_B);
auto spm_funcs = GPUModHandler::get_singleton()->spm_funcs((@FAUST_SCALAR_FOR_GM@)(0));
auto dsm_funcs = GPUModHandler::get_singleton()->dsm_funcs((@FAUST_SCALAR_FOR_GM@)(0));
spm_funcs->gemm(A.gpu_mat, B.gpu_mat, C.gpu_mat,
(const @GM_SCALAR@*) reinterpret_cast<const @GM_REINTERPRET_CAST_SCALAR@*>(&alpha),
(const @GM_SCALAR@*) reinterpret_cast<const @GM_REINTERPRET_CAST_SCALAR@*>(&beta),
gop_A, gop_B);
}
template<>
size_t Faust::MatSparse<FSFG,GPU2>::getNBytes() const
{
......
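The @FAUST_SCALAR_FOR_GM@, @GM_SCALAR@ and @GM_REINTERPRET_CAST_SCALAR@ tokens in the spgemm specialization above are placeholders substituted at build time (this file is presumably a template configured by the build system, e.g. CMake's configure_file). Assuming all three expand to double for the real double-precision instantiation, the wrapped gpu_mod call would read roughly as follows (a sketch, not the generated source):

auto spm_funcs = GPUModHandler::get_singleton()->spm_funcs((double)(0));
spm_funcs->gemm(A.gpu_mat, B.gpu_mat, C.gpu_mat,
    (const double*) reinterpret_cast<const double*>(&alpha), // alpha forwarded to the gpu_mod backend
    (const double*) reinterpret_cast<const double*>(&beta),  // beta forwarded to the gpu_mod backend
    gop_A, gop_B);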
......@@ -101,6 +101,7 @@ namespace Faust
MatType getType() const;
void multiply(Vect<FPP,GPU2>& vec, char opThis='N') const;
void multiply(MatDense<FPP,GPU2>& vec, char opThis='N') const;
static void spgemm(const MatSparse<FPP,GPU2> & A, const MatDense<FPP,GPU2> & B, MatDense<FPP,GPU2> & C, const FPP & alpha, const FPP & beta, const char opA, const char opB);
MatBSR<FPP, GPU2> to_bsr(int bsize) const;
~MatSparse();
......
......@@ -12,20 +12,22 @@ namespace Faust
template<typename FPP>
void gemm(const MatDense<FPP,GPU2> & A,const MatDense<FPP,GPU2> & B, MatDense<FPP,GPU2> & C,const FPP alpha, const FPP beta, char opA, char opB);
// Computes alpha*opA(A)*b+ beta*C into C.
template<typename FPP>
void gemv(const MatDense<FPP, GPU2> &A, const Vect<FPP, GPU2> &b, Vect<FPP, GPU2> &C, const FPP& alpha, const FPP& beta, const char opA);
// Computes alpha*opA(A)*opB(B)+ beta*C into C.
template<typename FPP>
void gemm_gen(const MatGeneric<FPP,GPU2> & A, const MatGeneric<FPP,GPU2> & B, MatDense<FPP,GPU2> & C, const FPP alpha, const FPP beta, char opA, char opB);
// TODO: implement using MatSparse::multiply; warning: 'H' is not supported for opB (see gpu_mod / https://docs.nvidia.com/cuda/archive/9.2/cusparse/index.html, cusparseTcsrmm2, for more details), so do a copy-conjugate manually beforehand
// template<typename FPP>
// void spgemm(const MatSparse<FPP,Cpu> & A,const MatDense<FPP,Cpu> & B, MatDense<FPP,Cpu> & C,const FPP & alpha, const FPP & beta, char opA, char opB);
//
// template<typename FPP>
// void spgemm(const MatDense<FPP,Cpu> & A,const MatSparse<FPP,Cpu> & B, MatDense<FPP,Cpu> & C,const FPP & alpha, const FPP & beta, char opA, char opB);
// Computes alpha*opA(A)*opB(B)+ beta*C into C.
template<typename FPP>
void spgemm(const MatSparse<FPP,GPU2> & A,const MatDense<FPP,GPU2> & B, MatDense<FPP,GPU2> & C,const FPP & alpha, const FPP & beta, char opA, char opB);
// Computes alpha*opA(A)*opB(B)+ beta*C into C.
// \param impl_meth: in any case this function relies on the previous spgemm prototype; if impl_meth is 1, transpose/transconjugate operations are used to avoid converting A and B to another matrix type; otherwise (impl_meth is any other value) A is converted to a MatSparse and B to a MatDense.
template<typename FPP>
void spgemm(const MatDense<FPP,GPU2> & A,const MatSparse<FPP,GPU2> & B, MatDense<FPP,GPU2> & C,const FPP & alpha, const FPP & beta, char opA, char opB, int impl_meth = 1);
}
#include "faust_linear_algebra_gpu.hpp"
......
......@@ -13,6 +13,95 @@ namespace Faust
MatDense<FPP, GPU2>::gemv(A, b, c, alpha, beta);
}
template<typename FPP>
void spgemm(const MatSparse<FPP,GPU2> & A,const MatDense<FPP,GPU2> & B, MatDense<FPP,GPU2> & C,const FPP & alpha, const FPP & beta, char opA, char opB)
{
MatSparse<FPP, GPU2>::spgemm(A, B, C, alpha, beta, opA, opB);
}
template<typename FPP>
void spgemm(const MatDense<FPP,GPU2> & A, const MatSparse<FPP,GPU2> & B, MatDense<FPP,GPU2> & C, const FPP & alpha, const FPP & beta, char opA, char opB, int impl_meth/* = 1*/)
{
//TODO: benchmark the two methods (impl_meth == 1 and 2)
if (impl_meth == 1)
{
// transpose / adjoint the product to rely on the other spgemm signature (the MatSparse B becomes the lhs matrix, i.e. A)
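// Identities used by the branches below, for the 'N'/'T'/'H' combinations:
//   op(A)*op(B) = (op(B)^T * op(A)^T)^T   and   op(A)*op(B) = (op(B)^H * op(A)^H)^H
// so the product is evaluated with the sparse operand on the left through the
// spgemm(MatSparse, MatDense, ...) prototype, then transposed/adjointed back into C
// (the mixed 'T'/'H' cases first conjugate one operand to fall back on these identities).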
char nopA, nopB;
MatDense<FPP, GPU2> nA(A);
MatSparse<FPP, GPU2> nB(B);
if(opA == 'N' && opB == 'N')
{
nopA = 'T';
nopB = 'T';
C.resize(nB.getNbCol(), nA.getNbRow());
spgemm(nB, nA, C, alpha, beta, nopB, nopA);
C.transpose();
}
else if(opA == 'N' && opB == 'T')
{
nopA = 'T';
C.resize(nB.getNbRow(), nA.getNbRow());
spgemm(nB, nA, C, alpha, beta, opB, nopA);
C.transpose();
}
else if(opA == 'T' && opB == 'N')
{
nopB = 'T';
C.resize(nB.getNbCol(), nA.getNbCol());
spgemm(nB, nA, C, alpha, beta, nopB, opA);
C.transpose();
}
else if(opA == 'T' && opB == 'T')
{
C.resize(nB.getNbRow(), nA.getNbCol());
spgemm(nB, nA, C, alpha, beta, opB, opA);
C.transpose();
}
else if(opA == 'N' && opB == 'H')
{
nopA = 'H';
C.resize(nB.getNbRow(), nA.getNbRow());
spgemm(nB, nA, C, alpha, beta, opB, nopA);
C.adjoint();
}
else if(opA == 'H' && opB == 'N')
{
nopB = 'H';
C.resize(nB.getNbCol(), nA.getNbCol());
spgemm(nB, nA, C, alpha, beta, nopB, opA);
C.adjoint();
}
else if(opA == 'H' && opB == 'H')
{
C.resize(nB.getNbRow(), nA.getNbCol());
spgemm(nB, nA, C, alpha, beta, opB, opA);
C.adjoint();
}
else if(opA == 'H' && opB == 'T')
{
nopA = 'N';
nB.conjugate();
nopB = 'N';
C.resize(nB.getNbRow(), nA.getNbCol());
spgemm(nB, nA, C, alpha, beta, nopB, nopA);
C.adjoint();
}
else if(opA == 'T' && opB == 'H')
{
nA.conjugate();
nopA = 'N';
nopB = 'N';
C.resize(nB.getNbRow(), nA.getNbCol());
spgemm(nB, nA, C, alpha, beta, nopB, nopA);
C.adjoint();
}
}
else {
spgemm(MatSparse<FPP, GPU2>(A), MatDense<FPP, GPU2>(B), C, alpha, beta, opA, opB);
}
}
template<typename FPP>
void gemm_gen(const MatGeneric<FPP,GPU2> & A, const MatGeneric<FPP,GPU2> & B, MatDense<FPP,GPU2> & C, const FPP alpha, const FPP beta, char typeA, char typeB)
{
......@@ -20,7 +109,7 @@ namespace Faust
const MatSparse<FPP, GPU2>* spB;
const MatDense<FPP, GPU2>* dsA;
const MatDense<FPP, GPU2>* dsB;
// downcast and call the proper function
spA = dynamic_cast<const Faust::MatSparse<FPP,GPU2>*>(&A);
if(! spA)
dsA = dynamic_cast<const Faust::MatDense<FPP,GPU2>*>(&A);
......@@ -30,11 +119,9 @@ namespace Faust
if(spA && spB)
throw std::runtime_error("gemm on two MatSparse is not supported.");
else if(spA)
spgemm(*spA, *dsB, C, alpha, beta, typeA, typeB);
else if(spB)
spgemm(*dsA, *spB, C, alpha, beta, typeA, typeB);
else
gemm(*dsA, *dsB, C, alpha, beta, typeA, typeB);
}
......
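To illustrate the effect of the gemm_gen change, a minimal usage sketch; it is hypothetical in that it assumes the GPU2 backend (gpu_mod) is loaded and that the GPU2 matrices can be built from CPU counterparts A_cpu/B_cpu, which is not shown in this diff:

// assuming the enclosing code is in (or uses) namespace Faust
MatSparse<double, GPU2> A_gpu(A_cpu); // assumed: construction from a CPU MatSparse<double, Cpu>
MatDense<double, GPU2> B_gpu(B_cpu);  // assumed: construction from a CPU MatDense<double, Cpu>
MatDense<double, GPU2> C_gpu;
// A_gpu is sparse, B_gpu is dense: gemm_gen now dispatches to spgemm(*spA, *dsB, C, ...)
// instead of throwing "spgemm is not supported yet on GPU2."
gemm_gen(A_gpu, B_gpu, C_gpu, 2.0, 0.0, 'N', 'T'); // C = 2*A*B^T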