From 7eaa05a3cbf26f67f622059a66cd9f11c6d15a4a Mon Sep 17 00:00:00 2001
From: Matthias Messner <matthias.messner@inria.fr>
Date: Tue, 3 Jul 2012 15:03:16 -0700
Subject: [PATCH] some optimizations in pACA and logging of timings in M2L
 kernel

---
 Src/Kernels/Chebyshev/FChebSymKernel.hpp     | 46 +++++++++++++++++++-
 Src/Kernels/Chebyshev/FChebSymM2LHandler.hpp | 21 +++++----
 2 files changed, 58 insertions(+), 9 deletions(-)

diff --git a/Src/Kernels/Chebyshev/FChebSymKernel.hpp b/Src/Kernels/Chebyshev/FChebSymKernel.hpp
index 5cfaa116d..d8c07780b 100644
--- a/Src/Kernels/Chebyshev/FChebSymKernel.hpp
+++ b/Src/Kernels/Chebyshev/FChebSymKernel.hpp
@@ -13,9 +13,11 @@
 class FTreeCoordinate;
 
 
-/// for verbosity only!!!
+// for verbosity only!!!
 //#define COUNT_BLOCKED_INTERACTIONS
 
+// if timings should be logged
+//#define LOG_TIMINGS
 
 /**
  * @author Matthias Messner(matthias.messner@inria.fr)
@@ -79,6 +81,10 @@ class FChebSymKernel
 	}
 	
 
+#ifdef LOG_TIMINGS
+	FTic time;
+	FReal t_m2l_1, t_m2l_2, t_m2l_3;
+#endif
 
 public:
 	/**
@@ -95,6 +101,12 @@ public:
 			Loc(NULL), Mul(NULL), countExp(NULL)
 	{
 		this->allocateMemoryForPermutedExpansions();
+
+#ifdef LOG_TIMINGS
+		t_m2l_1 = FReal(0.);
+		t_m2l_2 = FReal(0.);
+		t_m2l_3 = FReal(0.);
+#endif
 	}
 	
 	
@@ -120,6 +132,13 @@ public:
 		if (Loc!=NULL)      delete [] Loc;
 		if (Mul!=NULL)      delete [] Mul;
 		if (countExp!=NULL) delete [] countExp;
+
+#ifdef LOG_TIMINGS
+		std::cout << "- Permutation took " << t_m2l_1 << "s"
+							<< "\n- GEMMT and GEMM took " << t_m2l_2 << "s"
+							<< "\n- Unpermutation took " << t_m2l_3 << "s"
+							<< std::endl;
+#endif
 	}
 
 
@@ -161,6 +180,10 @@ public:
 					 const int NumSourceCells,
 					 const int TreeLevel)
 	{
+#ifdef LOG_TIMINGS
+		time.tic();
+#endif
+
 		// permute and copy multipole expansion
 		memset(countExp, 0, sizeof(int) * 343);
 		for (unsigned int idx=0; idx<343; ++idx) {
@@ -190,6 +213,10 @@ public:
 			}
 		}
 
+#ifdef LOG_TIMINGS
+		t_m2l_1 += time.tacAndElapsed();
+#endif
+
 
 #ifdef COUNT_BLOCKED_INTERACTIONS ////////////////////////////////////
 		unsigned int count_lidx = 0;
@@ -207,6 +234,10 @@ public:
 #endif ///////////////////////////////////////////////////////////////
 
 
+#ifdef LOG_TIMINGS
+		time.tic();
+#endif
+
 		// multiply (mat-mat-mul)
 		FReal Compressed [nnodes * 24];
 		const FReal scale = AbstractBaseClass::MatrixKernel->getScaleFactor(AbstractBaseClass::BoxWidth, TreeLevel);
@@ -225,6 +256,14 @@ public:
 			}
 		}
 
+#ifdef LOG_TIMINGS
+		t_m2l_2 += time.tacAndElapsed();
+#endif
+
+#ifdef LOG_TIMINGS
+		time.tic();
+#endif
+
 		// permute and add contribution to local expansions
 		FReal *const LocalExpansion = TargetCell->getLocal();
 		memset(countExp, 0, sizeof(int) * 343);
@@ -254,6 +293,11 @@ public:
 			}
 		}
 
+#ifdef LOG_TIMINGS
+		t_m2l_3 += time.tacAndElapsed();
+#endif
+
+
 	}
 
 
diff --git a/Src/Kernels/Chebyshev/FChebSymM2LHandler.hpp b/Src/Kernels/Chebyshev/FChebSymM2LHandler.hpp
index cd7635a4e..bdda4f068 100644
--- a/Src/Kernels/Chebyshev/FChebSymM2LHandler.hpp
+++ b/Src/Kernels/Chebyshev/FChebSymM2LHandler.hpp
@@ -158,6 +158,7 @@ void pACA(const ComputerType& Computer,
 	
 	// initialize rank k and UV'
 	k = 0;
+	const FReal eps2 = eps * eps;
 	const int maxk = (nx + ny) / 2;
 	U = new FReal[nx * maxk];
 	V = new FReal[ny * maxk];
@@ -186,12 +187,14 @@ void pACA(const ComputerType& Computer,
 		
 		// find max of residual and argmax
 		FReal maxval = 0.;
-		for (unsigned int j=0; j<ny; ++j)
-			if (c[j] && maxval < FMath::Abs(colV[j])) {
-				maxval = FMath::Abs(colV[j]);
+		for (unsigned int j=0; j<ny; ++j) {
+			const FReal abs_val = FMath::Abs(colV[j]);
+			if (c[j] && maxval < abs_val) {
+				maxval = abs_val;
 				J = j;
 			}
-		// scale pivot v
+		}
+		// find pivot and scale column of V
 		const FReal pivot = FReal(1.) / colV[J];
 		FBlas::scal(ny, pivot, colV);
 		
@@ -207,11 +210,13 @@ void pACA(const ComputerType& Computer,
 		
 		// find max of residual and argmax
 		maxval = 0.;
-		for (unsigned int i=0; i<nx; ++i)
-			if (r[i] && maxval < FMath::Abs(colU[i])) {
-				maxval = FMath::Abs(colU[i]);
+		for (unsigned int i=0; i<nx; ++i) {
+			const FReal abs_val = FMath::Abs(colU[i]);
+			if (r[i] && maxval < abs_val) {
+				maxval = abs_val;
 				I = i;
 			}
+		}
 		
 		////////////////////////////////////////////
 			// increment Frobenius norm: |Sk|^2 += |uk|^2 |vk|^2 + 2 sumj ukuj vjvk
@@ -225,7 +230,7 @@ void pACA(const ComputerType& Computer,
 		// increment low-rank
 		k++;
 
-	} while (norm2uv > eps*eps * norm2S);
+	} while (norm2uv > eps2 * norm2S);
 	
 	delete [] r;
 	delete [] c;
-- 
GitLab