From 7eaa05a3cbf26f67f622059a66cd9f11c6d15a4a Mon Sep 17 00:00:00 2001 From: Matthias Messner <matthias.messner@inria.fr> Date: Tue, 3 Jul 2012 15:03:16 -0700 Subject: [PATCH] some optimizations in pACA and logging of timings in M2L kernel --- Src/Kernels/Chebyshev/FChebSymKernel.hpp | 46 +++++++++++++++++++- Src/Kernels/Chebyshev/FChebSymM2LHandler.hpp | 21 +++++---- 2 files changed, 58 insertions(+), 9 deletions(-) diff --git a/Src/Kernels/Chebyshev/FChebSymKernel.hpp b/Src/Kernels/Chebyshev/FChebSymKernel.hpp index 5cfaa116d..d8c07780b 100644 --- a/Src/Kernels/Chebyshev/FChebSymKernel.hpp +++ b/Src/Kernels/Chebyshev/FChebSymKernel.hpp @@ -13,9 +13,11 @@ class FTreeCoordinate; -/// for verbosity only!!! +// for verbosity only!!! //#define COUNT_BLOCKED_INTERACTIONS +// if timings should be logged +//#define LOG_TIMINGS /** * @author Matthias Messner(matthias.messner@inria.fr) @@ -79,6 +81,10 @@ class FChebSymKernel } +#ifdef LOG_TIMINGS + FTic time; + FReal t_m2l_1, t_m2l_2, t_m2l_3; +#endif public: /** @@ -95,6 +101,12 @@ public: Loc(NULL), Mul(NULL), countExp(NULL) { this->allocateMemoryForPermutedExpansions(); + +#ifdef LOG_TIMINGS + t_m2l_1 = FReal(0.); + t_m2l_2 = FReal(0.); + t_m2l_3 = FReal(0.); +#endif } @@ -120,6 +132,13 @@ public: if (Loc!=NULL) delete [] Loc; if (Mul!=NULL) delete [] Mul; if (countExp!=NULL) delete [] countExp; + +#ifdef LOG_TIMINGS + std::cout << "- Permutation took " << t_m2l_1 << "s" + << "\n- GEMMT and GEMM took " << t_m2l_2 << "s" + << "\n- Unpermutation took " << t_m2l_3 << "s" + << std::endl; +#endif } @@ -161,6 +180,10 @@ public: const int NumSourceCells, const int TreeLevel) { +#ifdef LOG_TIMINGS + time.tic(); +#endif + // permute and copy multipole expansion memset(countExp, 0, sizeof(int) * 343); for (unsigned int idx=0; idx<343; ++idx) { @@ -190,6 +213,10 @@ public: } } +#ifdef LOG_TIMINGS + t_m2l_1 += time.tacAndElapsed(); +#endif + #ifdef COUNT_BLOCKED_INTERACTIONS //////////////////////////////////// unsigned int count_lidx = 0; @@ -207,6 +234,10 @@ public: #endif /////////////////////////////////////////////////////////////// +#ifdef LOG_TIMINGS + time.tic(); +#endif + // multiply (mat-mat-mul) FReal Compressed [nnodes * 24]; const FReal scale = AbstractBaseClass::MatrixKernel->getScaleFactor(AbstractBaseClass::BoxWidth, TreeLevel); @@ -225,6 +256,14 @@ public: } } +#ifdef LOG_TIMINGS + t_m2l_2 += time.tacAndElapsed(); +#endif + +#ifdef LOG_TIMINGS + time.tic(); +#endif + // permute and add contribution to local expansions FReal *const LocalExpansion = TargetCell->getLocal(); memset(countExp, 0, sizeof(int) * 343); @@ -254,6 +293,11 @@ public: } } +#ifdef LOG_TIMINGS + t_m2l_3 += time.tacAndElapsed(); +#endif + + } diff --git a/Src/Kernels/Chebyshev/FChebSymM2LHandler.hpp b/Src/Kernels/Chebyshev/FChebSymM2LHandler.hpp index cd7635a4e..bdda4f068 100644 --- a/Src/Kernels/Chebyshev/FChebSymM2LHandler.hpp +++ b/Src/Kernels/Chebyshev/FChebSymM2LHandler.hpp @@ -158,6 +158,7 @@ void pACA(const ComputerType& Computer, // initialize rank k and UV' k = 0; + const FReal eps2 = eps * eps; const int maxk = (nx + ny) / 2; U = new FReal[nx * maxk]; V = new FReal[ny * maxk]; @@ -186,12 +187,14 @@ void pACA(const ComputerType& Computer, // find max of residual and argmax FReal maxval = 0.; - for (unsigned int j=0; j<ny; ++j) - if (c[j] && maxval < FMath::Abs(colV[j])) { - maxval = FMath::Abs(colV[j]); + for (unsigned int j=0; j<ny; ++j) { + const FReal abs_val = FMath::Abs(colV[j]); + if (c[j] && maxval < abs_val) { + maxval = abs_val; J = j; } - // scale pivot v + } + // find pivot and scale column of V const FReal pivot = FReal(1.) / colV[J]; FBlas::scal(ny, pivot, colV); @@ -207,11 +210,13 @@ void pACA(const ComputerType& Computer, // find max of residual and argmax maxval = 0.; - for (unsigned int i=0; i<nx; ++i) - if (r[i] && maxval < FMath::Abs(colU[i])) { - maxval = FMath::Abs(colU[i]); + for (unsigned int i=0; i<nx; ++i) { + const FReal abs_val = FMath::Abs(colU[i]); + if (r[i] && maxval < abs_val) { + maxval = abs_val; I = i; } + } //////////////////////////////////////////// // increment Frobenius norm: |Sk|^2 += |uk|^2 |vk|^2 + 2 sumj ukuj vjvk @@ -225,7 +230,7 @@ void pACA(const ComputerType& Computer, // increment low-rank k++; - } while (norm2uv > eps*eps * norm2S); + } while (norm2uv > eps2 * norm2S); delete [] r; delete [] c; -- GitLab