Commit 0abf67bc authored by MIJIEUX Thomas's avatar MIJIEUX Thomas

Instrument orthogonalization to measure flops

parent ebe4b169
......@@ -33,6 +33,7 @@ template<class> class Block;
#include "fabulous/kernel/blas.hpp"
#include "fabulous/kernel/svd.hpp"
#include "fabulous/kernel/flops.hpp"
namespace fabulous {
......@@ -42,6 +43,8 @@ namespace fabulous {
template<class S>
class Block
{
private:
mutable int64_t _last_flops;
protected:
int _m;
int _n;
......@@ -80,7 +83,8 @@ public:
}
Block(int m, int n, int ld, S *ptr):
_m(m), _n(n), _ld(ld), _ptr(ptr)
_last_flops{0},
_m{m}, _n{n}, _ld{ld}, _ptr{ptr}
{
assert( ld >= m );
assert( ld > 0 );
......@@ -93,9 +97,10 @@ public:
}
Block(int m, int n, int ld, S* ptr, SPtr sptr):
_m(m), _n(n), _ld(ld),
_sptr(sptr),
_ptr(ptr)
_last_flops{0},
_m{m}, _n{n}, _ld{ld},
_sptr{sptr},
_ptr{ptr}
{
assert( ld >= m );
assert( ld > 0 );
......@@ -108,9 +113,10 @@ public:
}
Block(int m, int n, int ld, SPtr sptr):
_m(m), _n(n), _ld(ld),
_sptr(sptr),
_ptr(_sptr.get())
_last_flops{0},
_m{m}, _n{n}, _ld{ld},
_sptr{sptr},
_ptr{_sptr.get()}
{
assert( ld >= m );
assert( ld > 0 );
......@@ -151,7 +157,7 @@ public:
{
const S *ptr = get_vect(k);
S res = S{0.0};
A.DotProduct(1, 1, ptr, _ld, ptr, _ld, &res, 1);
_last_flops = A.DotProduct(1, 1, ptr, _ld, ptr, _ld, &res, 1);
return std::sqrt(fabulous::real(res));
}
......@@ -162,6 +168,8 @@ public:
const S *ptr = get_vect(k);
S res = S{0.0};
lapacke::dot(_m, ptr, 1, ptr, 1, &res);
_last_flops = lapacke::dot_flops<S>(_m);
return std::sqrt(fabulous::real(res));
}
......@@ -424,6 +432,15 @@ public:
return nb_krylov_direction;
}
/**
* \brief return number of flops performed by last operation
*/
int64_t get_last_flops() const
{
return _last_flops;
}
}; // end class Block
template< class U >
......
......@@ -6,6 +6,7 @@ template<class> class BlockWP;
};
#include "fabulous/kernel/blas.hpp"
#include "fabulous/kernel/flops.hpp"
#include "fabulous/data/Block.hpp"
namespace fabulous {
......@@ -73,10 +74,11 @@ public:
*
* \param[out] C the C block of the projected matrix to be filled
*/
void compute_C(Block<S> &C)
int64_t compute_C(Block<S> &C)
{
int64_t nb_flops = 0;
if (!_cursor)
return;
return nb_flops;
lapacke::Tgemm( // C := P^{H} * \widetilde{W}
get_size_P(), get_size_W(), get_nb_row(),
......@@ -85,6 +87,8 @@ public:
C.get_ptr(), C.get_leading_dim(),
S{1.0}, S{0.0}
);
nb_flops += lapacke::Tgemm_flops<S>(get_size_P(), get_size_W(), get_nb_row());
lapacke::gemm( // ~W = ~W - P_j*C
get_nb_row(), get_size_W(), get_size_P(),
get_P_ptr(), get_leading_dim(),
......@@ -92,6 +96,8 @@ public:
get_W_ptr(), get_leading_dim(),
S{-1.0}, S{1.0}
);
nb_flops += lapacke::gemm_flops<S>(get_nb_row(), get_size_W(), get_size_P());
return nb_flops;
}
/**
......
......@@ -6,6 +6,7 @@
#include "fabulous/utils/Utils.hpp"
#include "fabulous/data/Block.hpp"
#include "fabulous/kernel/qrf.hpp"
#include "fabulous/kernel/flops.hpp"
namespace fabulous {
......@@ -26,7 +27,7 @@ namespace qr {
* \note R MUST BE already allocated ( Q.get_nb_col() x Q.get_nb_col() )
*/
template<class S, class Matrix>
void InPlaceQRFactoMGS_User(Block<S> &Q, Block<S> &R, const Matrix &A)
int64_t InPlaceQRFactoMGS_User(Block<S> &Q, Block<S> &R, const Matrix &A)
{
int M = Q.get_nb_row();
int N = Q.get_nb_col();
......@@ -34,17 +35,21 @@ void InPlaceQRFactoMGS_User(Block<S> &Q, Block<S> &R, const Matrix &A)
assert( R.get_nb_col() == Q.get_nb_col() );
assert( R.get_nb_col() == R.get_nb_row() );
int64_t nb_flops = 0;
for (int j = 0; j < N; ++j) {
for (int i = 0; i < j; ++i) {
S dot; // dot = DOT(Q_i, Q_j)
A.DotProduct(1, 1, Q.get_vect(i), LD, Q.get_vect(j), LD, &dot, 1);
nb_flops += A.DotProduct(1, 1, Q.get_vect(i), LD,
Q.get_vect(j), LD, &dot, 1);
// Qj = Qj - dot(Qj,Qi)*Qi
lapacke::axpy(M, -dot, Q.get_vect(i), 1, Q.get_vect(j), 1);
nb_flops += lapacke::axpy_flops<S>(M);
R.at(i, j) = dot;
}
auto norm = Q.get_norm(j, A);
nb_flops += Q.get_last_flops();
R.at(j, j) = norm;
if (norm == 0.0) {
#ifndef FABULOUS_IB_EXACT_BREAKDOWN_CONTINUE
......@@ -55,8 +60,11 @@ void InPlaceQRFactoMGS_User(Block<S> &Q, Block<S> &R, const Matrix &A)
norm = 1.0;
#endif
}
lapacke::scal(M, S{1.0} / norm, Q.get_vect(j), 1); // Qj = (1/n)Qj
lapacke::scal(M, S{1.0} / norm, Q.get_vect(j), 1);
nb_flops += lapacke::scal_flops<S>(M);
// Qj = (1/n)Qj
}
return nb_flops;
}
/**
......@@ -74,7 +82,7 @@ void InPlaceQRFactoMGS_User(Block<S> &Q, Block<S> &R, const Matrix &A)
* or the user passed a block of size (0x0) and R will no be computed
*/
template<class S>
void InPlaceQRFacto(Block<S> &Q, Block<S> &R)
int64_t InPlaceQRFacto(Block<S> &Q, Block<S> &R)
{
int M = Q.get_nb_row();
int N = Q.get_nb_col();
......@@ -87,21 +95,24 @@ void InPlaceQRFacto(Block<S> &Q, Block<S> &R)
std::vector<S> tau;
lapacke::geqrf(M, N, Q.get_ptr(), LD, tau);
int64_t nb_flops = lapacke::geqrf_flops<S>(M, N);
if (R.get_nb_col() != 0) {
lapacke::lacpy( 'U', M, N, Q.get_ptr(), LD,
R.get_ptr(), R.get_leading_dim());
}
lapacke::orgqr(M, N, N, Q.get_ptr(), LD, tau.data());
nb_flops += lapacke::orgqr_flops<S>(M, N, N);
return nb_flops;
}
template<class S>
void OutOfPlaceQRFacto(Block<S> &A, Block<S> &Q, Block<S> &R)
int64_t OutOfPlaceQRFacto(Block<S> &A, Block<S> &Q, Block<S> &R)
{
assert( &Q != &A );
assert( Q.get_ptr() != A.get_ptr() );
Q.copy(A);
InPlaceQRFacto(Q, R);
return InPlaceQRFacto(Q, R);
}
}; // end namespace QR
......
......@@ -20,10 +20,7 @@ void Tgemm(int m, int n, int k,
S *C, int ldc,
S alpha = S{1.0}, S beta = S{0.0})
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
}
template<class S>
......@@ -33,37 +30,25 @@ void gemm(int m, int n, int k,
S *C, int ldc,
S alpha = S{1.0}, S beta = S{0.0})
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
}
template<class S>
void scal(int N, S coef, S *X, int incX)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
}
template<class S>
void axpy(int N, S coef, const S *X, int incX, S *Y, int incY)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
}
template<class S>
void dot(int N, const S *X, int incX, const S *Y, int incY, S *ret)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
}
/**
......@@ -81,20 +66,14 @@ void dot(int N, const S *X, int incX, const S *Y, int incY, S *ret)
template<class S>
int trsm(int m, int nrhs, const S *A, int lda, S *B, int ldb)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
template<class S>
int lacpy(char uplo, int m, int n, S *A, int lda, S *B, int ldb)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
......
#ifndef FABULOUS_FLOPS_HPP
#define FABULOUS_FLOPS_HPP
#include <cstdint>
#include "fabulous/utils/Arithmetic.hpp"
namespace fabulous {
namespace lapacke {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
template<class S>
int64_t Tgemm_flops(int m, int n, int k)
{
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
template<class S>
int64_t gemm_flops(int m, int n, int k)
{
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
template<class S>
int64_t axpy_flops(int m)
{
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
template<class S>
int64_t scal_flops(int m)
{
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
template<class S>
int64_t dot_flops(int m)
{
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
template<class S>
int64_t geqrf_flops(int m, int n)
{
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
template<class S>
int64_t orgqr_flops(int m, int n, int k)
{
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
#pragma GCC diagnostic pop
};
};
#endif // FABULOUS_FLOPS_HPP
......@@ -27,10 +27,7 @@ namespace lapacke {
template<class S>
int gels(int m, int n, int nrhs, S *A, int lda, S *B, int ldb)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
......
......@@ -37,10 +37,7 @@ int ggev(int n, S *A, int lda, S *B, int ldb,
S *vl, int ldvl ,
S *vr, int ldvr)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
......
......@@ -10,7 +10,6 @@ namespace lapacke {
/******************* BASIC QR FACTORIZATION ************************/
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
......@@ -25,10 +24,7 @@ namespace lapacke {
template<class S>
int geqrf(int m, int n, S *A, int lda, std::vector<S> &tau)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return 1;
}
......@@ -47,10 +43,7 @@ int geqrf(int m, int n, S *A, int lda, std::vector<S> &tau)
template<class S>
int orgqr(int m, int n, int p, S *A, int lda, S *tau)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
......@@ -78,10 +71,7 @@ template<class S>
int ormqr(char trans, int m, int n, int k,
const S *A, int lda, const S *tau, S *C, int ldc)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
......
......@@ -31,10 +31,7 @@ namespace lapacke {
template< class S >
int tpqrt(int m, int n, int l, int nb, S *A, int lda, S *B, int ldb, Block<S> &T)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
......@@ -66,10 +63,7 @@ int tpmqrt(char trans, int m, int n, int k, int l, int nb,
const Block<S> &T, S *A, int lda, S *B, int ldb)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
......
......@@ -16,7 +16,7 @@ namespace fabulous {
/**
* \brief Orthogonalization methods for BLOCK variants WITHOUT Inexact Breakdown
*/
template< class HESS >
template<class HESS>
class OrthogonalizerBlockSTD : public OrthogonalizerBlockCOMMON
{
private:
......@@ -35,16 +35,17 @@ private:
* \param[in,out] W block to be orthogonalized against current base
* \param ortho_scheme the orthogonalization scheme,
*/
template< class S >
void run(HESS &hess, Base<S> &base, Block<S> &W)
template<class S>
int64_t run(HESS &hess, Base<S> &base, Block<S> &W)
{
hess.increase(W.get_nb_col());
Block<S> H = hess.get_H();
Block<S> R = hess.get_R();
dispatch(base, H, W);
qr::InPlaceQRFacto(W, R);
int64_t nb_flops = dispatch(base, H, W);
nb_flops += qr::InPlaceQRFacto(W, R);
return nb_flops;
}
};
......
......@@ -16,7 +16,7 @@ namespace fabulous {
/**
* \brief Orthogonalization methods for BLOCK variants WITH Inexact Breakdown
*/
template< class HESS >
template<class HESS>
class OrthogonalizerBlockIB : public OrthogonalizerBlockCOMMON
{
private:
......@@ -36,7 +36,7 @@ private:
* \param ortho_scheme the orthogonalization scheme,
*/
template<class S>
void run(HESS &L, Base<S> &base, BlockWP<S> &WP)
int64_t run(HESS &L, Base<S> &base, BlockWP<S> &WP)
{
L.increase(WP.get_size_W());
......@@ -45,9 +45,10 @@ private:
Block<S> C = L.get_C(); // hessenberg's parts
Block<S> W = WP.get_W(); // candidate part
dispatch(base, H, W);
WP.compute_C(C); // Orthogonalisation of (candidate) W against P
qr::InPlaceQRFacto(W, D);
int64_t nb_flops = dispatch(base, H, W);
nb_flops += WP.compute_C(C); // Orthogonalisation of (candidate) W against P
nb_flops += qr::InPlaceQRFacto(W, D);
return nb_flops;
}
}; // end class OrthogonalizerBlockIB
......
......@@ -10,6 +10,8 @@ class OrthogonalizerBlockCOMMON;
#include "fabulous/data/Base.hpp"
#include "fabulous/data/Block.hpp"
#include "fabulous/orthogonalization/OrthoParam.hpp"
#include "fabulous/kernel/blas.hpp"
#include "fabulous/kernel/flops.hpp"
namespace fabulous {
......@@ -18,9 +20,12 @@ namespace fabulous {
*/
class OrthogonalizerBlockCOMMON : public OrthoParam
{
private:
int64_t _nb_flops;
protected:
OrthogonalizerBlockCOMMON(const OrthoParam &param):
OrthoParam{param}
OrthoParam{param},
_nb_flops{0}
{
}
......@@ -49,6 +54,7 @@ protected:
H.get_ptr(), H.get_leading_dim(),
S{1.0}, S{0.0}
);
_nb_flops += lapacke::Tgemm_flops<S>(nb_vect, W_size, dim);
lapacke::gemm( // W = W - Vm*H
dim, W_size, nb_vect,
......@@ -57,6 +63,7 @@ protected:
W.get_ptr(), W.get_leading_dim(),
S{-1.0}, S{1.0}
);
_nb_flops += lapacke::gemm_flops<S>(dim, W_size, nb_vect);
}
/**
......@@ -86,6 +93,7 @@ protected:
tmp.get_ptr(), tmp.get_leading_dim(),
S{1.0}, S{0.0}
);
_nb_flops += lapacke::Tgemm_flops<S>(nb_vect, W_size, dim);
lapacke::gemm( // W = W - Vm*H
dim, W_size, nb_vect,
......@@ -94,10 +102,13 @@ protected:
W.get_ptr(), W.get_leading_dim(),
S{-1.0}, S{1.0}
);
_nb_flops += lapacke::gemm_flops<S>(dim, W_size, nb_vect);
// Add tmp to hess
for (int j = 0; j < tmp.get_nb_col(); ++j) // H = H + tmp;
for (int j = 0; j < tmp.get_nb_col(); ++j) { // H = H + tmp;
lapacke::axpy(nb_vect, S{1.0}, tmp.get_vect(j), 1, H.get_vect(j), 1);
_nb_flops += lapacke::axpy_flops<S>(nb_vect);
}
}
/**
......@@ -131,6 +142,7 @@ protected:
H_k.get_ptr(), H_k.get_leading_dim(),
S{1.0}, S{0.0}
);
_nb_flops += lapacke::Tgemm_flops<S>(size_block, W_size, dim);
lapacke::gemm( // W = W - V_i * H_{ij}
dim, W_size, size_block,
......@@ -139,6 +151,7 @@ protected:
W.get_ptr(), W.get_leading_dim(),
S{-1.0}, S{1.0}
);
_nb_flops += lapacke::gemm_flops<S>(dim, W_size, size_block);
}
}
......@@ -177,6 +190,7 @@ protected:
tmp.get_ptr(), tmp.get_leading_dim(),
S{1.0}, S{0.0}
);
_nb_flops += lapacke::Tgemm_flops<S>(size_block, W_size, dim);
lapacke::gemm( // W = W - V_i * tmp
dim, W_size, size_block,
......@@ -185,14 +199,17 @@ protected:
W.get_ptr(), W.get_leading_dim(),
S{-1.0}, S{1.0}
);
_nb_flops += lapacke::gemm_flops<S>(dim, W_size, size_block);
Block<S> H_k = H.sub_block(size_block_sum, 0, size_block, W_size);
size_block_sum += size_block;
// Add tmp to hess
for (int j = 0; j < tmp.get_nb_col(); ++j) // H_k = H_k + tmp
for (int j = 0; j < tmp.get_nb_col(); ++j) { // H_k = H_k + tmp
lapacke::axpy(
size_block, S{1.0}, tmp.get_vect(j), 1, H_k.get_vect(j), 1);
_nb_flops += lapacke::axpy_flops<S>(size_block);
}
}
}
......@@ -204,8 +221,10 @@ protected:
* \param[out] H block to write the orthogonalization coeffs
*/
template<class S>
void dispatch(Base<S> &base, Block<S> &H, Block<S> &W)
int64_t dispatch(Base<S> &base, Block<S> &H, Block<S> &W)
{
_nb_flops = 0;
switch (_scheme) {
case OrthoScheme::CGS: CGS(base, H, W); break;
case OrthoScheme::MGS: MGS(base, H, W); break;
......@@ -219,6 +238,7 @@ protected:
break;
default: FABULOUS_FATAL_ERROR("Invalid orthogonalization scheme."); break;
}
return _nb_flops;
}
}; // end class OrthogonalizerBlockCOMMON
......
......@@ -9,7 +9,8 @@ template<class HESS> class OrthogonalizerRuheSTD;
#include "fabulous/data/Base.hpp"
#include "fabulous/data/Block.hpp"
#include "fabulous/orthogonalization/OrthoParam.hpp"
#include "fabulous/orthogonalization/OrthoParam.hpp"
#include "fabulous/kernel/blas.hpp"
#include "fabulous/kernel/flops.hpp"
namespace fabulous {
......@@ -22,10 +23,12 @@ template<class HESS>
class OrthogonalizerRuheSTD : public OrthoParam
{
private:
int64_t _nb_flops;
friend class Orthogonalizer;
OrthogonalizerRuheSTD(const OrthoParam &param):
OrthoParam{param}
OrthoParam{param},
_nb_flops{0}
{
}
......@@ -40,7 +43,6 @@ private:
int W_size = W.get_nb_col();
int dim = base.get_nb_row();
int nb_vect_in_base = base.get_nb_vect();
int block_size = base.get_block_size(base.get_nb_block()-1);
FABULOUS_ASSERT(W_size == block_size);
......@@ -48,8 +50,7 @@ private:
int ldr = R.get_leading_dim();
int ldw = W.get_leading_dim();
int ldv = base.get_leading_dim();
auto *V = base.get_ptr();
S *V = base.get_ptr();
// Loop over vector in W block
for (int k = 0; k < W_size; ++k) {
......@@ -58,7 +59,8 @@ private:
S *R_k = R.get_vect(k);
{ // Ortho against Base
A.DotProduct(nb_vect_in_base, 1, V, ldv, W_k, ldw, H_k, ldh);
_nb_flops += A.DotProduct(nb_vect_in_base, 1, V, ldv,
W_k, ldw, H_k, ldh);
lapacke::gemm( // W_k = W_k - V * H_k
dim, 1, nb_vect_in_base,
V, ldv,
......@@ -66,10 +68,12 @@ private:
W_k, ldw,
S{-1.0}, S{1.0}
);
_nb_flops += lapacke::gemm_flops<S>(dim, 1, nb_vect_in_base);
}
{ // Ortho against already processed W vectors
A.DotProduct(k, 1, W.get_ptr(), ldw, W_k, ldw, R_k, ldr);
_nb_flops += A.DotProduct(k, 1, W.get_ptr(), ldw,
W_k, ldw, R_k, ldr);
lapacke::gemm( // W_k = W_k - W_{0->k-1} * R_k
dim, 1, k,
W.get_ptr(), ldw,
......@@ -77,21 +81,24 @@ private:
W_k, ldw,
S{-1.0}, S{1.0}
);
_nb_flops += lapacke::gemm_flops<S>(dim, 1, k);
}
auto n = W.get_norm(k, A);
_nb_flops += W.get_last_flops();
if (n == 0.0) {
FABULOUS_FATAL_ERROR(
"Rank loss in block candidate for extending the Krylov Space" );
}
R_k[k] = n; // Hess[k+1,k] = norm(wj_k)
lapacke::scal(dim, S{1.0} / n, W_k, 1); // W[k] /= norm(W[k])
_nb_flops += lapacke::scal_flops<S>(dim);
}
}
/**
* \brief Arnoldi Ruhe version with CGS ortho
*/
template< class Matrix, class S >
template<class Matrix, class S>
void ICGS( Matrix &A, Base<S> &base,
Block<S> &H, Block<S> &R, // hessenberg
Block<S> &W) // candidate
......@@ -104,7 +111,7 @@ private:
int ldw = W.get_leading_dim();
int ldv = base.get_leading_dim();
auto *V = base.get_ptr();
S *V = base.get_ptr();