Commit 0abf67bc authored by MIJIEUX Thomas's avatar MIJIEUX Thomas

Instrument orthogonalization to measure flops

parent ebe4b169
......@@ -33,6 +33,7 @@ template<class> class Block;
#include "fabulous/kernel/blas.hpp"
#include "fabulous/kernel/svd.hpp"
#include "fabulous/kernel/flops.hpp"
namespace fabulous {
......@@ -42,6 +43,8 @@ namespace fabulous {
template<class S>
class Block
{
private:
mutable int64_t _last_flops;
protected:
int _m;
int _n;
......@@ -80,7 +83,8 @@ public:
}
Block(int m, int n, int ld, S *ptr):
_m(m), _n(n), _ld(ld), _ptr(ptr)
_last_flops{0},
_m{m}, _n{n}, _ld{ld}, _ptr{ptr}
{
assert( ld >= m );
assert( ld > 0 );
......@@ -93,9 +97,10 @@ public:
}
Block(int m, int n, int ld, S* ptr, SPtr sptr):
_m(m), _n(n), _ld(ld),
_sptr(sptr),
_ptr(ptr)
_last_flops{0},
_m{m}, _n{n}, _ld{ld},
_sptr{sptr},
_ptr{ptr}
{
assert( ld >= m );
assert( ld > 0 );
......@@ -108,9 +113,10 @@ public:
}
Block(int m, int n, int ld, SPtr sptr):
_m(m), _n(n), _ld(ld),
_sptr(sptr),
_ptr(_sptr.get())
_last_flops{0},
_m{m}, _n{n}, _ld{ld},
_sptr{sptr},
_ptr{_sptr.get()}
{
assert( ld >= m );
assert( ld > 0 );
......@@ -151,7 +157,7 @@ public:
{
const S *ptr = get_vect(k);
S res = S{0.0};
A.DotProduct(1, 1, ptr, _ld, ptr, _ld, &res, 1);
_last_flops = A.DotProduct(1, 1, ptr, _ld, ptr, _ld, &res, 1);
return std::sqrt(fabulous::real(res));
}
......@@ -162,6 +168,8 @@ public:
const S *ptr = get_vect(k);
S res = S{0.0};
lapacke::dot(_m, ptr, 1, ptr, 1, &res);
_last_flops = lapacke::dot_flops<S>(_m);
return std::sqrt(fabulous::real(res));
}
......@@ -424,6 +432,15 @@ public:
return nb_krylov_direction;
}
/**
* \brief return number of flops performed by last operation
*/
int64_t get_last_flops() const
{
return _last_flops;
}
}; // end class Block
template< class U >
......
......@@ -6,6 +6,7 @@ template<class> class BlockWP;
};
#include "fabulous/kernel/blas.hpp"
#include "fabulous/kernel/flops.hpp"
#include "fabulous/data/Block.hpp"
namespace fabulous {
......@@ -73,10 +74,11 @@ public:
*
* \param[out] C the C block of the projected matrix to be filled
*/
void compute_C(Block<S> &C)
int64_t compute_C(Block<S> &C)
{
int64_t nb_flops = 0;
if (!_cursor)
return;
return nb_flops;
lapacke::Tgemm( // C := P^{H} * \widetilde{W}
get_size_P(), get_size_W(), get_nb_row(),
......@@ -85,6 +87,8 @@ public:
C.get_ptr(), C.get_leading_dim(),
S{1.0}, S{0.0}
);
nb_flops += lapacke::Tgemm_flops<S>(get_size_P(), get_size_W(), get_nb_row());
lapacke::gemm( // ~W = ~W - P_j*C
get_nb_row(), get_size_W(), get_size_P(),
get_P_ptr(), get_leading_dim(),
......@@ -92,6 +96,8 @@ public:
get_W_ptr(), get_leading_dim(),
S{-1.0}, S{1.0}
);
nb_flops += lapacke::gemm_flops<S>(get_nb_row(), get_size_W(), get_size_P());
return nb_flops;
}
/**
......
......@@ -6,6 +6,7 @@
#include "fabulous/utils/Utils.hpp"
#include "fabulous/data/Block.hpp"
#include "fabulous/kernel/qrf.hpp"
#include "fabulous/kernel/flops.hpp"
namespace fabulous {
......@@ -26,7 +27,7 @@ namespace qr {
* \note R MUST BE already allocated ( Q.get_nb_col() x Q.get_nb_col() )
*/
template<class S, class Matrix>
void InPlaceQRFactoMGS_User(Block<S> &Q, Block<S> &R, const Matrix &A)
int64_t InPlaceQRFactoMGS_User(Block<S> &Q, Block<S> &R, const Matrix &A)
{
int M = Q.get_nb_row();
int N = Q.get_nb_col();
......@@ -34,17 +35,21 @@ void InPlaceQRFactoMGS_User(Block<S> &Q, Block<S> &R, const Matrix &A)
assert( R.get_nb_col() == Q.get_nb_col() );
assert( R.get_nb_col() == R.get_nb_row() );
int64_t nb_flops = 0;
for (int j = 0; j < N; ++j) {
for (int i = 0; i < j; ++i) {
S dot; // dot = DOT(Q_i, Q_j)
A.DotProduct(1, 1, Q.get_vect(i), LD, Q.get_vect(j), LD, &dot, 1);
nb_flops += A.DotProduct(1, 1, Q.get_vect(i), LD,
Q.get_vect(j), LD, &dot, 1);
// Qj = Qj - dot(Qj,Qi)*Qi
lapacke::axpy(M, -dot, Q.get_vect(i), 1, Q.get_vect(j), 1);
nb_flops += lapacke::axpy_flops<S>(M);
R.at(i, j) = dot;
}
auto norm = Q.get_norm(j, A);
nb_flops += Q.get_last_flops();
R.at(j, j) = norm;
if (norm == 0.0) {
#ifndef FABULOUS_IB_EXACT_BREAKDOWN_CONTINUE
......@@ -55,8 +60,11 @@ void InPlaceQRFactoMGS_User(Block<S> &Q, Block<S> &R, const Matrix &A)
norm = 1.0;
#endif
}
lapacke::scal(M, S{1.0} / norm, Q.get_vect(j), 1); // Qj = (1/n)Qj
lapacke::scal(M, S{1.0} / norm, Q.get_vect(j), 1);
nb_flops += lapacke::scal_flops<S>(M);
// Qj = (1/n)Qj
}
return nb_flops;
}
/**
......@@ -74,7 +82,7 @@ void InPlaceQRFactoMGS_User(Block<S> &Q, Block<S> &R, const Matrix &A)
* or the user passed a block of size (0x0) and R will no be computed
*/
template<class S>
void InPlaceQRFacto(Block<S> &Q, Block<S> &R)
int64_t InPlaceQRFacto(Block<S> &Q, Block<S> &R)
{
int M = Q.get_nb_row();
int N = Q.get_nb_col();
......@@ -87,21 +95,24 @@ void InPlaceQRFacto(Block<S> &Q, Block<S> &R)
std::vector<S> tau;
lapacke::geqrf(M, N, Q.get_ptr(), LD, tau);
int64_t nb_flops = lapacke::geqrf_flops<S>(M, N);
if (R.get_nb_col() != 0) {
lapacke::lacpy( 'U', M, N, Q.get_ptr(), LD,
R.get_ptr(), R.get_leading_dim());
}
lapacke::orgqr(M, N, N, Q.get_ptr(), LD, tau.data());
nb_flops += lapacke::orgqr_flops<S>(M, N, N);
return nb_flops;
}
template<class S>
void OutOfPlaceQRFacto(Block<S> &A, Block<S> &Q, Block<S> &R)
int64_t OutOfPlaceQRFacto(Block<S> &A, Block<S> &Q, Block<S> &R)
{
assert( &Q != &A );
assert( Q.get_ptr() != A.get_ptr() );
Q.copy(A);
InPlaceQRFacto(Q, R);
return InPlaceQRFacto(Q, R);
}
}; // end namespace QR
......
......@@ -20,10 +20,7 @@ void Tgemm(int m, int n, int k,
S *C, int ldc,
S alpha = S{1.0}, S beta = S{0.0})
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
}
template<class S>
......@@ -33,37 +30,25 @@ void gemm(int m, int n, int k,
S *C, int ldc,
S alpha = S{1.0}, S beta = S{0.0})
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
}
template<class S>
void scal(int N, S coef, S *X, int incX)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
}
template<class S>
void axpy(int N, S coef, const S *X, int incX, S *Y, int incY)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
}
template<class S>
void dot(int N, const S *X, int incX, const S *Y, int incY, S *ret)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
}
/**
......@@ -81,20 +66,14 @@ void dot(int N, const S *X, int incX, const S *Y, int incY, S *ret)
template<class S>
int trsm(int m, int nrhs, const S *A, int lda, S *B, int ldb)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
template<class S>
int lacpy(char uplo, int m, int n, S *A, int lda, S *B, int ldb)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
......
#ifndef FABULOUS_FLOPS_HPP
#define FABULOUS_FLOPS_HPP
#include <cstdint>
#include "fabulous/utils/Arithmetic.hpp"
namespace fabulous {
namespace lapacke {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
template<class S>
int64_t Tgemm_flops(int m, int n, int k)
{
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
template<class S>
int64_t gemm_flops(int m, int n, int k)
{
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
template<class S>
int64_t axpy_flops(int m)
{
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
template<class S>
int64_t scal_flops(int m)
{
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
template<class S>
int64_t dot_flops(int m)
{
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
template<class S>
int64_t geqrf_flops(int m, int n)
{
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
template<class S>
int64_t orgqr_flops(int m, int n, int k)
{
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
#pragma GCC diagnostic pop
};
};
#endif // FABULOUS_FLOPS_HPP
......@@ -27,10 +27,7 @@ namespace lapacke {
template<class S>
int gels(int m, int n, int nrhs, S *A, int lda, S *B, int ldb)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
......
......@@ -37,10 +37,7 @@ int ggev(int n, S *A, int lda, S *B, int ldb,
S *vl, int ldvl ,
S *vr, int ldvr)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
......
......@@ -10,7 +10,6 @@ namespace lapacke {
/******************* BASIC QR FACTORIZATION ************************/
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
......@@ -25,10 +24,7 @@ namespace lapacke {
template<class S>
int geqrf(int m, int n, S *A, int lda, std::vector<S> &tau)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return 1;
}
......@@ -47,10 +43,7 @@ int geqrf(int m, int n, S *A, int lda, std::vector<S> &tau)
template<class S>
int orgqr(int m, int n, int p, S *A, int lda, S *tau)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
......@@ -78,10 +71,7 @@ template<class S>
int ormqr(char trans, int m, int n, int k,
const S *A, int lda, const S *tau, S *C, int ldc)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
......
......@@ -31,10 +31,7 @@ namespace lapacke {
template< class S >
int tpqrt(int m, int n, int l, int nb, S *A, int lda, S *B, int ldb, Block<S> &T)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
......@@ -66,10 +63,7 @@ int tpmqrt(char trans, int m, int n, int k, int l, int nb,
const Block<S> &T, S *A, int lda, S *B, int ldb)
{
static_assert(
sizeof(S) != sizeof(S),
"support for float, double (real and complex) only"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
return -1;
}
......
......@@ -16,7 +16,7 @@ namespace fabulous {
/**
* \brief Orthogonalization methods for BLOCK variants WITHOUT Inexact Breakdown
*/
template< class HESS >
template<class HESS>
class OrthogonalizerBlockSTD : public OrthogonalizerBlockCOMMON
{
private:
......@@ -35,16 +35,17 @@ private:
* \param[in,out] W block to be orthogonalized against current base
* \param ortho_scheme the orthogonalization scheme,
*/
template< class S >
void run(HESS &hess, Base<S> &base, Block<S> &W)
template<class S>
int64_t run(HESS &hess, Base<S> &base, Block<S> &W)
{
hess.increase(W.get_nb_col());
Block<S> H = hess.get_H();
Block<S> R = hess.get_R();
dispatch(base, H, W);
qr::InPlaceQRFacto(W, R);
int64_t nb_flops = dispatch(base, H, W);
nb_flops += qr::InPlaceQRFacto(W, R);
return nb_flops;
}
};
......
......@@ -16,7 +16,7 @@ namespace fabulous {
/**
* \brief Orthogonalization methods for BLOCK variants WITH Inexact Breakdown
*/
template< class HESS >
template<class HESS>
class OrthogonalizerBlockIB : public OrthogonalizerBlockCOMMON
{
private:
......@@ -36,7 +36,7 @@ private:
* \param ortho_scheme the orthogonalization scheme,
*/
template<class S>
void run(HESS &L, Base<S> &base, BlockWP<S> &WP)
int64_t run(HESS &L, Base<S> &base, BlockWP<S> &WP)
{
L.increase(WP.get_size_W());
......@@ -45,9 +45,10 @@ private:
Block<S> C = L.get_C(); // hessenberg's parts
Block<S> W = WP.get_W(); // candidate part
dispatch(base, H, W);
WP.compute_C(C); // Orthogonalisation of (candidate) W against P
qr::InPlaceQRFacto(W, D);
int64_t nb_flops = dispatch(base, H, W);
nb_flops += WP.compute_C(C); // Orthogonalisation of (candidate) W against P
nb_flops += qr::InPlaceQRFacto(W, D);
return nb_flops;
}
}; // end class OrthogonalizerBlockIB
......
......@@ -10,6 +10,8 @@ class OrthogonalizerBlockCOMMON;
#include "fabulous/data/Base.hpp"
#include "fabulous/data/Block.hpp"
#include "fabulous/orthogonalization/OrthoParam.hpp"
#include "fabulous/kernel/blas.hpp"
#include "fabulous/kernel/flops.hpp"
namespace fabulous {
......@@ -18,9 +20,12 @@ namespace fabulous {
*/
class OrthogonalizerBlockCOMMON : public OrthoParam
{
private:
int64_t _nb_flops;
protected:
OrthogonalizerBlockCOMMON(const OrthoParam &param):
OrthoParam{param}
OrthoParam{param},
_nb_flops{0}
{
}
......@@ -49,6 +54,7 @@ protected:
H.get_ptr(), H.get_leading_dim(),
S{1.0}, S{0.0}
);
_nb_flops += lapacke::Tgemm_flops<S>(nb_vect, W_size, dim);
lapacke::gemm( // W = W - Vm*H
dim, W_size, nb_vect,
......@@ -57,6 +63,7 @@ protected:
W.get_ptr(), W.get_leading_dim(),
S{-1.0}, S{1.0}
);
_nb_flops += lapacke::gemm_flops<S>(dim, W_size, nb_vect);
}
/**
......@@ -86,6 +93,7 @@ protected:
tmp.get_ptr(), tmp.get_leading_dim(),
S{1.0}, S{0.0}
);
_nb_flops += lapacke::Tgemm_flops<S>(nb_vect, W_size, dim);
lapacke::gemm( // W = W - Vm*H
dim, W_size, nb_vect,
......@@ -94,10 +102,13 @@ protected:
W.get_ptr(), W.get_leading_dim(),
S{-1.0}, S{1.0}
);
_nb_flops += lapacke::gemm_flops<S>(dim, W_size, nb_vect);
// Add tmp to hess
for (int j = 0; j < tmp.get_nb_col(); ++j) // H = H + tmp;
for (int j = 0; j < tmp.get_nb_col(); ++j) { // H = H + tmp;
lapacke::axpy(nb_vect, S{1.0}, tmp.get_vect(j), 1, H.get_vect(j), 1);
_nb_flops += lapacke::axpy_flops<S>(nb_vect);
}
}
/**
......@@ -131,6 +142,7 @@ protected:
H_k.get_ptr(), H_k.get_leading_dim(),
S{1.0}, S{0.0}
);
_nb_flops += lapacke::Tgemm_flops<S>(size_block, W_size, dim);
lapacke::gemm( // W = W - V_i * H_{ij}
dim, W_size, size_block,
......@@ -139,6 +151,7 @@ protected:
W.get_ptr(), W.get_leading_dim(),
S{-1.0}, S{1.0}
);
_nb_flops += lapacke::gemm_flops<S>(dim, W_size, size_block);
}
}
......@@ -177,6 +190,7 @@ protected:
tmp.get_ptr(), tmp.get_leading_dim(),
S{1.0}, S{0.0}
);
_nb_flops += lapacke::Tgemm_flops<S>(size_block, W_size, dim);
lapacke::gemm( // W = W - V_i * tmp
dim, W_size, size_block,
......@@ -185,14 +199,17 @@ protected:
W.get_ptr(), W.get_leading_dim(),
S{-1.0}, S{1.0}
);
_nb_flops += lapacke::gemm_flops<S>(dim, W_size, size_block);
Block<S> H_k = H.sub_block(size_block_sum, 0, size_block, W_size);
size_block_sum += size_block;
// Add tmp to hess
for (int j = 0; j < tmp.get_nb_col(); ++j) // H_k = H_k + tmp
for (int j = 0; j < tmp.get_nb_col(); ++j) { // H_k = H_k + tmp
lapacke::axpy(
size_block, S{1.0}, tmp.get_vect(j), 1, H_k.get_vect(j), 1);
_nb_flops += lapacke::axpy_flops<S>(size_block);
}
}
}
......@@ -204,8 +221,10 @@ protected:
* \param[out] H block to write the orthogonalization coeffs
*/
template<class S>
void dispatch(Base<S> &base, Block<S> &H, Block<S> &W)
int64_t dispatch(Base<S> &base, Block<S> &H, Block<S> &W)
{
_nb_flops = 0;
switch (_scheme) {
case OrthoScheme::CGS: CGS(base, H, W); break;
case OrthoScheme::MGS: MGS(base, H, W); break;
......@@ -219,6 +238,7 @@ protected:
break;
default: FABULOUS_FATAL_ERROR("Invalid orthogonalization scheme."); break;
}
return _nb_flops;
}
}; // end class OrthogonalizerBlockCOMMON
......
This diff is collapsed.
......@@ -37,17 +37,20 @@ namespace fabulous {
# define FABULOUS_ARITHMETIC_CHAMELEON_TYPE(morse_type_, morse_trans_)
#endif // FABULOUS_USE_CHAMELEON
#define FABULOUS_DISABLE_INVALID_ARITHMETIC(Type_S_) \
static_assert( \
sizeof(Type_S_) != sizeof(Type_S_), \
"Only float, double, std::complex<float>, and " \
"std::complex<double> precisions will work!" \
); \
/**
* \brief Global constants that depends on the Arithmetic used
*/
template<class S>
struct Arithmetik
{
static_assert(
sizeof(S) != sizeof(S),// always false if instantiated
"Only float, double, std::complex<float>, and "
"std::complex<double> template instantiations are allowed!"
);
FABULOUS_DISABLE_INVALID_ARITHMETIC(S);
};
#define FABULOUS_DEFINE_ARITHMETIC(name_, type_, primary_type_, value_, \
......
......@@ -48,7 +48,7 @@ public:
S at(int, int) { FABULOUS_FATAL_ERROR("should not be reached"); return S{0.0}; }
void MatBlockVect(const Block<S> &input, Block<S> &output,
int64_t MatBlockVect(const Block<S> &input, Block<S> &output,
S alpha = S{1.0}, S beta = S{0.0}) const
{
if ( _user_mvp == nullptr ) {
......@@ -56,7 +56,7 @@ public:
FABULOUS_NOTE("Have you called fabulous_set_mvp() ?!");
FABULOUS_FATAL_ERROR("missing user matrix product; cannot recover");
}
_user_mvp(
return _user_mvp(
_user_env, input.get_nb_col(),
input.get_ptr(), input.get_leading_dim(),
output.get_ptr(), output.get_leading_dim(),
......@@ -82,17 +82,17 @@ public:
bool useRightPrecond() const { return _user_rpc != nullptr; }
template< class BLOCK >
void PrecondBlockVect(const BLOCK &input, BLOCK &output) const
int64_t PrecondBlockVect(const BLOCK &input, BLOCK &output) const
{
assert ( _user_rpc != nullptr );
_user_rpc(
return _user_rpc(
_user_env, input.get_nb_col(),
input.get_ptr(), input.get_leading_dim(),
output.get_ptr(), output.get_leading_dim()
);
}
void DotProduct(int M, int N,
int64_t DotProduct(int M, int N,
const S *A, int lda,
const S *B, int ldb,
S *C, int ldc) const
......@@ -102,20 +102,20 @@ public:
FABULOUS_NOTE("Have you called fabulous_set_dot_product() ?!");
FABULOUS_FATAL_ERROR("missing user dot product; cannot recover");
}
_user_dot( _user_env, M, N, A, lda, B, ldb, C, ldc );
return _user_dot( _user_env, M, N, A, lda, B, ldb, C, ldc );
}
void DotProduct(const Block<S> &A, const Block<S> &B, Block<S> &C) const
int64_t DotProduct(const Block<S> &A, const Block<S> &B, Block<S> &C) const
{
DotProduct( A.get_nb_col(), B.get_nb_col(),
return DotProduct( A.get_nb_col(), B.get_nb_col(),
A.get_ptr(), A.get_leading_dim(),
B.get_ptr(), B.get_leading_dim(),
C.get_tpr(), C.get_leading_dim() );
}
void QRFacto(Block<S> &Q, Block<S> &R) const
int64_t QRFacto(Block<S> &Q, Block<S> &R) const
{
qr::InPlaceQRFactoMGS_User(Q, R, *this);
return qr::InPlaceQRFactoMGS_User(Q, R, *this);
}
}; // end class MatrixApi
......