Commit 3f08f506 authored by MIJIEUX Thomas's avatar MIJIEUX Thomas

add beginning of flops counting instrumentation

parent b90b2958
......@@ -237,20 +237,21 @@ private:
void apply_right_precond(Block<S> &Zj, Matrix &A, Block<S> &W)
{
_logger.notify_mvp_begin();
int64_t flops = 0;
if (A.useRightPrecond()) {
A.PrecondBlockVect(W, Zj);
flops = A.PrecondBlockVect(W, Zj);
} else {
Zj.copy(W);
}
_logger.notify_mvp_end();
_logger.notify_mvp_end(flops);
}
template<class Matrix>
void matrix_vector_product(Block<S> &W, Matrix &A, Block<S> &Zj)
{
_logger.notify_mvp_begin();
A.MatBlockVect(Zj, W);
_logger.notify_mvp_end();
int64_t flops = A.MatBlockVect(Zj, W);
_logger.notify_mvp_end(flops);
_nb_mvp += W.get_nb_col();
}
......
......@@ -198,12 +198,13 @@ private:
void apply_right_precond(Block<S> &Zj, Matrix &A, Block<S> &R)
{
_logger.notify_mvp_begin();
int64_t flops = 0;
if (A.useRightPrecond()) {
A.PrecondBlockVect(R, Zj);
flops = A.PrecondBlockVect(R, Zj);
} else {
Zj.copy(R);
}
_logger.notify_mvp_end();
_logger.notify_mvp_end(flops);
}
template<class Matrix>
......@@ -213,8 +214,8 @@ private:
Block<S> Qi = Q.sub_block(0, 0, _dim, _nb_kept_direction);
_logger.notify_mvp_begin();
A.MatBlockVect(Pi, Qi); // Qi <- A * Pi
_logger.notify_mvp_end();
int64_t flops = A.MatBlockVect(Pi, Qi); // Qi <- A * Pi
_logger.notify_mvp_end(flops);
_nb_mvp += Pi.get_nb_col();
}
......
......@@ -99,28 +99,30 @@ private:
R.copy(B); // R <- B
_logger.notify_mvp_begin();
A.MatBlockVect(X, R); // R <- A*X
int64_t flops;
flops = A.MatBlockVect(X, R); // R <- A*X
for (int j = 0; j < R.get_nb_col(); ++j) {
for (int i = 0; i < R.get_nb_row(); ++i) {
R.at(i, j) = B.at(i, j) - R.at(i, j); // R -> B - A*X
}
}
_logger.notify_mvp_end();
_logger.notify_mvp_end(flops);
_nb_mvp += X.get_nb_col();
}
template< class Matrix, class Block >
void W_AMV(Matrix &A, Block &V, Block &W) // W = A*M*V
{
int64_t flops;
_logger.notify_mvp_begin();
if (A.useRightPrecond()) {
Block MV{W.get_nb_row(), V.get_nb_col()};
A.PrecondBlockVect(V, MV); // MV = M * V
A.MatBlockVect(MV, W); // W = A*MV
flops = A.PrecondBlockVect(V, MV); // MV = M * V
flops += A.MatBlockVect(MV, W); // W = A*MV
} else {
A.MatBlockVect(V, W); // W = A * V
flops = A.MatBlockVect(V, W); // W = A * V
}
_logger.notify_mvp_end();
_logger.notify_mvp_end(flops);
_nb_mvp += V.get_nb_col();
}
......
......@@ -110,13 +110,13 @@ private:
{
R.copy(B); // R <- B
_logger.notify_mvp_begin();
A.MatBlockVect(X, R); // R <- A*X
int64_t flops = A.MatBlockVect(X, R); // R <- A*X
for (int j = 0; j < R.get_nb_col(); ++j) {
for (int i = 0; i < R.get_nb_row(); ++i) {
R.at(i, j) = B.at(i, j) - R.at(i, j); // R -> B - A*X
}
}
_logger.notify_mvp_end();
_logger.notify_mvp_end(flops);
_nb_mvp += X.get_nb_col();
}
......@@ -124,16 +124,16 @@ private:
void W_AMV(Matrix &A, Block &V, BlockWP &WP) // W = A*M*V
{
Block W = WP.get_W();
int64_t flops;
_logger.notify_mvp_begin();
if (A.useRightPrecond()) {
Block MV{W.get_nb_row(), V.get_nb_col()};
A.PrecondBlockVect(V, MV); // MV = M * V
A.MatBlockVect(MV, W); // B = A*MV
flops = A.PrecondBlockVect(V, MV); // MV = M * V
flops += A.MatBlockVect(MV, W); // B = A*MV
} else {
A.MatBlockVect(V, W); // W = A * V
flops = A.MatBlockVect(V, W); // W = A * V
}
_logger.notify_mvp_end();
_logger.notify_mvp_end(flops);
_nb_mvp += V.get_nb_col();
}
......@@ -291,10 +291,12 @@ private:
return;
}
_ib_update_scheduled = false;
_logger.notify_ib_begin();
if (_nb_direction_discarded == 0) {
Block<S> W = _base.get_W(_nbRHS);
W.copy(_WP);
_base.increase(_nbRHS);
_logger.notify_ib_end(0);
return;
}
......@@ -326,6 +328,7 @@ private:
// L_{j+1,} = | \W_1^{H} | * H_j
// Gj = | \W_2^{H} | * H_j
_F.update_bottom_line(W1_W2, _nb_direction_kept);
_logger.notify_ib_end( - 1 /* FIXME */);
}
void do_R_criterion(int proj_convergence,
......@@ -335,6 +338,8 @@ private:
Block<S> Y = _F.alloc_least_square_sol();
Block<S> PR{_F.get_nb_hess_line(), _nbRHS};
_F.solve_least_square(Y);
_logger.notify_ib_begin();
_F.compute_proj_residual(PR, Y);
Block<S> U;
......@@ -358,6 +363,7 @@ private:
}
}
_ib_update_scheduled = true;
_logger.notify_ib_end( -1 /*FIXME */);
}
/*********************************************************/
......
......@@ -110,31 +110,33 @@ private:
void W_AMV(Matrix &A, Block &V, BlockWP &WP) // W = A*M*V
{
Block W = WP.get_W();
_logger.notify_mvp_begin();
int64_t flops;
if (A.useRightPrecond()) {
Block MV{W.get_nb_row(), V.get_nb_col()};
A.PrecondBlockVect(V, MV); // MV = M * V
A.MatBlockVect(MV, W); // B = A*MV
flops = A.PrecondBlockVect(V, MV); // MV = M * V
flops += A.MatBlockVect(MV, W); // B = A*MV
} else {
A.MatBlockVect(V, W); // W = A * V
flops = A.MatBlockVect(V, W); // W = A * V
}
_logger.notify_mvp_end();
_logger.notify_mvp_end(flops);
_nb_mvp += V.get_nb_col();
}
template<class Matrix, class Block>
void compute_real_residual(Matrix &A, Block &X, const Block &B, Block &R)
{
R.copy(B); // R <- B
_logger.notify_mvp_begin();
A.MatBlockVect(X, R); // R <- A*X
R.copy(B); // R <- B
int64_t flops = A.MatBlockVect(X, R); // R <- A*X
for (int j = 0; j < R.get_nb_col(); ++j) {
for (int i = 0; i < R.get_nb_row(); ++i) {
R.at(i, j) = B.at(i, j) - R.at(i, j); // R -> B - A*X
}
}
_logger.notify_mvp_end();
_logger.notify_mvp_end(flops);
_nb_mvp += X.get_nb_col();
}
......@@ -338,7 +340,7 @@ private:
// L_{j+1,} = | \W_1^{H} | * H_j
// Gj = | \W_2^{H} | * H_j
_F.update_bottom_line(W1_W2);
_logger.notify_ib_end();
_logger.notify_ib_end(0);
// hook that may perform incremental factorization
_F.after_bottom_line_update(_nb_direction_kept);
......@@ -379,7 +381,7 @@ private:
}
}
_ib_update_scheduled = true;
_logger.notify_ib_end();
_logger.notify_ib_end( - 1/*FIXME*/ );
}
/*********************************************************/
......
......@@ -333,7 +333,6 @@ public:
return std::make_tuple(mm.first, mm.second, mm.second < epsilon[0]);
}
/*! \brief compare norm of each vector against values in epsilon
*
* if epsilon.size() == 1; then epsilon[0] is used for all vectors;
......
......@@ -107,7 +107,8 @@ public:
#else
FABULOUS_THROW(
Unsupported,
"You must define the FABULOUS_DEBUG_MODE preprocessor constant to use this function"
"You must define the FABULOUS_DEBUG_MODE preprocessor"
" constant to use this function"
);
(void) Hm;
(void) m;
......@@ -169,9 +170,10 @@ private:
_logger.notify_facto_begin();
MORSE_enum trans = Arithmetik<S>::mtrans;
int64_t flops = 0;
/* STEP 1: Loop over each block in last column to
* apply already computed Householders
* apply already computed Householders vector
* This is almost the only loop that can benefit from
* Chameleon Asynchronicity
*/
......@@ -182,6 +184,7 @@ private:
MORSE_desc_t *T = _tau[i].get();
int err = chameleon::ormqr<S>(trans, A, T, C, _seq.get());
flops += lapacke::ormqr_flops<S>(_nbRHS, _nbRHS, _nbRHS);
if (err != 0) {
FABULOUS_THROW(Kernel, "ormqr 'step' err="<<err);
}
......@@ -192,6 +195,7 @@ private:
MORSE_desc_t *A = get_sub_hess(k, k, 2, 1);
int err = chameleon::geqrf<S>(A, tau, _seq.get());
flops += lapacke::geqrf_flops<S>(_nbRHS, _nbRHS);
if (err != 0) {
FABULOUS_THROW(Kernel, "geqrf 'last block' err="<<err);
}
......@@ -202,12 +206,12 @@ private:
/* STEP 3: Apply Q^H generated at step 2 to last block of RHS */
MORSE_desc_t *C = get_sub_rhs(k, 0, 2, 1);
err = chameleon::ormqr<S>(trans, A, tau.get(), C, _seq.get());
flops += lapacke::ormqr_flops<S>(_nbRHS, _nbRHS, _nbRHS);
if (err != 0) {
FABULOUS_THROW(Kernel, "ormqr 'RHS' err="<<err);
}
_last_column_factorized = true;
_logger.notify_facto_end();
_logger.notify_facto_end(flops);
}
public:
......@@ -298,7 +302,8 @@ public:
MORSE_Sequence_Wait(_seq.get());
MORSE_Tile_to_Lapack(YYd, _Y.get_ptr(), _Y.get_leading_dim());
_solution_computed = true;
_logger.notify_least_square_end();
int64_t flops = lapacke::trsm_flops<S>(_nb_vect, _nbRHS);
_logger.notify_least_square_end(flops);
}
void compute_proj_residual(const Block<S>&, const Block<S>&)
......
......@@ -26,19 +26,13 @@ template<class S> class HessChamQR_submat;
#include "fabulous/utils/Logger.hpp"
#include "fabulous/kernel/chameleon.hpp"
#include "fabulous/kernel/flops.hpp"
#include "fabulous/data/Block.hpp"
#include "fabulous/data/MorseDesc2.hpp"
namespace fabulous {
namespace bgmres {
inline std::string make_quad_string_key(int i, int j, int m, int n)
{
std::stringstream ss;
ss<<i<<"|"<<j<<"|"<<m<<"|"<<n;
return ss.str();
}
/*!
* \brief Hessenberg for Incremental QR version with chamelon back-end
*
......@@ -68,8 +62,8 @@ private:
Logger<P> &_logger;
int _max_nb_iter; /*!< maximum number of iterations */
int _mb; /*!< max number of (big) block line (_nbRHSx_nbRHS) in hessenberg */
int _nb; /*!< max number of (big) block column (_nbRHSx_nbRHS) in hessenberg */
int _mt; /*!< max number of (big) block line (_nbRHSx_nbRHS) in hessenberg */
int _nt; /*!< max number of (big) block column (_nbRHSx_nbRHS) in hessenberg */
int _nbRHS; /*!< number of right-hand-sides; AND size of one (big) block */
int _nb_block_col; /*!< number of block columns in current hessenberg */
int _nb_vect; /*!< number of columns in current hessenberg */
......@@ -163,6 +157,7 @@ private:
_logger.notify_facto_begin();
MORSE_enum trans = Arithmetik<S>::mtrans;
int64_t flops = 0;
/* STEP 1: Loop over each block in last column to
* apply already computed Householders
......@@ -176,6 +171,7 @@ private:
MORSE_desc_t *T = _tau[i].get();
int err = chameleon::ormqr<S>(trans, A.get(), T, C.get(), _seq.get());
flops += lapacke::ormqr_flops<S>(_nbRHS, _nbRHS, _nbRHS);
if (err != 0) {
FABULOUS_THROW(Kernel, "ormqr 'step' err="<<err);
}
......@@ -186,6 +182,7 @@ private:
MorseDesc2<S> A = get_sub_hess(k, k, 2, 1);
int err = chameleon::geqrf<S>(A.get(), tau, _seq.get());
flops += lapacke::geqrf_flops<S>(_nbRHS, _nbRHS);
if (err != 0) {
FABULOUS_THROW(Kernel, "geqrf 'last block' err="<<err);
}
......@@ -196,36 +193,37 @@ private:
/* STEP 3: Apply Q^H generated at step 2 to last block of RHS */
MorseDesc2<S> C = get_sub_rhs(k, 0, 2, 1);
err = chameleon::ormqr<S>(trans, A.get(), tau.get(), C.get(), _seq.get());
flops += lapacke::ormqr_flops<S>(_nbRHS, _nbRHS, _nbRHS);
if (err != 0) {
FABULOUS_THROW(Kernel, "ormqr 'RHS' err="<<err);
}
_last_column_factorized = true;
_logger.notify_facto_end();
_logger.notify_facto_end(flops);
}
public:
HessChamQR_submat(int max_krylov_space_size, int nbRHS, Logger<P> &logger):
_logger{logger},
_max_nb_iter{ (max_krylov_space_size/nbRHS) +1 },
_mb{ _max_nb_iter+1 },
_nb{ _max_nb_iter },
_mt{ _max_nb_iter+1 },
_nt{ _max_nb_iter },
_nbRHS{nbRHS},
_nb_block_col{0},
_nb_vect{0},
_hessenbergs_tiles{_mb, _nb, nbRHS},
_rhs_tiles{_mb, 1, nbRHS},
_rhs_tmp_tiles{_mb, 1, nbRHS},
_hess{MorseDesc2<S>::create_papa_descriptor(_hessenbergs_tiles, 0, 0, _mb, _nb)},
_rhs{MorseDesc2<S>::create_papa_descriptor(_rhs_tiles, 0, 0, _mb, 1)},
_rhs_tmp{MorseDesc2<S>::create_papa_descriptor(_rhs_tmp_tiles, 0, 0, _mb, 1)},
_hessenbergs_tiles{_mt, _nt, nbRHS},
_rhs_tiles{_mt, 1, nbRHS},
_rhs_tmp_tiles{_mt, 1, nbRHS},
_hess{MorseDesc2<S>::create_papa_descriptor(_hessenbergs_tiles, 0, 0, _mt, _nt)},
_rhs{MorseDesc2<S>::create_papa_descriptor(_rhs_tiles, 0, 0, _mt, 1)},
_rhs_tmp{MorseDesc2<S>::create_papa_descriptor(_rhs_tmp_tiles, 0, 0, _mt, 1)},
#ifdef FABULOUS_DEBUG_MODE
_original{_mb*nbRHS, _nb*nbRHS},
_original{_mt*nbRHS, _nt*nbRHS},
#endif
_last_column_factorized{false},
_solution_computed{false},
_Y{},
_HR{}
_Y{_nt*_nbRHS, _nbRHS},
_HR{ _mt*_nbRHS, _nbRHS}
{
_tau.reserve(_max_nb_iter);
MORSE_sequence_t *sequence = nullptr;
......@@ -265,9 +263,7 @@ public:
Block<S> alloc_least_square_sol()
{
if (!_solution_computed)
_Y = Block<S>{_nb_vect, _nbRHS};
return _Y;
return _Y.sub_block(0, 0, _nb_vect, _nbRHS);
}
template< class Block >
......@@ -288,11 +284,12 @@ public:
// compute the solution
chameleon::trsm<S>(R.get(), Lambda_tmp.get(), _seq.get());
int flops = lapacke::trsm_flops<S>(_nb_vect, _nbRHS);
MORSE_Tile_to_Lapack(Lambda_tmp.get(), _Y.get_ptr(), _Y.get_leading_dim());
_solution_computed = true;
_logger.notify_least_square_end();
_logger.notify_least_square_end(flops);
}
void compute_proj_residual(const Block<S>&, const Block<S>&)
......@@ -343,7 +340,7 @@ public:
FABULOUS_ASSERT( block_size == _nbRHS );
++ _nb_block_col;
_nb_vect += _nbRHS;
_HR = Block<S>{_nb_vect+_nbRHS, _nbRHS};
//_HR = Block<S>{_nb_vect+_nbRHS, _nbRHS};
_last_column_factorized = false;
_solution_computed = false;
......
......@@ -136,7 +136,8 @@ public:
}
_solution_computed = true;
_logger.notify_least_square_end();
int64_t flops = lapacke::gels_flops<S>(M, N, _nbRHS);
_logger.notify_least_square_end(flops);
}
void compute_proj_residual(Block<S> &LS, Block<S> &Y)
......
......@@ -133,7 +133,8 @@ public:
}
_solution_computed = true;
_logger.notify_least_square_end();
int64_t flops = lapacke::gels_flops<S>(M, N, _nbRHS);
_logger.notify_least_square_end(flops);
}
void compute_proj_residual(Block<S> &LS, Block<S> &Y)
......@@ -164,7 +165,6 @@ public:
Block<S> residual = _YY.sub_block(_nb_vect, 0, _nbRHS, _nbRHS);
auto MinMaxConv = residual.check_precision(epsilon);
return MinMaxConv;
}
......
......@@ -275,7 +275,8 @@ public:
FABULOUS_THROW(Kernel, "gels (least square) err="<<err);
}
_solution_computed = true;
_logger.notify_least_square_end();
int64_t flops = lapacke::gels_flops<S>(M, N, _nbRHS);
_logger.notify_least_square_end(flops);
}
/**
......
......@@ -340,11 +340,12 @@ public:
F.get_ptr(), F.get_leading_dim(),
_YY.get_ptr(), _YY.get_leading_dim()
);
int64_t flops = lapacke::gels_flops<S>(M, N, _nbRHS);
if (err != 0) {
FABULOUS_THROW(Kernel, "gels (least square) err="<<err);
}
_solution_computed = true;
_logger.notify_least_square_end();
_logger.notify_least_square_end(flops);
}
/**
......@@ -377,8 +378,11 @@ public:
_logger.notify_least_square_begin();
Block<S> residual = _YY.sub_block(_nb_vect, 0, _nbRHS, _nbRHS);
auto MinMaxConv = residual.check_precision(epsilon);
_logger.notify_least_square_end();
int64_t flops = 2L*_nbRHS;
if ( is_real_t<S>::value ) {
flops = flops * 4L;
}
_logger.notify_least_square_end(flops);
return MinMaxConv;
}
......
......@@ -67,6 +67,7 @@ private:
/* STEP 1: Loop over each block in last column to
apply already computed Householders */
char Trans = Arithmetik<S>::ltrans;
int64_t flops = 0;
int k = _nb_block_col-1;
for (int i = 0; i < k; ++i) {
......@@ -79,6 +80,7 @@ private:
A.get_ptr(), A.get_leading_dim(), tau,
C.get_ptr(), C.get_leading_dim()
);
flops += lapacke::ormqr_flops<S>(_nbRHS, _nbRHS, _nbRHS);
if (err != 0) {
FABULOUS_THROW(Kernel, "ormqr 'step' err="<<err);
}
......@@ -92,6 +94,7 @@ private:
int err = lapacke::geqrf( A.get_nb_row(), A.get_nb_col(),
A.get_ptr(), A.get_leading_dim(), tau);
flops += lapacke::geqrf_flops<S>(_nbRHS, _nbRHS);
if (err != 0) {
FABULOUS_THROW(Kernel, "geqrf 'last block' err="<<err);
}
......@@ -103,11 +106,12 @@ private:
A.get_ptr(), A.get_leading_dim(), tau.data(),
C.get_ptr(), C.get_leading_dim()
);
flops += lapacke::ormqr_flops<S>(_nbRHS, _nbRHS, _nbRHS);
if (err != 0) {
FABULOUS_THROW(Kernel, "ormqr 'RHS' err="<<err);
}
_last_column_factorized = true;
_logger.notify_facto_end();
_logger.notify_facto_end(flops);
}
public:
......@@ -176,14 +180,16 @@ public:
FABULOUS_DEBUG("R.row = " << R.get_nb_row());
*/
int M = R.get_nb_row();
int NRHS = Lambda.get_nb_col();
lapacke::trsm(
R.get_nb_row(), Lambda.get_nb_col(),
M, NRHS,
R.get_ptr(), R.get_leading_dim(),
_Y.get_ptr(), _Y.get_leading_dim()
);
_solution_computed = true;
_logger.notify_least_square_end();
int64_t flops = lapacke::trsm_flops<S>(M, NRHS);
_logger.notify_least_square_end(flops);
}
void compute_proj_residual(const Block<S>&, const Block<S>&)
......
......@@ -87,6 +87,7 @@ private:
/* STEP 1: Loop over each block in last column to
apply already computed Householders */
char Trans = Arithmetik<S>::ltrans;
int64_t flops = 0;
int k = _nb_block_col-1;
for (int i = 0; i < k; ++i) {
......@@ -98,6 +99,7 @@ private:
A.get_ptr(), A.get_leading_dim(), tau,
C.get_ptr(), C.get_leading_dim()
);
flops += lapacke::ormqr_flops<S>(_nbRHS, _nbRHS, _nbRHS);
if (err != 0) {
FABULOUS_THROW(Kernel, "ormqr 'step' err="<<err);
}
......@@ -109,6 +111,7 @@ private:
Block<S> A = this->qr_sub_block(k, k);
int err = lapacke::geqrf( A.get_nb_row(), A.get_nb_col(),
A.get_ptr(), A.get_leading_dim(), tau);
flops += lapacke::geqrf_flops<S>(_nbRHS, _nbRHS);
if (err != 0) {
FABULOUS_THROW(Kernel, "geqrf 'last block' err="<<err);
}
......@@ -120,12 +123,13 @@ private:
A.get_ptr(), A.get_leading_dim(), tau.data(),
C.get_ptr(), C.get_leading_dim()
);
flops += lapacke::ormqr_flops<S>(_nbRHS, _nbRHS, _nbRHS);
if (err != 0) {
FABULOUS_THROW(Kernel, "ormqr 'RHS' err="<<err);
}
_last_column_factorized = true;
_logger.notify_facto_end();
_logger.notify_facto_end(flops);
}
Block<S> rebuild_original()
......@@ -219,13 +223,17 @@ public:
Block<S> Lambda = _Lambda.sub_block(0, 0, _nb_vect, _nbRHS);
Y.copy(Lambda);
int M = R.get_nb_row();
int NRHS = Lambda.get_nb_col();
lapacke::trsm(
R.get_nb_row(), Lambda.get_nb_col(),
M, NRHS,
R.get_ptr(), R.get_leading_dim(),
_Y.get_ptr(), _Y.get_leading_dim()
);
_solution_computed = true;
_logger.notify_least_square_end();
int64_t flops = lapacke::trsm_flops<S>(M, NRHS);
_logger.notify_least_square_end(flops);
}
void compute_proj_residual(Block<S> &LS, Block<S> &Y)
......
......@@ -285,9 +285,9 @@ private:
int N = B.get_nb_col();
int K = V.get_nb_col();
assert( A.get_nb_row() == K );
assert( A.get_nb_col() == N );
assert( V.get_nb_row() == M );
FABULOUS_ASSERT( A.get_nb_row() == K );
FABULOUS_ASSERT( A.get_nb_col() == N );
FABULOUS_ASSERT( V.get_nb_row() == M );
int L = 0;
int NB = FABULOUS_MIN(_nbRHS, K);
......@@ -311,7 +311,7 @@ private:
}
_last_column_factorized = true;
_logger.notify_facto_end();
_logger.notify_facto_end( -1 /*FIXME */);
}
public:
......@@ -551,7 +551,7 @@ public:
get_ptr(), get_leading_dim(),
Hj.get_ptr(), Hj.get_leading_dim(), T );
_tau_t.push_back(T);
_logger.notify_facto_end();
_logger.notify_facto_end( -1 /*FIXME*/);
}
Block<S> alloc_least_square_sol()
......@@ -606,7 +606,7 @@ public:
}
_solution_computed = true;
_logger.notify_least_square_end();
_logger.notify_least_square_end( -1 /* FIXME */);
}
/*!
......
......@@ -68,18 +68,71 @@ int64_t dot_flops(int64_t m)
return 8L*m; /* 2*m (complex additions) + 6*m (complex multiplication) */
}
template<class S>
template<class S, class = enable_if_t<is_real_t<S>::value>>
int64_t geqrf_flops(int64_t m, int64_t n)
{
FABULOUS_ASSERT( m >= n );
return (2L*m*n*n) - (2.0/3.0) * (n*n*n);
}
template<class S>
int64_t orgqr_flops(int m, int n, int k)
template<class S, class = enable_if_t<is_complex_t<S>::value>, class=void>
int64_t geqrf_flops(int64_t m, int64_t n)
{
FABULOUS_ASSERT( m >= n );
return (8L*m*n*n) - (8.0/3.0) * (n*n*n);
}
template<class S, class = enable_if_t<is_real_t<S>::value>>
int64_t orgqr_flops(int64_t m, int64_t n, int64_t k)
{
FABULOUS_ASSERT( m >= n );
FABULOUS_ASSERT( n == k );
return (2L*m*n*k) - (2.0/3.0) * (k*k*k);
}
template<class S, class = enable_if_t<is_complex_t<S>::value>, class=void>
int64_t orgqr_flops(int64_t m, int64_t n, int64_t k)
{
FABULOUS_ASSERT( m >= n );
FABULOUS_ASSERT( n == k );
return (8L*m*n*k) - (8.0/3.0) * (k*k*k);
}
template<class S, class = enable_if_t<is_real_t<S>::value>>
int64_t ormqr_flops(int64_t m, int64_t n, int64_t k)
{
FABULOUS_ASSERT( m >= n );
FABULOUS_ASSERT( n == k );
return (2L*m*n*k) - (2.0/3.0) * (k*k*k);
}
template<class S, class = enable_if_t<is_complex_t<S>::value>, class=void>
int64_t ormqr_flops(int64_t m, int64_t n, int64_t k)
{
FABULOUS_ASSERT( m >= n );
FABULOUS_ASSERT