Commit a7f0b3bc authored by MIJIEUX Thomas

Add version with single hessenberg in distributed systems

parent b9719ff5
......@@ -68,6 +68,10 @@ target_include_directories(fabulous_cpp INTERFACE
target_compile_definitions(fabulous_cpp INTERFACE
HAVE_LAPACK_CONFIG_H
LAPACK_COMPLEX_CPP
MKL_Complex8=std::complex<float>
MKL_Complex16=std::complex<double>
LAPACKE_malloc=malloc
LAPACKE_free=free
)
# dependency on CBLAS/LAPACKE purposely not added here
target_compile_features(fabulous_cpp INTERFACE
......
......@@ -826,3 +826,7 @@ On 14/06/2017 at 14:34, Thomas Mijieux wrote:
CLOCK: [2017-06-28 Wed 17:16]--[2017-06-28 Wed 17:18] => 0:02
:END:
[2017-06-28 Wed 17:16]
** 2017-07 July
*** work on generic vector integration in fabulous to make the code work with mpf
mpf = Airbus linear algebra library for elfipole
elfipole = electromagnetic computation
#ifndef FABULOUS_ALGO_TYPE_HPP
#define FABULOUS_ALGO_TYPE_HPP
#include "fabulous/utils/Meta.hpp"
#include <fabulous/arnoldi/ArnoldiIB.hpp>
#include <fabulous/arnoldi/ArnoldiDR.hpp>
#include <fabulous/arnoldi/ArnoldiIBDR.hpp>
......@@ -10,7 +8,6 @@
#include <fabulous/arnoldi/BCGIterations.hpp>
#include <fabulous/arnoldi/BCGIterations2.hpp>
namespace fabulous {
#define FABULOUS_ALGO_OVERLOAD_STREAM_OPERATOR(Name_, Type_) \
......@@ -22,6 +19,7 @@ namespace fabulous {
FABULOUS_ALGO_OVERLOAD_STREAM_OPERATOR(BGMRES, bgmres::ARNOLDI_STD)
FABULOUS_ALGO_OVERLOAD_STREAM_OPERATOR(QR-BGMRES, bgmres::ARNOLDI_QR)
FABULOUS_ALGO_OVERLOAD_STREAM_OPERATOR(IB-BGMRES, bgmres::ARNOLDI_IB)
FABULOUS_ALGO_OVERLOAD_STREAM_OPERATOR(IB-BGMRES_dist, bgmres::ARNOLDI_IB_DIST)
FABULOUS_ALGO_OVERLOAD_STREAM_OPERATOR(QR-BGMRES, bgmres::ARNOLDI_QRDR)
FABULOUS_ALGO_OVERLOAD_STREAM_OPERATOR(IB-BGMRES, bgmres::ARNOLDI_IBDR)
FABULOUS_ALGO_OVERLOAD_STREAM_OPERATOR(QR-IB-BGMRES, bgmres::ARNOLDI_QRIB)
......
......@@ -22,6 +22,7 @@ template<class Hessenberg, class S> class ArnoldiIB;
#include <fabulous/hessenberg/HessIB.hpp>
#include <fabulous/hessenberg/HessIB_CHAM.hpp>
#include <fabulous/hessenberg/HessIB_dist.hpp>
namespace fabulous {
namespace bgmres {
......@@ -522,12 +523,22 @@ inline auto ib() { return ARNOLDI_IB{}; }
class ARNOLDI_CHAM_IB {};
inline auto cham_ib() { return ARNOLDI_CHAM_IB{}; }
/*! \brief NOT TEMPLATE designator for ArnoldiIB<HessIB_DIST<S>, S> */
class ARNOLDI_IB_DIST {};
inline auto ib_dist() { return ARNOLDI_IB_DIST{}; }
template<class S>
auto make_arnoldi(ARNOLDI_IB, Logger &logger, int nbRHS, const Parameters &param)
{
return ArnoldiIB<HessIB<S>, S>{logger, nbRHS, param};
}
template<class S>
auto make_arnoldi(ARNOLDI_IB_DIST, Logger &logger, int nbRHS, const Parameters &param)
{
return ArnoldiIB<HessIB_DIST<S>, S>{logger, nbRHS, param};
}
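// A minimal usage sketch (not from the original sources; `build_distributed_ib`
// and its arguments are hypothetical caller-side names): the new ib_dist()
// designator is resolved by the same make_arnoldi overload set as ib() and
// cham_ib().
//
//   template<class S>
//   auto build_distributed_ib(fabulous::Logger &logger, int nbRHS,
//                             const fabulous::Parameters &param)
//   {
//       // resolves to ArnoldiIB<HessIB_DIST<S>, S>
//       return fabulous::bgmres::make_arnoldi<S>(
//           fabulous::bgmres::ib_dist(), logger, nbRHS, param);
//   }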
#ifdef FABULOUS_USE_CHAMELEON
template<class S>
auto make_arnoldi(ARNOLDI_CHAM_IB, Logger &logger, int nbRHS, const Parameters &param)
......
#ifndef FABULOUS_HESS_IB_DIST_HPP
#define FABULOUS_HESS_IB_DIST_HPP
#include <memory>
namespace fabulous {
namespace bgmres {
template<class> class HessIB_DIST;
}
}
#include "fabulous/data/Block.hpp"
#include "fabulous/kernel/blas.hpp"
#include "fabulous/kernel/gels.hpp"
#include "fabulous/utils/Logger.hpp"
#include "fabulous/utils/mpi.hpp"
namespace fabulous {
namespace bgmres {
/**
* \brief Hessenberg for IB_DIST-BGMRes version.
*
* This holds the matrix denoted as \f$\mathscr{F}\f$ in the paper.
*
*/
template<class S>
class HessIB_DIST
{
public:
using value_type = typename Arithmetik<S>::value_type;
using primary_type = typename Arithmetik<S>::primary_type;
private:
using P = primary_type;
private:
Logger &_logger;
const int _rank;
const int _comm_size;
const int _nbRHS; /*!< number of right hand sides */
const int _max_krylov_space_size; /*!< maximum number of allocated columns */
int _nb_vect; /**< number of vectors inside the Hessenberg */
int _nb_block_col; /**< number of block columns */
std::vector<int> _block_size; /**< size of each block */
Block<S> _data; /*!< the hessenberg data */
Block<S> _data_tmp; /*!< the hessenberg data for solve purpose */
// needed to compute the rhs for Least square:
std::shared_ptr<Block<S>> _phi; /**< Ref : IB-BGMRES-DR : Page 10. */
Block<S> _lambda1; /**< initial value of RHS (projected problem) */
bool _solution_computed;
bool _solution_broadcasted;
Block<S> _Y_buf; /*!< least square solution buffer */
Block<S> _Y; /*!< least square solution */
public:
/**
* \brief create \f$\mathscr{F}\f$
* \param max_krylov_space_size maximum size of Krylov search space
* \param nbRHS number of right hand sides
* \param logger object logging the application trace and floating point operations count
*/
HessIB_DIST(int max_krylov_space_size, int nbRHS, Logger &logger):
_logger{logger},
_rank{compute_rank()},
_comm_size{compute_comm_size()},
_nbRHS{nbRHS},
_max_krylov_space_size{max_krylov_space_size},
_nb_vect{0},
_nb_block_col{0},
_data{ _max_krylov_space_size+_nbRHS,
(_rank == 0) ? _max_krylov_space_size : _nbRHS , "hessenberg_ib_dist"},
_data_tmp{ (_rank == 0) ? (_max_krylov_space_size+_nbRHS) : 0,
/**/ (_rank == 0) ? _max_krylov_space_size : 0 , "hessenberg_ib_dist_tmp" },
_phi{nullptr},
_solution_computed{false},
_solution_broadcasted{false},
_Y_buf{_max_krylov_space_size+_nbRHS, _nbRHS},
_Y{}
{
}
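// Layout recap (derived from the initializer list above): rank 0 stores the
// full extended Hessenberg of size (max_krylov_space_size + nbRHS) x
// max_krylov_space_size plus a same-sized scratch copy (_data_tmp) for the
// in-place least-squares solve; every other rank only keeps the current
// block column, (max_krylov_space_size + nbRHS) x nbRHS, and an empty
// _data_tmp.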
private:
void initialize_mpi() const
{
int initialized, provided;
MPI_Initialized(&initialized);
if (!initialized) {
MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided);
}
}
int compute_rank() const
{
initialize_mpi();
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
return rank;
}
int compute_comm_size() const
{
initialize_mpi();
int comm_size;
MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
return comm_size;
}
/**
* \brief \f$ \mathscr{F} \f$: the whole projected matrix
* \return \f$ \mathscr{F} \f$
*/
Block<S> get_F()
{
FABULOUS_ASSERT( _rank == 0 );
const int M = _nb_vect + _nbRHS;
const int N = _nb_vect;
return _data.sub_block( 0, 0, M, N );
}
Block<S> get_F_tmp()
{
const int M = _nb_vect + _nbRHS;
const int N = _nb_vect;
Block<S> F = _data.sub_block( 0, 0, M, N );
Block<S> F_copy = _data_tmp.sub_block( 0, 0, M, N );
F_copy.copy(F);
return F_copy;
}
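// The temporary copy matters because lapacke::gels overwrites its input
// matrix in place (see solve_least_square_local below): working on _data_tmp
// keeps the accumulated Hessenberg in _data intact across iterations.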
/**
* \brief \f$ \mathbb{H}_j \f$: the bottom part of \f$ \mathscr{F} \f$
* \return \f$ \mathbb{H}_j \f$
*/
Block<S> get_Hj()
{
const int I = _nb_vect;
const int M = _nbRHS;
const int N = _nb_vect;
return _data.sub_block( I, 0, M, N );
}
/**
* \brief Compute \f$\Lambda\f$ using \f$\Phi\f$ and \f$\Lambda_1\f$
*
* \param[out] Lambda \f$\Lambda = \Phi * \Lambda_1\f$
*/
int64_t compute_Lambda(Block<S> &Lambda)
{
if (_rank > 0 || !_phi) { // no inexact breakdown on R0
Lambda.copy(_lambda1);
return 0;
}
Lambda = (*_phi) * _lambda1;
return lapacke::flops::gemm<S>(_phi->get_nb_row(), _nbRHS, _nbRHS);
}
/**
* \brief Check if there is room for another iteration
*/
bool check_room(int block_size) const
{
return (_nb_vect + block_size <= _max_krylov_space_size);
}
public:
/** \brief number of vectors in \f$\mathscr{F}\f$ */
int get_nb_vect() const { return _nb_vect; }
/** \brief number of rows in \f$\mathscr{F}\f$ */
int get_nb_hess_line() const { return _nb_vect + _nbRHS; }
/**
* \brief set the initial value for the right-hand side of
* the Least Squares problem (projected problem)
* \param[in] lambda initial RHS for projected problem;
* R part of QR factorization of initial residual R0
*/
void init_lambda(const Block<S> &lambda)
{
if (_rank > 0) {
return;
}
//Check
if (lambda.get_nb_col() != _nbRHS || lambda.get_nb_row() != _nbRHS) {
FABULOUS_THROW(
Internal,
"Lambda dimensions are wrong while initiating lambda in Hess\n"
);
}
_lambda1 = lambda;
}
/**
* \brief Initialize \f$ \Phi \f$ (refer to Annex, Eq 2.5)
* \param p1 number of direction kept in the inexact breakdown on R0
*/
void init_phi(int p1)
{
if (_rank > 0) {
return;
}
if (_phi) {
FABULOUS_THROW(Internal, "This function cannot be called twice!");
}
const int N = _lambda1.get_nb_col();
_phi.reset(new Block<S>{N + p1, N});
for (int i = 0; i < N; ++i) {
_phi->at(i, i) = S{1.0};
}
}
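// Shape sketch (derived from the loop above, assuming Block zero-initializes
// its entries), with N = nbRHS and p1 kept directions: Phi starts as the
// N x N identity stacked on top of p1 zero rows, i.e. an (N + p1) x N block;
// update_phi() later grows it by p_{j+1} rows after each subsequent inexact
// breakdown.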
/**
* \brief Update \f$\Phi\f$ if an inexact breakdown happened on R0
*
* \param[in] W1_W2 \f$ [\mathbb{W}_1, \mathbb{W}_2] \f$ packed in one block
* \param p_jplus1 \f$p_{j+1}\f$ number of directions kept in
* the last inexact breakdown
*
* \todo Could be optimized in order to avoid copying,
* allocating and destroying
*/
int64_t update_phi(const Block<S> &W1_W2, int p_jplus1)
{
if (_rank > 0 || !_phi) {
return 0;
}
/* If _phi has been set, it means that an I.B. happened
* on R0, and we have a modified RHS for least square
* that depends on _phi */
const int NJ = _nb_vect;
const int M = _phi->get_nb_row() + p_jplus1;
Block<S> *new_phi = new Block<S>{M, _nbRHS};
// Init first n_j rows
Block<S> subphi_new = new_phi->sub_block(0, 0, NJ, _nbRHS);
subphi_new.copy(*_phi);
// Compute newphi(nj+1:nj+p,:) := W1_W2^{H} * _phi(nj+1:nj+p,:)
subphi_new = new_phi->sub_block( NJ, 0, _nbRHS, _nbRHS );
Block<S> subphi = _phi->sub_block( NJ, 0, _nbRHS, _nbRHS );
W1_W2.dot(subphi, subphi_new); // subphi_new = W1_W2^{H} * subphi;
_phi.reset(new_phi);
return lapacke::flops::gemm<S>(_nbRHS, _nbRHS, _nbRHS);
}
/**
* \brief display a dot for each non-zero coefficient of the matrix
*
* Used for debugging purposes
*/
void display_hess_extended_bitmap(std::string name = "")
{
std::cerr<<name<<"\n";
for (int j=0; j<_nb_vect+_nbRHS; ++j) {
int col = 0;
for (unsigned int i=0; i<_block_size.size(); ++i) {
for (int t=0; t<_block_size[i]; ++t) {
auto m = std::norm(_data.at(j, col));
if ( m != primary_type{0.0} )
std::cerr<<"."<<" ";
else
std::cerr<<" "<<" ";
col++;
}
std::cerr<<"| ";
}
std::cerr<<std::endl;
}
std::cerr<<std::endl;
}
/**
* \brief increase the size of the Hessenberg
*
* increment the cursors for the new block column
*/
void increase(int block_size)
{
FABULOUS_ASSERT( check_room(block_size) );
_block_size.push_back(block_size);
++ _nb_block_col;
_nb_vect += block_size;
_solution_computed = false;
_solution_broadcasted = false;
}
/**
* \brief Update \f$ H_j \f$ block into becoming new last row of
* \f$ \mathscr{L} \f$ and new \f$ G_j \f$ block
*
* \f$ L_{j+1,} = W_1^{H} * H_j \f$ <br/>
* \f$ G_j = W_2^{H} * H_j \f$ <br/>
*
* (this can be done in one operation as the blocks follow each other in memory)
*
* \param[in] W1_W2 \f$ \mathbb{W} = [\mathbb{W}_1, \mathbb{W}_2] \f$
* ( Directions )
*/
int64_t update_bottom_line(const Block<S>& W1_W2)
{
if (_rank > 0) {
return 0;
}
const int M = _nbRHS;
const int N = _nb_vect;
const int K = _nbRHS;
Block<S> Hj = get_Hj();
Block<S> tmp = Hj.copy();
// Save Hj in order to write directly inside F
W1_W2.dot(tmp, Hj); // Hj = W1_W2^{H} * tmp;
return lapacke::flops::gemm<S>(M, N, K);
}
Block<S> alloc_least_square_sol()
{
const int M = _nb_vect;
const int N = _nbRHS;
_Y = _Y_buf.sub_block( 0, 0, M, N );
if (!_solution_computed) {
const int M2 = _nb_vect + _nbRHS;
Block<S> YY = _Y_buf.sub_block( 0, 0, M2, N );
YY.zero();
}
return _Y;
}
private:
int64_t solve_least_square_local(Block<S> &Y)
{
FABULOUS_ASSERT( Y.get_ptr() == _Y.get_ptr() );
const int M = _nb_vect + _nbRHS;
const int N = _nb_vect;
Block<S> YY = _Y_buf.sub_block( 0, 0, M, _nbRHS );
int64_t flops = compute_Lambda(YY); // Y <- \Lambda; to be overwritten with Y in gels
Block<S> F = get_F_tmp();
// Let MKL parallelize the dense least-squares solve below,
// then restore the sequential setting afterwards.
mkl_set_dynamic(1);
mkl_set_num_threads(24);
const int err = lapacke::gels( // solve least square
M, N, _nbRHS, // this function overwrites both F and Lambda
F.get_ptr(), F.get_leading_dim(),
YY.get_ptr(), YY.get_leading_dim()
);
mkl_set_dynamic(0);
mkl_set_num_threads(1);
if (err != 0) {
FABULOUS_THROW(Kernel, "gels (least square) err="<<err);
}
flops += lapacke::flops::gels<S>(M, N, _nbRHS);
return flops;
}
public:
void solve_least_square(Block<S> &Y, bool broadcast_solution=true)
{
_logger.notify_least_square_begin();
int64_t flops = 0;
if (!_solution_computed) {
if (_rank == 0) {
flops += solve_least_square_local(Y);
}
_solution_computed = true;
}
if (broadcast_solution && !_solution_broadcasted) {
broadcast_block(Y, _rank, 0);
_solution_broadcasted = true;
}
_logger.notify_least_square_end(flops);
}
/**
* \brief Compute the residual of the projected Least Squares problem
* \param[out] LS \f$ LS = \Lambda - F*Y \f$
* \param[in] Y solution \f$ Y = argmin(||F*Y-\Lambda||) \f$ of the projected problem
* \return number of floating point operations performed
*/
int64_t compute_proj_residual(Block<S> &LS, Block<S> &Y)
{
FABULOUS_ASSERT( Y.get_ptr() == _Y.get_ptr() );
const int M = _nb_vect + _nbRHS;
const int N = _nbRHS;
const int K = _nb_vect;
int64_t flops = 0;
if (_rank == 0) {
flops = compute_Lambda(LS);// save \Lambda in LS in order to compute LS in the end
const Block<S> F = get_F();
LS -= F * Y;
flops += lapacke::flops::gemm<S>(M, N, K);
}
broadcast_block(LS, _rank, 0);
return flops;
}
std::tuple<P,P,bool>
check_least_square_residual(const std::vector<P> &epsilon)
{
Block<S> Y = alloc_least_square_sol();
solve_least_square(Y, false);
_logger.notify_least_square_begin();
P buf[3];
std::tuple<P,P,bool> MinMaxConv;
std::fill(buf, buf+3, P{0.0});
int64_t flops = 0;
if (_rank == 0) {
const int I = _nb_vect;
const int N = _nbRHS;
const Block<S> residual = _Y_buf.sub_block( I, 0, N, N );
MinMaxConv = min_max_conv(residual, epsilon);
flops = N * lapacke::flops::dot<S>(N);
buf[0] = std::get<0>(MinMaxConv);
buf[1] = std::get<1>(MinMaxConv);
buf[2] = std::get<2>(MinMaxConv) ? P{2.0} : P{0.0};
}
MPI_Bcast(buf, 3*mpi_datatype<P>::multiplier,
mpi_datatype<P>::type(), 0, MPI_COMM_WORLD);
if (_rank != 0) {
MinMaxConv = std::make_tuple(buf[0], buf[1], buf[2] > P{1.0});
}
_logger.notify_least_square_end(flops);
return MinMaxConv;
}
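// Wire-format note: the (min, max, converged) tuple is broadcast from rank 0
// as three values of the primary type; convergence is encoded as 2.0 / 0.0
// in buf[2] and decoded on the other ranks with the (buf[2] > 1.0) test.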
Block<S> get_hess_block()
{
return get_F();
}
/**
* \brief \f$H\f$: last block column of the Hessenberg to be filled by
* the orthogonalization process
* \return \f$H\f$
*/
Block<S> get_H()
{
const int N = _block_size.back();
const int M = _nb_vect;
const int J = _nb_vect - N;
if (_rank == 0) {
return _data.sub_block( 0, J, M, N );
} else {
return _data.sub_block( 0, 0, M, N );
}
}
/*
* The following member functions relate to the \f$ \mathbb{H}_j \f$ block at the bottom.
*/
/**
* \brief \f$C\f$ part (orthogonalization coefficients of P against W) to be filled
* during the orthogonalization process
* \return \f$C\f$
*/
Block<S> get_C()
{
const int N = _block_size.back();
const int M = _nbRHS - N; // nb direction discarded
const int I = _nb_vect;
const int J = _nb_vect - N;
if (_rank == 0) {
return _data.sub_block( I, J, M, N );
} else {
return _data.sub_block( I, 0, M, N );
}
}
/**
* \brief \f$D\f$: R part of QR factorization of \f$ \widetilde{W} \f$
* to be filled by the orthogonalization process
* \return \f$D\f$
*/
Block<S> get_D()
{
const int N = _block_size.back();
const int I = _nb_vect + (_nbRHS - N);
const int J = _nb_vect - N;
if (_rank == 0) {
return _data.sub_block( I, J, N, N );
} else {
return _data.sub_block( I, 0, N, N );
}
}
void notify_ortho_end() const {/*nop*/}
}; // end class HessIB_DIST
} // end namespace bgmres
} // end namespace fabulous
#endif // FABULOUS_HESS_IB_DIST_HPP
......@@ -6,9 +6,6 @@
#include "fabulous/utils/Arithmetic.hpp"
#include "fabulous/utils/Utils.hpp"
#include "fabulous/utils/Logger.hpp"
#include "fabulous/ext/cblas.h"