Commit 955f1685 authored by MIJIEUX Thomas's avatar MIJIEUX Thomas
Browse files

write a new version of incremental qr with chameleon

probably a better candidate to use distributed descriptors
parent 85eed46a
......@@ -15,6 +15,7 @@
#include <fabulous/hessenberg/HessQR.hpp>
#include <fabulous/hessenberg/HessQRDR.hpp>
#include <fabulous/hessenberg/HessQR_CHAMSUB.hpp>
#include <fabulous/hessenberg/HessQR_chameleon_subdesc_dist.hpp>
#include <fabulous/hessenberg/HessIB.hpp>
#include <fabulous/hessenberg/HessIB_CHAM.hpp>
#include <fabulous/hessenberg/HessIBDR.hpp>
......@@ -31,6 +32,7 @@ template<class S> using ARNOLDI_QRIBDR = ArnoldiIBDR<HessQRIBDR<S>, S>;
#ifdef FABULOUS_USE_CHAMELEON
template<class S> using ARNOLDI_CHAM_QR = ArnoldiDR<HessQR_CHAMSUB<S>, S>;
template<class S> using ARNOLDI_CHAM_QR_DIST = ArnoldiDR<HessQR_chameleon_subdesc_dist<S>, S>;
template<class S> using ARNOLDI_CHAM_TL = ArnoldiDR<HessDR_CHAM<S>, S>;
template<class S> using ARNOLDI_CHAM_IB = ArnoldiIB<HessIB_CHAM<S>, S>;
#endif // FABULOUS_USE_CHAMELEON
......@@ -43,9 +45,10 @@ inline auto ibdr() { return Algorithm<ARNOLDI_IBDR>{}; }
inline auto qribdr() { return Algorithm<ARNOLDI_QRIBDR>{}; }
#ifdef FABULOUS_USE_CHAMELEON
inline auto cham_qr() { return Algorithm<ARNOLDI_CHAM_QR>{}; }
inline auto cham_tl() { return Algorithm<ARNOLDI_CHAM_TL>{}; }
inline auto cham_ib() { return Algorithm<ARNOLDI_CHAM_IB>{}; }
inline auto cham_qr() { return Algorithm<ARNOLDI_CHAM_QR>{}; }
inline auto cham_tl() { return Algorithm<ARNOLDI_CHAM_TL>{}; }
inline auto cham_ib() { return Algorithm<ARNOLDI_CHAM_IB>{}; }
inline auto cham_qr_dist() { return Algorithm<ARNOLDI_CHAM_QR_DIST>{}; }
#endif // FABULOUS_USE_CHAMELEON
} // end namespace bgmres
......@@ -75,6 +78,7 @@ FABULOUS_ALGO_OVERLOAD_STREAM_OPERATOR(QR-IB-BGMRES, bgmres::ARNOLDI_QRIBDR)
FABULOUS_ALGO_OVERLOAD_STREAM_OPERATOR(CHAM_QR-BGMRES, bgmres::ARNOLDI_CHAM_QR)
FABULOUS_ALGO_OVERLOAD_STREAM_OPERATOR(CHAM_TL-BGMRES, bgmres::ARNOLDI_CHAM_TL)
FABULOUS_ALGO_OVERLOAD_STREAM_OPERATOR(CHAM_IB-BGMRES, bgmres::ARNOLDI_CHAM_IB)
FABULOUS_ALGO_OVERLOAD_STREAM_OPERATOR(CHAM_QR_DIST-BGMRES, bgmres::ARNOLDI_CHAM_QR_DIST)
#endif // FABULOUS_USE_CHAMELEON
FABULOUS_ALGO_OVERLOAD_STREAM_OPERATOR(BGCR, bgcr::Arnoldi)
......
......@@ -60,18 +60,18 @@ template<class U>
class MorseDesc_p
{
private:
const DataChameleonI &_tiles;
const TileMapperI &_tile_mapper;
DescPtr _descriptor;
FABULOUS_DELETE_COPY_MOVE(MorseDesc_p);
public:
// parent descriptor
MorseDesc_p(const DataChameleonI &p, int mt, int nt):
_tiles{p}
MorseDesc_p(const TileMapperI &tile_mapper, int mt, int nt):
_tile_mapper{tile_mapper}
{
const int tile_size = _tiles.get_inner_tile_size();
const int BS = _tiles.get_block_size();
const int tile_size = _tile_mapper.get_inner_tile_size();
const int BS = _tile_mapper.get_block_size();
const int lm = mt * BS;
const int ln = nt * BS;
......@@ -82,7 +82,7 @@ public:
lm, ln, // whole matrix size
0, 0, // submatrix start
lm, ln, // submatrix size
1, 1,
tile_mapper.get_p(), tile_mapper.get_q(),
desc_get_blk_addr<U>,
desc_get_blk_ld<U>,
desc_get_rankof<U>
......@@ -92,11 +92,11 @@ public:
// sub descriptor
MorseDesc_p(const MorseDesc_p<U> &o, int it, int jt, int mt, int nt):
_tiles(o._tiles)
_tile_mapper(o._tile_mapper)
{
// int tile_size = _tiles.get_tile_size();
// int tile_size = _tile_mapper.get_tile_size();
const int BS = _tiles.get_block_size();
const int BS = _tile_mapper.get_block_size();
const int i = it * BS;
const int j = jt * BS;
const int lm = mt * BS;
......@@ -104,11 +104,10 @@ public:
MORSE_desc_t *desc;
desc = morse_desc_submatrix(o._descriptor.get(), i, j, lm, ln);
_descriptor.reset(desc, DescDeleter{});
// FABULOUS_ASSERT( it <= p._mt );
// FABULOUS_ASSERT( jt <= p._nt );
_descriptor.reset(desc, DescDeleter{});
}
MORSE_desc_t *get()
......@@ -119,19 +118,18 @@ public:
public:
void *get_blk_addr(int i, int j) const
{
return _tiles.get_blk_addr(i, j);
return _tile_mapper.get_blk_addr(i, j);
}
int get_blk_rankof(int i, int j) const
{
return _tiles.get_rank_of(i, j);
return _tile_mapper.get_rank_of(i, j);
}
int get_blk_ld(int i) const
{
return _tiles.get_blk_ld(i);
return _tile_mapper.get_blk_ld(i);
}
}; // end class MorseDesc_p
/**
......@@ -146,8 +144,8 @@ class MorseDesc
private:
std::shared_ptr<MorseDesc_p<U>> _pimpl;
private:
MorseDesc(const DataChameleonI &tiles, int mt, int nt):
_pimpl{new MorseDesc_p<U>{tiles, mt, nt}}
MorseDesc(const TileMapperI &tile_mapper, int mt, int nt):
_pimpl{new MorseDesc_p<U>{tile_mapper, mt, nt}}
{
}
......@@ -157,9 +155,9 @@ private:
}
public:
static MorseDesc create_parent_descriptor(const DataChameleonI &tiles, int mt, int nt)
static MorseDesc create_parent_descriptor(const TileMapperI &tile_mapper, int mt, int nt)
{
return MorseDesc{tiles, mt, nt};
return MorseDesc{tile_mapper, mt, nt};
}
static MorseDesc create_sub_descriptor(const MorseDesc& o, int it, int jt, int mt, int nt)
......
......@@ -15,6 +15,7 @@ template<class> class TileMapper;
template<class> class HessenbergTileMapper;
template<class> class ClassicTileMapper;
template<class> class LapackTileMapper;
template<class> class AutomaticStarpuTileMapper;
}
#include "fabulous/utils/Error.hpp"
......@@ -49,9 +50,9 @@ inline int compute_inner_tile_size(const int nb)
}
/**
* \brief DataInterface for MorseDesc objects (base class)
* \brief DataInterface for tile mappers objects (base class)
*/
class DataChameleonI
class TileMapperI
{
public:
virtual void *get_blk_addr(int i, int j) const = 0;
......@@ -59,8 +60,10 @@ public:
virtual int get_blk_ld(int i) const = 0;
virtual int get_inner_tile_size() const = 0;
virtual int get_block_size() const = 0;
virtual ~DataChameleonI(){}
}; // end class DataChameleonI
virtual int get_p() const { return 1; }
virtual int get_q() const { return 1; }
virtual ~TileMapperI(){}
}; // end class TileMapperI
/**
* \brief Tile container for MorseDesc Object ('HESSENBERG')
......@@ -85,7 +88,7 @@ public:
* as the ClassicTileMapper coordinates system (DPLASMA/Chameleon style).
*/
template<class S>
class HessenbergTileMapper : public DataChameleonI
class HessenbergTileMapper : public TileMapperI
{
private:
const int _mt, _nt;
......@@ -177,7 +180,7 @@ private:
* as the global coordinates system (DPLASMA/Chameleon style).
*/
template<class S>
class ClassicTileMapper : public DataChameleonI
class ClassicTileMapper : public TileMapperI
{
private:
const int _mt, _nt;
......@@ -259,7 +262,7 @@ private:
* as the global coordinates system (DPLASMA/Chameleon style).
*/
template<class S>
class LapackTileMapper : public DataChameleonI
class LapackTileMapper : public TileMapperI
{
private:
const int _mt, _nt;
......@@ -320,6 +323,78 @@ private:
}; // end class LapackTileMapper
/**
* \brief Tile container for MorseDesc Object ('Classic')
* blocks are allocated automatically by the runtime
*/
template<class S>
class AutomaticStarpuTileMapper : public TileMapperI
{
private:
const int _mt, _nt;
const int _nb;
const int _block_bsize;
const int _nb_block;
const int _inner_tile_size;
const int _tile_bsize;
const int _inner_block_tile_ratio;
int _p, _q;
public:
AutomaticStarpuTileMapper(int mt, int nt, int nb):
_mt{mt}, _nt{nt},
_nb{nb},
_block_bsize{nb*nb},
_nb_block{mt*nt},
_inner_tile_size{compute_inner_tile_size(nb)},
_tile_bsize{_inner_tile_size * _inner_tile_size},
_inner_block_tile_ratio{nb / _inner_tile_size}
{
int nb_proc = -1;
MORSE_Comm_size(&nb_proc);
_p = _q = nb_proc;
}
int get_inner_tile_size() const override
{
return _inner_tile_size;
}
int get_block_size() const override
{
return _nb;
}
private:
virtual int get_p() const override
{
return _p;
}
virtual int get_q() const override
{
return _q;
}
void *get_blk_addr(int, int) const override
{
return nullptr;
}
int get_blk_ld(int) const override
{
return _inner_tile_size;
}
int get_rank_of(int m, int n) const override
{
return (m % _p) * _q + (n % _q);
}
}; // end class AutomaticStarpuTileMapper
} // end namespace fabulous
#endif // FABULOUS_TILE_MAPPER_HPP
#ifndef FABULOUS_HESS_QR_CHAM_SUBDESC_DIST_HPP
#define FABULOUS_HESS_QR_CHAM_SUBDESC_DIST_HPP
#ifdef FABULOUS_USE_CHAMELEON
#include <unordered_map>
#include <utility>
#include <memory>
#include <tuple>
#include <vector>
#include <string>
#include <sstream>
#include <cassert>
#include <morse.h>
namespace fabulous {
namespace bgmres {
template<class S> class HessQR_chameleon_subdesc_dist;
}
}
#include "fabulous/utils/Arithmetic.hpp"
#include "fabulous/utils/Error.hpp"
#include "fabulous/utils/Chameleon.hpp"
#include "fabulous/utils/Logger.hpp"
#include "fabulous/kernel/chameleon.hpp"
#include "fabulous/kernel/flops.hpp"
#include "fabulous/data/Block.hpp"
#include "fabulous/data/MorseDesc.hpp"
namespace fabulous {
namespace bgmres {
/*!
* \brief Hessenberg for Incremental QR version with chamelon back-end
*
* This implementation use the 'low level' chameleon API (*_Tile_Async)
*
* Format: <br/>
\verbatim
| R X X |
| H R X |
| H R |
| H |
\endverbatim
* with: <br/>
* R Triangular Sup <br/>
* H Householders reflectors (trapezoidal)
*
* \note only the upper hessenberg block are allocated.
*/
template<class S>
class HessQR_chameleon_subdesc_dist
{
public:
using value_type = typename Arithmetik<S>::value_type;
using primary_type = typename Arithmetik<S>::primary_type;
private:
using P = primary_type;
private:
Logger &_logger;
const int _max_nb_iter; /*!< maximum number of iterations */
const int _mt; /*!< max number of (big) block line (_nbRHSx_nbRHS) in hessenberg */
const int _nt; /*!< max number of (big) block column (_nbRHSx_nbRHS) in hessenberg */
const int _nbRHS; /*!< number of right-hand-sides; AND size of one (big) block */
int _nb_block_col; /*!< number of block columns in current hessenberg */
int _nb_vect; /*!< number of columns in current hessenberg */
std::vector<DescPtr> _tau; /*!< tau factors for each QR factorization*/
AutomaticStarpuTileMapper<S> _hessenbergs_tiles; /*!< the hessenbergs tiles */
AutomaticStarpuTileMapper<S> _rhs_tiles; /*!< the (least square) right-hand-side tiles */
LapackTileMapper<S> _Y_tiles; /*!< Y (gels solution local) */
LapackTileMapper<S> _HR_tiles; /*!< HR ortho coefficient */
LapackTileMapper<S> _lambda1_tiles; /*!< lambda1 initial RHS */
LapackTileMapper<S> _residual_tiles; /*!< used to compute residual norm */
MorseDesc<S> _hess;
MorseDesc<S> _rhs;
MorseDesc<S> _Y;
MorseDesc<S> _HR;
MorseDesc<S> _lambda1;
MorseDesc<S> _residual;
int _Y_acquired;
int _HR_acquired;
SeqPtr _seq; /*!< MORSE sequence object */
#ifdef FABULOUS_DEBUG_MODE
Block<S> _orig; /*!< original (not factorized) hessenberg
stored as Lapack matrix for debug purpose
(more specifically, in the check_arnoldi_formula() function) */
#endif
bool _last_column_factorized; /*!< whether last column was factorized */
bool _solution_computed; /*!< whether solution was computed(for last column) */
Block<S> _Y_block; /*!< least square solution */
Block<S> _HR_block; /*!< last column as filled by orthogonalization */
Block<S> _lambda1_block; /*!< initial rhs for projected problem */
Block<S> _residual_block; /*!< temporary buffer to eval residual norm */
FABULOUS_DELETE_COPY_MOVE(HessQR_chameleon_subdesc_dist);
public:
HessQR_chameleon_subdesc_dist(int max_krylov_space_size, int nbRHS, Logger &logger):
_logger{logger},
_max_nb_iter{ (max_krylov_space_size/nbRHS) +1 },
_mt{ _max_nb_iter+1 },
_nt{ _max_nb_iter },
_nbRHS{nbRHS},
_nb_block_col{0},
_nb_vect{0},
// TILE MAPPERS
_hessenbergs_tiles{_mt, _nt, nbRHS},
_rhs_tiles{_mt, 2, nbRHS},
_Y_tiles{_mt, 1, nbRHS},
_HR_tiles{_mt, 1, nbRHS},
_lambda1_tiles{1, 1, nbRHS},
_residual_tiles{1, 1, nbRHS},
// CHAMELEON DESCRIPTOR
_hess { MorseDesc<S>::create_parent_descriptor(_hessenbergs_tiles, _mt, _nt)},
_rhs { MorseDesc<S>::create_parent_descriptor(_rhs_tiles, _mt, 2)},
_Y { MorseDesc<S>::create_parent_descriptor(_Y_tiles, _mt, 1)},
_HR { MorseDesc<S>::create_parent_descriptor(_HR_tiles, _mt, 1)},
_lambda1 { MorseDesc<S>::create_parent_descriptor(_lambda1_tiles, 1, 1)},
_residual { MorseDesc<S>::create_parent_descriptor(_residual_tiles, 1, 1)},
_Y_acquired{0},
_HR_acquired{0},
#ifdef FABULOUS_DEBUG_MODE
_orig{_mt*nbRHS, _nt*nbRHS},
#endif
_last_column_factorized{false},
_solution_computed{false},
// LOCAL BLOCKS
_Y_block { _Y_tiles.get_block() },
_HR_block{ _HR_tiles.get_block() },
_lambda1_block {_lambda1_tiles.get_block() },
_residual_block{_residual_tiles.get_block() }
{
_tau.reserve(_max_nb_iter);
MORSE_sequence_t *sequence = nullptr;
MORSE_Sequence_Create(&sequence);
_seq.reset(sequence, SeqDeleter{});
}
~HessQR_chameleon_subdesc_dist()
{
while (_HR_acquired > 0) {
MORSE_Desc_Release( _HR.get() );
-- _HR_acquired;
}
while (_Y_acquired > 0) {
MORSE_Desc_Release( _Y.get() );
-- _Y_acquired;
}
}
template<class Block>
void debug_dump_orig(Block &Hm, int m, int n)
{
#ifdef FABULOUS_DEBUG_MODE
Hm = _orig.sub_block(0, 0, m, n);
#else
FABULOUS_THROW(
Unsupported,
"You must define the FABULOUS_DEBUG_MODE preprocessor constant to use this function"
);
(void) Hm; (void) m; (void) n;
#endif
}
private:
MorseDesc<S> get_sub(MorseDesc<S> &parent, int it, int jt, int mt, int nt)
{
return MorseDesc<S>::create_sub_descriptor(parent, it, jt, mt, nt);
}
/*!
* \brief compute qr factorization of the last column of
* the block hessenberg matrix
*/
void factorize_last_column()
{
if (_last_column_factorized) {
return;
}
_logger.notify_facto_begin();
MORSE_enum trans = Arithmetik<S>::mtrans;
namespace fps = lapacke::flops;
int64_t flops = 0;
/* STEP 1: Loop over each block in last column to
* apply already computed Householders
* This is almost the only loop that can benefit from
* Chameleon Asynchronicity
* (and it may require very large nbRHS to actually see a speedup)
*/
const int KT = _nb_block_col-1;
for (int i = 0; i < KT; ++i) {
MorseDesc<S> A = get_sub(_hess, i, i, 2, 1);
MorseDesc<S> C = get_sub(_hess, i, KT, 2, 1);
MORSE_desc_t *T = _tau[i].get();
int err = chameleon::unmqr<S>(trans, A.get(), T, C.get(), _seq.get());
flops += fps::unmqr<S>(2*_nbRHS, _nbRHS, _nbRHS);
if (err != 0) {
FABULOUS_THROW(Kernel, "unmqr 'step' err="<<err);
}
}
/* STEP 2: Call QR on the very last block of last col */
DescPtr tau;
MorseDesc<S> A = get_sub(_hess, KT, KT, 2, 1);
int err = chameleon::geqrf<S>(A.get(), tau, _seq.get());
flops += fps::geqrf<S>(2*_nbRHS, _nbRHS);
if (err != 0) {
FABULOUS_THROW(Kernel, "geqrf 'last block' err="<<err);
}
_tau.push_back(tau);
/* STEP 3: Apply Q^H generated at step 2 to last block of RHS */
MorseDesc<S> C = get_sub(_rhs, KT, 0, 2, 1);
err = chameleon::unmqr<S>(trans, A.get(), tau.get(), C.get(), _seq.get());
flops += fps::unmqr<S>(2*_nbRHS, _nbRHS, _nbRHS);
if (err != 0) {
FABULOUS_THROW(Kernel, "unmqr 'RHS' err="<<err);
}
_last_column_factorized = true;
_logger.notify_facto_end(flops);
}
public:
/*!
* \brief set the \f$\Lambda_1\f$ block (which is nbRHS x nbRHS)
* (RHS of least square problem)
*/
void set_rhs(Block<S> &lambda1)
{
FABULOUS_ASSERT( lambda1.get_nb_col() == _nbRHS );
FABULOUS_ASSERT( lambda1.get_nb_row() == _nbRHS );
MORSE_Desc_Acquire( _lambda1.get() );
// lambda1.display("lambda1");
_lambda1_block.copy(lambda1);
MORSE_Desc_Release( _lambda1.get() );
MorseDesc<S> LAMBDA_1_SLOT = get_sub(_rhs, 0, 0, 1, 1);
// equivalent to MORSE_Lapack_to_Tile without extra descriptor creation each time this is called
// ( because this would 'consume' a mpi tag )
chameleon::lacpy<S>(MorseUpperLower, _lambda1.get(), LAMBDA_1_SLOT.get(), _seq.get());
}
Block<S> get_H()
{
auto h = _HR_block.sub_block( 0, 0, _nb_vect, _nbRHS );
h.zero();
return h;
}
Block<S> get_R()
{
auto r = _HR_block.sub_block( _nb_vect, 0, _nbRHS, _nbRHS );
r.zero();
return r;
}
Block<S> get_H1new()
{
FABULOUS_THROW(Unsupported, "HessChamQR not compatible with DeflatedRestarting");
return Block<S>{};
}
void notify_ortho_end()
{
const int KT = _nb_block_col;
FABULOUS_ASSERT( _nb_vect == KT * _nbRHS );
// FABULOUS_ASSERT( _la_tmp.get_nb_col() == _nbRHS );
// FABULOUS_ASSERT( _la_tmp.get_nb_row() == _nb_vect+_nbRHS );
#ifdef FABULOUS_DEBUG_MODE
const int N = _nbRHS;
const int M = _nb_vect + _nbRHS;
const int J = _nb_vect - N;
Block<S> HR_orig = _orig.sub_block( 0, J, M, N );
HR_orig.copy(_HR_block);
#endif
while (_HR_acquired > 0) {
MORSE_Desc_Release( _HR.get() );
-- _HR_acquired;
}
MorseDesc<S> HR_SLOT = get_sub(_hess, 0, KT-1, KT+1, 1);
MorseDesc<S> HR_from = get_sub(_HR, 0, 0, KT+1, 1);
// equivalent to MORSE_Lapack_to_Tile without extra descriptor creation
// ( because this would 'consume' a mpi tag )
chameleon::lacpy<S>(MorseUpperLower, HR_from.get(), HR_SLOT.get(), _seq.get());
}
Block<S> alloc_least_square_sol()
{
MORSE_Desc_Acquire( _Y.get() );
++ _Y_acquired;
return _Y_block.sub_block(0, 0, _nb_vect, _nbRHS);