Commit 3b13f0f7 authored by Rayan Chikhi's avatar Rayan Chikhi
Browse files

updated boophf to bbhash!

parent 912b3fa8
// BooPHF library
// intended to be a minimal perfect hash function with fast and low memory construction, at the cost of (slightly) higher bits/elem than other state of the art libraries once built.
// should work with arbitrary large number of elements, based on a cascade of "collision-free" bit arrays
#pragma once
#include <stdio.h>
......@@ -194,8 +194,7 @@ namespace boomphf {
Progress () : timer_mode(0) {}
//include timer, to print ETA ?
};
////////////////////////////////////////////////////////////////
......@@ -203,7 +202,8 @@ namespace boomphf {
#pragma mark hasher
////////////////////////////////////////////////////////////////
// hash_set_t holds the full set of per-key hash values (10 of them, matching
// the 10-seed tables used by the hash functors below); hash_pair_t is the
// 2-word state of the xorshift generator.
typedef std::array<uint64_t,10> hash_set_t;
typedef std::array<uint64_t,2> hash_pair_t;
......@@ -232,7 +232,7 @@ namespace boomphf {
{
hash_set_t hset;
for(size_t ii=0;ii<7; ii++)
for(size_t ii=0;ii<10; ii++)
{
hset[ii] = hash64 (key, _seed_tab[ii]);
}
......@@ -291,52 +291,10 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
private:
HashFunctors<Item> hashFunctors;
};
// the SingleHasher_t must have operator()(elem_t key, uint64_t seed)
//this class simply generates a list of seeds
// the SingleHasher_t must have operator()(elem_t key, uint64_t seed)
// This class simply generates a table of fixed seeds and feeds each one to
// SingleHasher_t, yielding one independent hash value per hash_set_t slot.
template <typename Item, class SingleHasher_t> class IndepHashFunctors
{
public:
    IndepHashFunctors ()
    {
        generate_hash_seed ();
    }

    // Compute one hash value per slot of hash_set_t for `key`.
    hash_set_t operator () (const Item& key)
    {
        hash_set_t hset;
        // Loop bound tied to the actual hash_set_t size instead of the
        // previous hard-coded 7: keeps every slot initialized if hash_set_t
        // grows (the seed table below provides up to MAXNBFUNC = 10 seeds).
        for(size_t ii=0; ii<hset.size(); ii++)
        {
            hset[ii] = singleHasher (key, _seed_tab[ii]);
        }
        return hset;
    }

private:
    // Fill _seed_tab with deterministic 64-bit seeds derived from a fixed
    // base table; the second pass mixes neighboring entries together.
    void generate_hash_seed ()
    {
        static const uint64_t rbase[MAXNBFUNC] =
        {
            0xAAAAAAAA55555555ULL, 0x33333333CCCCCCCCULL, 0x6666666699999999ULL, 0xB5B5B5B54B4B4B4BULL,
            0xAA55AA5555335533ULL, 0x33CC33CCCC66CC66ULL, 0x6699669999B599B5ULL, 0xB54BB54B4BAA4BAAULL,
            0xAA33AA3355CC55CCULL, 0x33663366CC99CC99ULL
        };

        for (size_t i=0; i<MAXNBFUNC; ++i) { _seed_tab[i] = rbase[i]; }
        // decorrelate consecutive seeds by multiplying with the entry 3 ahead
        for (size_t i=0; i<MAXNBFUNC; ++i) { _seed_tab[i] = _seed_tab[i] * _seed_tab[(i+3) % MAXNBFUNC] ; }
    }

    static const size_t MAXNBFUNC = 10;
    uint64_t _seed_tab[MAXNBFUNC];
    SingleHasher_t singleHasher;
};
template <typename Item, class SingleHasher_t> class XorshiftHashFunctors
{
/* Xorshift128*
......@@ -356,6 +314,8 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
a nonzero 64-bit seed, we suggest to pass it twice through
MurmurHash3's avalanching function. */
// uint64_t s[ 2 ];
uint64_t next(uint64_t * s) {
uint64_t s1 = s[ 0 ];
const uint64_t s0 = s[ 1 ];
......@@ -365,7 +325,31 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
}
public:
//this one returns all the 7 hashes
// Seed state word 0 with the first base hash of the key and return it.
uint64_t h0(hash_pair_t & s, const Item& key )
{
    return s[0] = singleHasher (key, 0xAAAAAAAA55555555ULL);
}
// Seed state word 1 with the second base hash of the key and return it.
uint64_t h1(hash_pair_t & s, const Item& key )
{
    return s[1] = singleHasher (key, 0x33333333CCCCCCCCULL);
}
//return next hash an update state s
// Advance the 128-bit state `s` one xorshift128+ step and return the next
// hash value (sum of the new state word and the previous one).
uint64_t next(hash_pair_t & s ) {
    uint64_t x = s[ 0 ];
    const uint64_t y = s[ 1 ];
    s[ 0 ] = y;
    x ^= x << 23; // a
    s[ 1 ] = x ^ y ^ ( x >> 17 ) ^ ( y >> 26 ); // b, c
    return s[ 1 ] + y;
}
//this one returns all the hashes
hash_set_t operator () (const Item& key)
{
uint64_t s[ 2 ];
......@@ -378,7 +362,7 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
s[0] = hset[0];
s[1] = hset[1];
for(size_t ii=2;ii< 7 /* it's much better have a constant here, for inlining; this loop is super performance critical*/; ii++)
for(size_t ii=2;ii< 10 /* it's much better have a constant here, for inlining; this loop is super performance critical*/; ii++)
{
hset[ii] = next(s);
}
......@@ -428,7 +412,9 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
public:
bitVector() : _size(0)
{}
{
_bitArray = NULL;
}
bitVector(uint64_t n) : _size(n)
{
......@@ -462,6 +448,21 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
memset(_bitArray,0,_nchar*sizeof(uint64_t));
}
//clear collisions in interval, only works with start and size multiple of 64
// Clear, inside [start, start+size), every bit of this vector that is set in
// the collision bitarray `cc`, then reset `cc`. Works on whole 64-bit words,
// hence start and size must both be multiples of 64.
void clearCollisions(uint64_t start, size_t size, bitVector * cc)
{
    assert( (start & 63) ==0);
    assert( (size & 63) ==0);

    const uint64_t first_word = start / 64ULL;
    const uint64_t nb_words = size / 64ULL;
    for (uint64_t w = 0; w < nb_words; w++)
    {
        _bitArray[first_word + w] &= ~(cc->get64(w));
    }

    cc->clear();
}
//clear interval, only works with start and size multiple of 64
void clear(uint64_t start, size_t size)
{
......@@ -506,6 +507,16 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
}
// Return the value (0 or 1) of the bit at position `pos`;
// delegates to operator[].
uint64_t get(uint64_t pos) const
{
    return (*this)[pos];
}
// Return the raw 64-bit word at index `cell64` of the underlying array
// (word-granularity access used by clearCollisions).
uint64_t get64(uint64_t cell64) const
{
    return _bitArray[cell64];
}
//set bit pos to 1
void set(uint64_t pos)
{
......@@ -523,18 +534,22 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
}
void build_ranks()
//return value of last rank
// add offset to all ranks computed
uint64_t build_ranks(uint64_t offset =0)
{
_ranks.reserve(2+ _size/_nb_bits_per_rank_sample);
uint64_t curent_rank = 0;
uint64_t curent_rank = offset;
for (size_t ii = 0; ii < _nchar; ii++) {
if (((ii*64) % _nb_bits_per_rank_sample) == 0) {
_ranks.push_back(curent_rank);
}
curent_rank += popcount_64(_bitArray[ii]);
}
return curent_rank;
}
uint64_t rank(uint64_t pos) const
......@@ -552,97 +567,42 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
return r;
}
protected:
uint64_t* _bitArray;
uint64_t _size;
uint64_t _nchar;
// epsilon = 64 / _nb_bits_per_rank_sample bits
// additional size for rank is epsilon * _size
static const uint64_t _nb_bits_per_rank_sample = 512; //512 seems ok
std::vector<uint64_t> _ranks;
};
////////////////////////////////////////////////////////////////
#pragma mark -
#pragma mark bloom
////////////////////////////////////////////////////////////////
//simple blocked bloom class, does not compute hashes internally, must be given the N hashes as a parameter
//thus does not need to be templated
//(allows reuse of hashes for other things externally)
// Lookup table: bit_mask[i] selects bit i (least-significant bit first)
// within a byte.
static u_int8_t bit_mask [] = {
    0x01, //00000001
    0x02, //00000010
    0x04, //00000100
    0x08, //00001000
    0x10, //00010000
    0x20, //00100000
    0x40, //01000000
    0x80  //10000000
};
//blocked bloom
class bbloom
{
public:
// Build a blocked bloom filter of roughly tai_bloom bits, probed with
// nbHash hash functions; each key's probes stay within one 2^block_nbits-bit
// block. The array is padded by two blocks so block accesses near the end
// cannot run past the allocation.
bbloom (uint64_t tai_bloom, size_t nbHash = 7, size_t block_nbits = 12)
: _n_hash_func(nbHash), _blooma(0), _tai(tai_bloom+2*(1<<block_nbits)), _nchar(0), _nbits_BlockSize(block_nbits)
{
    // number of bytes backing _tai bits (+1 to round up)
    _nchar = (1ULL+_tai/8ULL);
    _blooma = (unsigned char *) malloc (_nchar*sizeof(unsigned char));
    memset (_blooma, 0, _nchar*sizeof(unsigned char));

    // mask extracting a block-local offset from a hash value
    _mask_block = (1ULL<<_nbits_BlockSize) - 1ULL;
    _reduced_tai = _tai - 2ULL*(1ULL<<_nbits_BlockSize) ;//2* for neighbor coherent
}
// Release the filter's byte array.
virtual ~bbloom () { free (_blooma); }

// Number of hash functions probed per key.
size_t getNbHash () const { return _n_hash_func; }
// Total size of the filter in bits.
uint64_t bitSize() const {return _nchar*8;}
bool contains (hash_set_t & hashes)
void save(std::ostream& os) const
{
uint64_t h0 = hashes[0] % _reduced_tai;
if ((_blooma[h0 >> 3ULL ] & bit_mask[h0 & 7]) == 0 ) { return false; }
for (size_t i=1; i<_n_hash_func; i++)
{
uint64_t h1 = h0 + (hashes[i] & _mask_block ) ;
if ((_blooma[h1 >> 3ULL ] & bit_mask[h1 & 7]) == 0) { return false; }
}
return true;
os.write(reinterpret_cast<char const*>(&_size), sizeof(_size));
os.write(reinterpret_cast<char const*>(&_nchar), sizeof(_nchar));
os.write(reinterpret_cast<char const*>(_bitArray), (std::streamsize)(sizeof(uint64_t) * _nchar));
size_t sizer = _ranks.size();
os.write(reinterpret_cast<char const*>(&sizer), sizeof(size_t));
os.write(reinterpret_cast<char const*>(_ranks.data()), (std::streamsize)(sizeof(_ranks[0]) * _ranks.size()));
}
void insert (hash_set_t & hashes)
void load(std::istream& is)
{
uint64_t h0 = hashes[0] % _reduced_tai;
__sync_fetch_and_or (_blooma + (h0 >> 3ULL), bit_mask[h0 & 7]);
for (size_t i=1; i< _n_hash_func; i++)
{
uint64_t h1 = h0 + (hashes[i] & _mask_block ) ;
__sync_fetch_and_or (_blooma + (h1 >> 3ULL), bit_mask[h1 & 7]);
}
is.read(reinterpret_cast<char*>(&_size), sizeof(_size));
is.read(reinterpret_cast<char*>(&_nchar), sizeof(_nchar));
this->resize(_size);
is.read(reinterpret_cast<char *>(_bitArray), (std::streamsize)(sizeof(uint64_t) * _nchar));
size_t sizer;
is.read(reinterpret_cast<char *>(&sizer), sizeof(size_t));
_ranks.resize(sizer);
is.read(reinterpret_cast<char*>(_ranks.data()), (std::streamsize)(sizeof(_ranks[0]) * _ranks.size()));
}
private:
size_t _n_hash_func;
u_int8_t* _blooma;
uint64_t _tai;
protected:
uint64_t* _bitArray;
uint64_t _size;
uint64_t _nchar;
uint64_t _mask_block;
size_t _nbits_BlockSize;
uint64_t _reduced_tai;
// epsilon = 64 / _nb_bits_per_rank_sample bits
// additional size for rank is epsilon * _size
static const uint64_t _nb_bits_per_rank_sample = 512; //512 seems ok
std::vector<uint64_t> _ranks;
};
////////////////////////////////////////////////////////////////
#pragma mark -
#pragma mark level
......@@ -652,11 +612,20 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
public:
level(){ }
~level() { delete bloom;}
~level() {
delete bitset;
}
uint64_t idx_begin;
uint64_t hash_domain;
bbloom * bloom;
bitVector * bitset;
// Reduce a raw hash into this level's domain and return the corresponding
// bit of the level's bitset.
uint64_t get(uint64_t hash_raw)
{
    return bitset->get(hash_raw % hash_domain);
}
};
......@@ -680,8 +649,6 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
};
//forward declaration
template <typename elem_t, typename Hasher_t, typename Range, typename it_type>
void * thread_initLevel0(void * args);
template <typename elem_t, typename Hasher_t, typename Range, typename it_type>
void * thread_processLevel(void * args);
......@@ -692,7 +659,7 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
template <typename elem_t, typename Hasher_t>
class mphf {
/* this mechanisms gets 7 hashes (for the Bloom filters) out of Hasher_t */
/* this mechanisms gets P hashes out of Hasher_t */
typedef XorshiftHashFunctors<elem_t,Hasher_t> MultiHasher_t ;
// typedef HashFunctors<elem_t> MultiHasher_t; // original code (but only works for int64 keys) (seems to be as fast as the current xorshift)
//typedef IndepHashFunctors<elem_t,Hasher_t> MultiHasher_t; //faster than xorshift
......@@ -701,6 +668,7 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
mphf()
{}
~mphf()
{
pthread_mutex_destroy(&_mutex);
......@@ -709,40 +677,48 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
{
delete _levels[ii];
}
delete _bitset;
}
// allow perc_elem_loaded elements to be loaded in ram for faster construction (default 3%), set to 0 to desactivate
template <typename Range>
mphf( size_t n, Range const& input_range,int num_thread = 1, bool fastmode = true, double gamma = 2.5) :
_gamma(gamma), _hash_domain(size_t(ceil(double(n) * gamma))), _nelem(n), _num_thread(num_thread), _fastmode (fastmode)
mphf( size_t n, Range const& input_range,int num_thread = 1, double gamma = 2.0 ,float perc_elem_loaded = 0.03) :
_gamma(gamma), _hash_domain(size_t(ceil(double(n) * gamma))), _nelem(n), _num_thread(num_thread), _percent_elem_loaded_for_fastMode (perc_elem_loaded)
{
if(_percent_elem_loaded_for_fastMode > 0.0 )
_fastmode =true;
setup();
_progressBar.timer_mode=1;
if(_fastmode)
_progressBar.init( _nelem * 3 + ( _nelem * _proba_collision * _proba_collision) * (_nb_levels-3) ,"Building BooPHF");
_progressBar.init( _nelem * (_fastModeLevel+1) + ( _nelem * pow(_proba_collision,_fastModeLevel)) * (_nb_levels-(_fastModeLevel+1)) ,"Building BooPHF");
else
_progressBar.init( _nelem * _nb_levels ,"Building BooPHF");
initLevel0(input_range);
for(int ii = 0; ii< _nb_levels-1; ii++)
uint64_t offset = 0;
for(int ii = 0; ii< _nb_levels; ii++)
{
_tempBitset = new bitVector(_levels[ii]->hash_domain); // temp collision bitarray for this level
processLevel(input_range,ii);
_levels[ii]->bitset->clearCollisions(0 , _levels[ii]->hash_domain , _tempBitset);
offset = _levels[ii]->bitset->build_ranks(offset);
delete _tempBitset;
}
_progressBar.finish_threaded();
_bitset->build_ranks();
_lastbitsetrank = offset ;
_lastbitsetrank = _bitset->rank( _bitset->size() -1);
//printf("used temp ram for construction : %lli MB \n",setLevelFastmode.capacity()* sizeof(elem_t) /1024ULL/1024ULL);
//printf("used temp ram for construction : %lli MB \n",setLevel2.capacity()* sizeof(elem_t) /1024ULL/1024ULL);
std::vector<elem_t>().swap(setLevel2); // clear setLevel2 reallocating
std::vector<elem_t>().swap(setLevelFastmode); // clear setLevelFastmode reallocating
}
......@@ -752,8 +728,9 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
auto hashes = _hasher(elem);
uint64_t non_minimal_hp,minimal_hp;
int level = getLevel(hashes);
hash_pair_t bbhash; int level;
uint64_t level_hash = getLevel(bbhash,elem,&level);
if( level == (_nb_levels-1))
{
......@@ -762,10 +739,10 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
}
else
{
non_minimal_hp = _levels[level]->idx_begin + ( hashes[level] % _levels[level]->hash_domain);
non_minimal_hp = level_hash % _levels[level]->hash_domain; // in fact non minimal hp would be + _levels[level]->idx_begin
}
minimal_hp = _bitset->rank(non_minimal_hp);
minimal_hp = _levels[level]->bitset->rank(non_minimal_hp );
return minimal_hp;
}
......@@ -777,72 +754,25 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
uint64_t totalBitSize()
{
uint64_t bloomsizes = 0;
for (int ii=0; ii< _nb_levels-1; ii++)
uint64_t totalsizeBitset = 0;
for(int ii=0; ii<_nb_levels; ii++)
{
bloomsizes+= _levels[ii]->bloom->bitSize();
totalsizeBitset += _levels[ii]->bitset->bitSize();
}
uint64_t totalsize = _bitset->bitSize() + bloomsizes + _final_hash.size()*42*8 ; // unordered map takes approx 42B per elem [personal test] (42B with uint64_t key, would be larger for other type of elem)
uint64_t totalsize = totalsizeBitset + _final_hash.size()*42*8 ; // unordered map takes approx 42B per elem [personal test] (42B with uint64_t key, would be larger for other type of elem)
printf("Bitarray %12llu bits (%.2f %%) (array + ranks )\n",
_bitset->bitSize(), 100*(float)_bitset->bitSize()/totalsize);
printf("Blooms %12llu bits (%.2f %%)\n",
bloomsizes, 100*(float)bloomsizes/totalsize);
totalsizeBitset, 100*(float)totalsizeBitset/totalsize);
printf("final hash %12lu bits (%.2f %%) (nb in final hash %lu)\n",
_final_hash.size()*42*8, 100*(float)(_final_hash.size()*42*8)/totalsize,
_final_hash.size() );
return totalsize;
}
// Worker body for level-0 construction: threads cooperatively drain the
// shared input iterator in NBBUFF-sized batches and insert every element
// into level 0. Only the iterator advance is serialized by _mutex; the
// hashing/insertion work on the private buffer runs unlocked.
// `shared_it`/`until_p` are shared by all workers; `buffer` is per-thread.
template <typename Range,typename Iterator>
void pthread_init0(elem_t * buffer, Range input_range, std::shared_ptr<Iterator> shared_it, std::shared_ptr<Iterator> until_p)
{
    uint64_t nb_done =0;
    // tid = rank of this worker, from the shared living-thread counter
    int tid = __sync_fetch_and_add (&_nb_living, 1);
    auto until = *until_p;
    uint64_t inbuff =0;

    for (bool isRunning=true; isRunning ; )
    {
        // --- critical section: pull up to NBBUFF items from the shared iterator
        pthread_mutex_lock(&_mutex);
        //copy n items into buffer
        for(; inbuff<NBBUFF && (*shared_it)!=until; ++(*shared_it) /* subtle: if it was shared++, we would need to implement operator++(int) for our iterator */)
        {
            buffer[inbuff]= *(*shared_it); inbuff++;
        }
        if((*shared_it)==until) isRunning =false;
        pthread_mutex_unlock(&_mutex);

        // --- lock-free section: hash and insert the buffered elements
        //do work on the n elems of the buffer
        for(uint64_t ii=0; ii<inbuff ; ii++)
        {
            elem_t val = buffer[ii];
            auto hashes = _hasher(val);
            insertIntoLevel(hashes,0);
            nb_done++;
            // batch progress updates to limit contention on the progress bar
            if((nb_done&1023) ==0 ) {_progressBar.inc(nb_done,tid);nb_done=0; }
        }

        inbuff = 0;
    }
}
template <typename Range,typename Iterator>
void pthread_processLevel(elem_t * buffer, Range input_range, std::shared_ptr<Iterator> shared_it, std::shared_ptr<Iterator> until_p, int i)
template <typename Iterator> //typename Range,
void pthread_processLevel(elem_t * buffer, std::shared_ptr<Iterator> shared_it, std::shared_ptr<Iterator> until_p, int i)
{
uint64_t nb_done =0;
int tid = __sync_fetch_and_add (&_nb_living, 1);
......@@ -868,44 +798,53 @@ we need this 2-functors scheme because HashFunctors won't work with unordered_ma
{
elem_t val = buffer[ii];
auto hashes = _hasher(val);
int level = getLevel(hashes, i+1); //should be safe
//auto hashes = _hasher(val);
hash_pair_t bbhash; int level;
uint64_t level_hash = getLevel(bbhash,val,&level, i);
if(level == i+1)
if(level == i) //insert into lvl i
{
if(i == 1 && _fastmode)
__sync_fetch_and_add(& _cptLevel,1);
if(i == _fastModeLevel && _fastmode)
{
uint64_t idxl2 = __sync_fetch_and_add(& _idxLevelSetLevel2,1);
//si depasse taille attendue pour setLevel2, fall back sur slow mode mais devrait pas arriver si hash ok et proba avec nous
if(idxl2> setLevel2.size())
uint64_t idxl2 = __sync_fetch_and_add(& _idxLevelsetLevelFastmode,1);
//si depasse taille attendue pour setLevelFastmode, fall back sur slow mode mais devrait pas arriver si hash ok et proba avec nous
if(idxl2> setLevelFastmode.size())
_fastmode = false;
else
setLevel2[idxl2] = val; // create set E2
setLevelFastmode[idxl2] = val; // create set for fast mode
}
//insert to level i+1 : either next level of the cascade or final hash if last level reached
if(i+1== _nb_levels-1) //stop cascade here, insert into exact hash
if(i == _nb_levels-1) //stop cascade here, insert into exact hash
{
uint64_t hashidx = __sync_fetch_and_add (& _hashidx, 1);
pthread_mutex_lock(&_mutex); //see later if possible to avoid this, mais pas bcp item vont la
//_bitset->set(hashidx);
// calc rank de fin precedent level qq part, puis init hashidx avec ce rank, direct minimal, pas besoin inser ds bitset et rank
_final_hash[val] = hashidx;
pthread_mutex_unlock(&_mutex);
}
else
{
insertIntoLevel(hashes,i+1); //should be safe
//computes next hash
if ( level == 0)
level_hash = _hasher.h0(bbhash,val);
else if ( level == 1)
level_hash = _hasher.h1(bbhash,val);
else
{
level_hash = _hasher.next(bbhash);
}
insertIntoLevel(level_hash,i); //should be safe