Commit ce734dd8 authored by Guillaume Rizk's avatar Guillaume Rizk
Browse files

now use Hash16 as a fallback plan for dsk fillsolid

parent 1e5bb969
......@@ -19,6 +19,7 @@
#include <gatb/kmer/impl/PartitionsCommand.hpp>
#include <gatb/tools/collections/impl/OAHash.hpp>
#include <gatb/tools/collections/impl/Hash16.hpp>
using namespace std;
......@@ -170,8 +171,10 @@ void PartitionsByHashCommand<span>:: execute ()
size_t count=0;
/** We need a map for storing part of solid kmers. */
OAHash<Type> hash (_hashMemory); //or use hash16 to ensure always finishes ?
//OAHash<Type> hash (_hashMemory);
Hash16<Type> hash16 (_hashMemory/MBYTE); // now use hash 16 to ensure always finish
/** We directly fill the vector from the current partition file. */
Iterator<Type>* it = this->_partition.iterator(); LOCAL(it);
......@@ -215,8 +218,9 @@ void PartitionsByHashCommand<span>:: execute ()
mink = std::min (rev_temp, temp);
/** We insert the kmer into the hash. */
hash.increment (mink);
//hash.increment (mink);
hash16.insert(mink);
if(rem < 2) break;
newnt = ( superk >> ( 2*(rem-2)) ) & 3 ;
......@@ -228,19 +232,31 @@ void PartitionsByHashCommand<span>:: execute ()
/** We loop over the solid kmers map.
* NOTE !!! we want the items to be sorted by kmer values (see finalize part of debloom). */
Iterator < Abundance<Type> >* itKmerAbundance = hash.iterator(true);
//Iterator < Abundance<Type> >* itKmerAbundance = hash.iterator(true);
//shortcut
typedef typename tools::collections::impl::Hash16<Type>::cell cell_t;
Iterator < cell_t >* itKmerAbundance = hash16.iterator(true);
LOCAL (itKmerAbundance);
for (itKmerAbundance->first(); !itKmerAbundance->isDone(); itKmerAbundance->next())
{
/** Shortcut. */
Abundance<Type>& current = itKmerAbundance->item();
// Abundance<Type>& current = itKmerAbundance->item();
/** We update the solid counter. */
solidCounter.set (current.getAbundance());
//solidCounter.set (current.getAbundance());
/** We may add this kmer to the solid kmers bag. */
this->insert (current.getValue(), solidCounter);
//this->insert (current.getValue(), solidCounter);
cell_t & cell = itKmerAbundance->item();
solidCounter.set (cell.val);
this->insert (cell.graine, solidCounter);
}
this->_progress->inc (this->_pInfo.getNbKmer(this->_parti_num) ); // this->_pInfo->getNbKmer(this->_parti_num) kmers.size()
......@@ -521,8 +537,9 @@ void PartitionsByVectorCommand<span>::executeRead ()
* <------------------------->
* current partition content
*/
DEBUG (("_offsets.size=%d OFFSETS: ", _nbItemsPerBankPerPart.size() ));
for (size_t j=0; j<_nbItemsPerBankPerPart.size(); j++) { DEBUG (("%6d ", _nbItemsPerBankPerPart[j])); } DEBUG (("\n"));
DEBUG (("_offsets.size=%d OFFSETS: ", _nbItemsPerBankPerPart.size() ));
for (size_t j=0; j<_nbItemsPerBankPerPart.size(); j++) { DEBUG (("%6d ", _nbItemsPerBankPerPart[j])); } DEBUG (("\n"));
uint64_t sum_nbxmer =0;
......
......@@ -49,15 +49,26 @@ namespace impl {
*/
template <typename Item, typename value_type=int> class Hash16
{
public :
//shortcut
typedef misc::impl::cell_ptr_t cell_ptr_t;
typedef struct
{
Item graine;
cell_ptr_t suiv;
value_type val;
} cell;
protected:
/** Shortcuts. */
typedef typename misc::impl::Pool<Item,value_type>::cell cell;
typedef typename misc::impl::Pool<Item,value_type>::cell_ptr_t cell_ptr_t;
cell_ptr_t * datah;
misc::impl::Pool<Item,value_type> storage;
misc::impl::Pool<cell> storage; // was Item,value_type
u_int64_t mask ;
u_int64_t tai;
......@@ -69,8 +80,10 @@ protected:
public:
/** Constructor.
* \param[in] sizeMB : max memory to be used by the hash table
* \param[in] sizeMB : approx max memory to be used by the hash table
*/
Hash16 (size_t sizeMB) : datah(0), mask(0), tai(0), nb_elem(0), max_nb_elem(0), _memory(system::impl::System::memory())
{
......@@ -91,6 +104,8 @@ public:
datah = (cell_ptr_t *) _memory.calloc( tai , sizeof(cell_ptr_t)); //create hashtable
//printf("Hash16 size asked in MB %zu tai_Hash16 %i nb entries %llu \n",sizeMB,tai_Hash16,tai);
cell pcell;
//printf("Hash 16 cell %lli graine %i suiv %i val %i\n",sizeof(cell),sizeof(pcell.graine),sizeof(pcell.suiv),sizeof(pcell.val));
_memory.memset (datah,0, tai * sizeof(cell_ptr_t));
}
......@@ -210,81 +225,26 @@ public:
}
/** Strict weak ordering on cells: a cell comes first when its key (graine) is the smaller one.
 * \param[in] lhs : left cell
 * \param[in] rhs : right cell
 * \return true if lhs.graine < rhs.graine
 */
static bool sortByKey(const cell &lhs, const cell &rhs)
{
    const bool lhsComesFirst = (lhs.graine < rhs.graine);
    return lhsComesFirst;
}
/** Get an iterator for the hash table.
* \param[in] sorted : if true, items are iterated in a sorted way
* \param[in] sorted : if true, items are iterated in a sorted way (warning: cells are reordered in place, so the hash can't be accessed after that!)
* \return an iterator over the items of the hash table.
*/
dp::Iterator < std::pair<Item,value_type> >* iterator ()
{
return new Iterator(*this);
}
/************************************************************/
//avec std::pair ? pour avoir Item, value_type
class Iterator : public tools::dp::Iterator < std::pair<Item,value_type> >
//dp::Iterator < std::pair<Item,value_type> >* iterator (bool sorted=false)
//just get the underlying pool iterator which is simply iteration over multiple arrays, no need to traverse linked list
dp::Iterator < cell >* iterator (bool sorted=false)
{
public:
Iterator (Hash16<Item,value_type>& aRef) : ref(aRef), iterator(0), iteratorMax(0), done(true) {}
/** \copydoc tools::dp::Iterator::first */
void first()
if(sorted)
{
iterator = ref.datah - 1;
iteratorMax = ref.datah + ref.tai;
cell_ptr = NULL;
done = false;
next ();
return storage.iteratorsorted(sortByKey);
}
/** \copydoc tools::dp::Iterator::next */
void next()
else
{
if(cell_ptr == NULL || ref.storage.internal_ptr_to_cell_pointer(cell_ptr->suiv)==NULL ) // au bout de liste
{
//go to next non null entry
while (!done)
{
iterator++;
done = (iterator >= iteratorMax);
cell_ptr = ref.storage.internal_ptr_to_cell_pointer(*iterator);
if(!done && cell_ptr!= NULL)
{
*this->_item = std::pair<Item,value_type> (cell_ptr->graine,cell_ptr->val);
break;
}
}
}
else // we are not at end of list, so only advance within list
{
cell_ptr = ref.storage.internal_ptr_to_cell_pointer(cell_ptr->suiv);
*this->_item = std::pair<Item,value_type> (cell_ptr->graine,cell_ptr->val);
done = false;
}
//should be ok
return storage.iterator();
}
/** \copydoc tools::dp::Iterator::isDone */
bool isDone () { return done; }
/** \copydoc tools::dp::Iterator::item */
std::pair<Item,value_type>& item () { return *this->_item; }
private:
Hash16<Item,value_type>& ref;
cell_ptr_t * iterator;
cell_ptr_t * iteratorMax;
cell* cell_ptr;
bool done;
};
}
/** Get the value for a given key
* \param[in] graine : key
......
......@@ -29,6 +29,7 @@
/********************************************************************************/
#include <gatb/system/impl/System.hpp>
#include <queue> // std::priority_queue
/********************************************************************************/
namespace gatb {
......@@ -40,20 +41,19 @@ namespace impl {
/* Cette class définit une pool memoire pour allocation rapide de la table de hachage
* utilisee quand seed >14 */
template <typename graine_type, typename value_type=int> class Pool
//now pass as template the cell type
//had to move this outside the class (otherwise recursive def of cell template that may contain cell_ptr_t ... )
typedef u_int32_t cell_ptr_t;
template <typename cell> class Pool
//template <typename graine_type, typename value_type=int> class Pool
{
public:
typedef u_int32_t cell_ptr_t;
struct cell
{
graine_type graine;
cell_ptr_t suiv;
value_type val;
};
/** Default constructor.
/** Default constructor.
* \param[in] tai : 2^22 16 M cells *16 o blocs de 256 Mo
* \param[in] N : 2^10 soit 4 G cells max
* */
......@@ -135,6 +135,179 @@ public:
n_pools=2;
}
//sort the pools according to some comparator
//warning this will reorder cells and thus making existing pointers to cells irrelevant
//but useful for e.g. sorted iterator of cells
template <typename Comparator>
void sortPools(Comparator comparator)
{
// les pool pleines
for(size_t i=1;i<(n_pools-1);i++)
{
std::sort( tab_pool[i], tab_pool[i] + TAI_POOL, comparator);
}
// la pool en cours de remplissage
std::sort( tab_pool[n_pools-1], tab_pool[n_pools-1] + n_cells, comparator);
}
////////sorted iterator over all cells (each pool is sorted, then the pools are merged)
/** Build an iterator that delivers the cells through a merge of the sorted pools.
 * WARNING: the pools are sorted in place before the iterator is created.
 * \param[in] comparator : ordering predicate on two cells, used to sort each pool
 * \return a newly allocated iterator (created with new)
 */
template <typename Comparator>
dp::Iterator < cell >* iteratorsorted (Comparator comparator)
{
    /** Step 1: every pool array gets sorted on its own. */
    sortPools (comparator);

    /** Step 2: the sorted pools are merged on the fly by the iterator. */
    dp::Iterator < cell >* merged = new IteratorSorted (*this);
    return merged;
}
//todo template also this with a comparator
/** Iterator delivering all the cells of the pool in ascending key (graine) order.
 *
 * Each pool array must already be sorted by ascending graine; iteratorsorted() calls
 * sortPools() before creating this object (NOTE(review): this assumes the comparator given
 * to sortPools orders cells by ascending graine, as Hash16::sortByKey does).
 * The sorted pools are merged with a priority queue holding one cursor per pool.
 */
class IteratorSorted : public tools::dp::Iterator < cell >
{
public:

    /** Merge cursor: (index of a pool in tab_pool, pointer to the current cell of that pool). */
    typedef std::pair<int, cell *> cellpair_t;

    /** Heap ordering used by the merge.
     * std::priority_queue keeps on top the element that compares greatest, so a cursor is
     * declared "lower" when its key is the larger one: the top of the queue is then always
     * the cursor pointing to the smallest graine.
     */
    struct sortcellpair
    {
        bool operator() (const cellpair_t& l, const cellpair_t& r) const
        {
            return r.second->graine < l.second->graine;
        }
    };

    /** Constructor.
     * \param[in] aRef : the pool to iterate; it is kept by reference.
     */
    IteratorSorted (Pool<cell>& aRef) : ref(aRef), done(true) {}

    /** \copydoc tools::dp::Iterator::first */
    void first()
    {
        /** Drop the cursors possibly left by a previous, unfinished traversal. */
        while (!pq.empty())  { pq.pop(); }

        for (size_t i=1; i<ref.n_pools; i++)
        {
            /** The pool being filled may hold no cell at all (empty table): it must not
             *  provide a cursor, otherwise a never allocated cell would be delivered. */
            const bool isLastPool = (i + 1 == ref.n_pools);
            if (isLastPool && ref.n_cells == 0)  { continue; }

            pq.push (cellpair_t (static_cast<int>(i), (cell *) &(ref.tab_pool[i][0])));
        }

        next();
    }

    /** \copydoc tools::dp::Iterator::next */
    void next()
    {
        done = pq.empty();
        if (done)  { return; }

        /** The cursor with the smallest key provides the current item. */
        cellpair_t current_pair = pq.top();  pq.pop();
        *this->_item = * (current_pair.second);

        /** We push the next cell of the same pool, if any. */
        int current_pool = current_pair.first;
        int cell_number  = current_pair.second - ref.tab_pool[current_pool];

        if ( (current_pool < (ref.n_pools -1)) && ((cell_number+1) < ref.TAI_POOL) )  // inside a full pool, and cells remaining
        {
            pq.push (cellpair_t (current_pool, & (ref.tab_pool[current_pool][cell_number+1])));
        }
        else if ( (current_pool == (ref.n_pools -1)) && ((cell_number+1) < ref.n_cells) )  // inside last pool, and cells remaining
        {
            pq.push (cellpair_t (current_pool, & (ref.tab_pool[current_pool][cell_number+1])));
        }
        /** Otherwise the end of this pool is reached: no cursor is pushed back. */
    }

    /** \copydoc tools::dp::Iterator::isDone */
    bool isDone ()  { return done; }

    /** \copydoc tools::dp::Iterator::item */
    cell& item ()  { return *this->_item; }

private:
    /** One cursor per pool that still has cells to deliver. */
    std::priority_queue< cellpair_t, std::vector<cellpair_t>, sortcellpair > pq;
    Pool<cell>& ref;
    bool done;
};
//////
////////simple iterator over all cells
/** Build an iterator over all the cells, pool after pool, in storage order.
 * \return a newly allocated iterator (created with new)
 */
dp::Iterator < cell >* iterator ()
{
    dp::Iterator < cell >* plain = new Iterator (*this);
    return plain;
}
/************************************************************/
//avec std::pair ? pour avoir Item, value_type
/** Plain iterator over all the cells of the pool, in storage order: the pool arrays are
 * read one after the other, starting at pool 1. Full pools provide TAI_POOL cells, the last
 * pool provides its first n_cells cells.
 */
class Iterator : public tools::dp::Iterator < cell >
{
public:

    /** Constructor.
     * The cursor is initialised on the first cell of the first pool, so that a call to next()
     * not preceded by first() does not read indeterminate indexes.
     * \param[in] aRef : the pool to iterate; it is kept by reference.
     */
    Iterator (Pool<cell>& aRef) : _current_pool(1), _current_cell(0), ref(aRef), done(true) {}

    /** \copydoc tools::dp::Iterator::first */
    void first()
    {
        _current_pool = 1;  // first pool
        _current_cell = 0;

        /** Nothing to iterate when no cell is in use. */
        done = ref.n_cells < 1;

        if (!done)
            *this->_item = ref.tab_pool[_current_pool][_current_cell];

        _current_cell++;  // next cell that should be read
    }

    /** \copydoc tools::dp::Iterator::next */
    void next()
    {
        if (_current_pool < (ref.n_pools -1) && _current_cell < ref.TAI_POOL)  // inside a full pool, and cells remaining
        {
            *this->_item = ref.tab_pool[_current_pool][_current_cell];
            _current_cell++;
            return;
        }
        else if (_current_pool < (ref.n_pools -1) && _current_cell == ref.TAI_POOL)  // inside a full pool but no cells remaining
        {
            /** We go to the first cell of the next pool. */
            _current_pool++;
            _current_cell = 0;

            *this->_item = ref.tab_pool[_current_pool][_current_cell];
            _current_cell++;
        }
        else if (_current_pool == (ref.n_pools -1) && _current_cell < ref.n_cells)  // in last pool and cells remaining
        {
            *this->_item = ref.tab_pool[_current_pool][_current_cell];
            _current_cell++;
        }
        else  // last pool and no cells left, done
        {
            done = true;
        }
    }

    /** \copydoc tools::dp::Iterator::isDone */
    bool isDone ()  { return done; }

    /** \copydoc tools::dp::Iterator::item */
    cell& item ()  { return *this->_item; }

private:
    /** Index in tab_pool of the pool being read. */
    unsigned int _current_pool;
    /** Index, in the current pool, of the next cell to be read. */
    unsigned int _current_cell;
    Pool<cell>& ref;
    bool done;
};
private:
/** table de cell, pour usage courant */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment