Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
GATB
gatb-core
Commits
ce734dd8
Commit
ce734dd8
authored
Feb 12, 2016
by
Guillaume Rizk
Browse files
now use Hash16 as a fallback plan for dsk fillsolid
parent
1e5bb969
Changes
3
Hide whitespace changes
Inline
Side-by-side
gatb-core/src/gatb/kmer/impl/PartitionsCommand.cpp
View file @
ce734dd8
...
...
@@ -19,6 +19,7 @@
#include <gatb/kmer/impl/PartitionsCommand.hpp>
#include <gatb/tools/collections/impl/OAHash.hpp>
#include <gatb/tools/collections/impl/Hash16.hpp>
using
namespace
std
;
...
...
@@ -170,8 +171,10 @@ void PartitionsByHashCommand<span>:: execute ()
size_t
count
=
0
;
/** We need a map for storing part of solid kmers. */
OAHash
<
Type
>
hash
(
_hashMemory
);
//or use hash16 to ensure always finishes ?
//
OAHash<Type> hash (_hashMemory);
Hash16
<
Type
>
hash16
(
_hashMemory
/
MBYTE
);
// now use hash 16 to ensure always finish
/** We directly fill the vector from the current partition file. */
Iterator
<
Type
>*
it
=
this
->
_partition
.
iterator
();
LOCAL
(
it
);
...
...
@@ -215,8 +218,9 @@ void PartitionsByHashCommand<span>:: execute ()
mink
=
std
::
min
(
rev_temp
,
temp
);
/** We insert the kmer into the hash. */
hash
.
increment
(
mink
);
//hash.increment (mink);
hash16
.
insert
(
mink
);
if
(
rem
<
2
)
break
;
newnt
=
(
superk
>>
(
2
*
(
rem
-
2
))
)
&
3
;
...
...
@@ -228,19 +232,31 @@ void PartitionsByHashCommand<span>:: execute ()
/** We loop over the solid kmers map.
* NOTE !!! we want the items to be sorted by kmer values (see finalize part of debloom). */
Iterator
<
Abundance
<
Type
>
>*
itKmerAbundance
=
hash
.
iterator
(
true
);
//Iterator < Abundance<Type> >* itKmerAbundance = hash.iterator(true);
//shortcut
typedef
typename
tools
::
collections
::
impl
::
Hash16
<
Type
>::
cell
cell_t
;
Iterator
<
cell_t
>*
itKmerAbundance
=
hash16
.
iterator
(
true
);
LOCAL
(
itKmerAbundance
);
for
(
itKmerAbundance
->
first
();
!
itKmerAbundance
->
isDone
();
itKmerAbundance
->
next
())
{
/** Shortcut. */
Abundance
<
Type
>&
current
=
itKmerAbundance
->
item
();
//
Abundance<Type>& current = itKmerAbundance->item();
/** We update the solid counter. */
solidCounter
.
set
(
current
.
getAbundance
());
//
solidCounter.set (current.getAbundance());
/** We may add this kmer to the solid kmers bag. */
this
->
insert
(
current
.
getValue
(),
solidCounter
);
//this->insert (current.getValue(), solidCounter);
cell_t
&
cell
=
itKmerAbundance
->
item
();
solidCounter
.
set
(
cell
.
val
);
this
->
insert
(
cell
.
graine
,
solidCounter
);
}
this
->
_progress
->
inc
(
this
->
_pInfo
.
getNbKmer
(
this
->
_parti_num
)
);
// this->_pInfo->getNbKmer(this->_parti_num) kmers.size()
...
...
@@ -521,8 +537,9 @@ void PartitionsByVectorCommand<span>::executeRead ()
* <------------------------->
* current partition content
*/
DEBUG
((
"_offsets.size=%d OFFSETS: "
,
_nbItemsPerBankPerPart
.
size
()
));
for
(
size_t
j
=
0
;
j
<
_nbItemsPerBankPerPart
.
size
();
j
++
)
{
DEBUG
((
"%6d "
,
_nbItemsPerBankPerPart
[
j
]));
}
DEBUG
((
"
\n
"
));
DEBUG
((
"_offsets.size=%d OFFSETS: "
,
_nbItemsPerBankPerPart
.
size
()
));
for
(
size_t
j
=
0
;
j
<
_nbItemsPerBankPerPart
.
size
();
j
++
)
{
DEBUG
((
"%6d "
,
_nbItemsPerBankPerPart
[
j
]));
}
DEBUG
((
"
\n
"
));
uint64_t
sum_nbxmer
=
0
;
...
...
gatb-core/src/gatb/tools/collections/impl/Hash16.hpp
View file @
ce734dd8
...
...
@@ -49,15 +49,26 @@ namespace impl {
*/
template
<
typename
Item
,
typename
value_type
=
int
>
class
Hash16
{
public
:
//shortcut
typedef
misc
::
impl
::
cell_ptr_t
cell_ptr_t
;
typedef
struct
{
Item
graine
;
cell_ptr_t
suiv
;
value_type
val
;
}
cell
;
protected:
/** Shortcuts. */
typedef
typename
misc
::
impl
::
Pool
<
Item
,
value_type
>::
cell
cell
;
typedef
typename
misc
::
impl
::
Pool
<
Item
,
value_type
>::
cell_ptr_t
cell_ptr_t
;
cell_ptr_t
*
datah
;
misc
::
impl
::
Pool
<
Item
,
value_type
>
storage
;
misc
::
impl
::
Pool
<
cell
>
storage
;
// was
Item,value_type
u_int64_t
mask
;
u_int64_t
tai
;
...
...
@@ -69,8 +80,10 @@ protected:
public:
/** Constructor.
* \param[in] sizeMB : max memory to be used by the hash table
* \param[in] sizeMB :
approx
max memory to be used by the hash table
*/
Hash16
(
size_t
sizeMB
)
:
datah
(
0
),
mask
(
0
),
tai
(
0
),
nb_elem
(
0
),
max_nb_elem
(
0
),
_memory
(
system
::
impl
::
System
::
memory
())
{
...
...
@@ -91,6 +104,8 @@ public:
datah
=
(
cell_ptr_t
*
)
_memory
.
calloc
(
tai
,
sizeof
(
cell_ptr_t
));
//create hashtable
//printf("Hash16 size asked in MB %zu tai_Hash16 %i nb entries %llu \n",sizeMB,tai_Hash16,tai);
cell
pcell
;
//printf("Hash 16 cell %lli graine %i suiv %i val %i\n",sizeof(cell),sizeof(pcell.graine),sizeof(pcell.suiv),sizeof(pcell.val));
_memory
.
memset
(
datah
,
0
,
tai
*
sizeof
(
cell_ptr_t
));
}
...
...
@@ -210,81 +225,26 @@ public:
}
/** Comparator ordering two cells by their key (field `graine`), using the key's operator<.
 * \param[in] lhs : left-hand cell
 * \param[in] rhs : right-hand cell
 * \return true if the key of lhs is strictly smaller than the key of rhs
 */
static bool sortByKey (const cell& lhs, const cell& rhs)
{
    const bool lhsComesFirst = (lhs.graine < rhs.graine);
    return lhsComesFirst;
}
/** Get an iterator for the hash table.
* \param[in] sorted : if true, items are iterated in a sorted way
* \param[in] sorted : if true, items are iterated in a sorted way
(warning: cells are reordered in place, so the hash can't be accessed after that!)
* \return an iterator over the items of the hash table.
*/
dp
::
Iterator
<
std
::
pair
<
Item
,
value_type
>
>*
iterator
()
{
return
new
Iterator
(
*
this
);
}
/************************************************************/
//avec std::pair ? pour avoir Item, value_type
class
Iterator
:
public
tools
::
dp
::
Iterator
<
std
::
pair
<
Item
,
value_type
>
>
//dp::Iterator < std::pair<Item,value_type> >* iterator (bool sorted=false)
//just get the underlying pool iterator which is simply iteration over multiple arrays, no need to traverse linked list
dp
::
Iterator
<
cell
>*
iterator
(
bool
sorted
=
false
)
{
public:
Iterator
(
Hash16
<
Item
,
value_type
>&
aRef
)
:
ref
(
aRef
),
iterator
(
0
),
iteratorMax
(
0
),
done
(
true
)
{}
/** \copydoc tools::dp::Iterator::first */
void
first
()
if
(
sorted
)
{
iterator
=
ref
.
datah
-
1
;
iteratorMax
=
ref
.
datah
+
ref
.
tai
;
cell_ptr
=
NULL
;
done
=
false
;
next
();
return
storage
.
iteratorsorted
(
sortByKey
);
}
/** \copydoc tools::dp::Iterator::next */
void
next
()
else
{
if
(
cell_ptr
==
NULL
||
ref
.
storage
.
internal_ptr_to_cell_pointer
(
cell_ptr
->
suiv
)
==
NULL
)
// au bout de liste
{
//go to next non null entry
while
(
!
done
)
{
iterator
++
;
done
=
(
iterator
>=
iteratorMax
);
cell_ptr
=
ref
.
storage
.
internal_ptr_to_cell_pointer
(
*
iterator
);
if
(
!
done
&&
cell_ptr
!=
NULL
)
{
*
this
->
_item
=
std
::
pair
<
Item
,
value_type
>
(
cell_ptr
->
graine
,
cell_ptr
->
val
);
break
;
}
}
}
else
// we are not at end of list, so only advance within list
{
cell_ptr
=
ref
.
storage
.
internal_ptr_to_cell_pointer
(
cell_ptr
->
suiv
);
*
this
->
_item
=
std
::
pair
<
Item
,
value_type
>
(
cell_ptr
->
graine
,
cell_ptr
->
val
);
done
=
false
;
}
//should be ok
return
storage
.
iterator
();
}
/** \copydoc tools::dp::Iterator::isDone */
bool
isDone
()
{
return
done
;
}
/** \copydoc tools::dp::Iterator::item */
std
::
pair
<
Item
,
value_type
>&
item
()
{
return
*
this
->
_item
;
}
private:
Hash16
<
Item
,
value_type
>&
ref
;
cell_ptr_t
*
iterator
;
cell_ptr_t
*
iteratorMax
;
cell
*
cell_ptr
;
bool
done
;
};
}
/** Get the value for a given key
* \param[in] graine : key
...
...
gatb-core/src/gatb/tools/misc/impl/Pool.hpp
View file @
ce734dd8
...
...
@@ -29,6 +29,7 @@
/********************************************************************************/
#include <gatb/system/impl/System.hpp>
#include <queue> // std::priority_queue
/********************************************************************************/
namespace
gatb
{
...
...
@@ -40,20 +41,19 @@ namespace impl {
/* Cette class définit une pool memoire pour allocation rapide de la table de hachage
* utilisee quand seed >14 */
template
<
typename
graine_type
,
typename
value_type
=
int
>
class
Pool
//now pass as template the cell type
//had to move this outside the class (otherwise recursive def of cell template that may contain cell_ptr_t ... )
typedef
u_int32_t
cell_ptr_t
;
template
<
typename
cell
>
class
Pool
//template <typename graine_type, typename value_type=int> class Pool
{
public:
typedef
u_int32_t
cell_ptr_t
;
struct
cell
{
graine_type
graine
;
cell_ptr_t
suiv
;
value_type
val
;
};
/** Default constructor.
/** Default constructor.
* \param[in] tai : 2^22 16 M cells *16 o blocs de 256 Mo
* \param[in] N : 2^10 soit 4 G cells max
* */
...
...
@@ -135,6 +135,179 @@ public:
n_pools
=
2
;
}
//sort the pools according to some comparator
//warning this will reorder cells and thus making existing pointers to cells irrelevant
//but useful for e.g. sirted iterator of cells
template
<
typename
Comparator
>
void
sortPools
(
Comparator
comparator
)
{
// les pool pleines
for
(
size_t
i
=
1
;
i
<
(
n_pools
-
1
);
i
++
)
{
std
::
sort
(
tab_pool
[
i
],
tab_pool
[
i
]
+
TAI_POOL
,
comparator
);
}
// la pool en cours de remplissage
std
::
sort
(
tab_pool
[
n_pools
-
1
],
tab_pool
[
n_pools
-
1
]
+
n_cells
,
comparator
);
}
/** Get an iterator over all cells, delivered through a merge of the sorted pools.
 * The pools are first sorted in place with std::sort (cells are reordered, so existing
 * pointers to cells become irrelevant), then an IteratorSorted merges them while iterating.
 * NOTE: the comparator is only used for the per-pool sort; it is not handed to IteratorSorted,
 * which applies its own fixed ordering during the merge.
 * \param[in] comparator : ordering used for sorting each pool
 * \return an iterator allocated with `new`
 */
template <typename Comparator>
dp::Iterator<cell>* iteratorsorted (Comparator comparator)
{
    // step 1: sort each pool independently
    sortPools (comparator);

    // step 2: the returned iterator merges the sorted pools on the fly
    dp::Iterator<cell>* merged = new IteratorSorted (*this);
    return merged;
}
/** Iterator delivering all the cells of a Pool in ascending key (`graine`) order.
 *
 * Precondition: each pool must already be sorted by ascending `graine` (Hash16 does so by
 * passing sortByKey to iteratorsorted). The iteration is a k-way merge: a priority queue holds
 * the next unread cell of every pool, and the cell with the smallest key is delivered at each step.
 *
 * TODO: template this class with a comparator instead of hard-coding the ordering on `graine`.
 */
class IteratorSorted : public tools::dp::Iterator<cell>
{
public:

    /** Pair (index of a pool, pointer to a cell of that pool). */
    typedef std::pair<int, cell*> cellpair_t;

    /** Ordering used by the priority queue.
     * std::priority_queue puts on top the LARGEST element according to its comparator, so the
     * comparison is reversed ("l has a greater key than r") to get the smallest key on top.
     * Only operator< of the key type is required, like sortByKey does for the per-pool sort. */
    struct sortcellpair
    {
        bool operator() (const cellpair_t& l, const cellpair_t& r) const
        {
            return r.second->graine < l.second->graine;
        }
    };

    /** Constructor.
     * \param[in] aRef : the pool to iterate; its pools must already be sorted
     */
    IteratorSorted (Pool<cell>& aRef) : ref(aRef), done(true)  {}

    /** \copydoc tools::dp::Iterator::first */
    void first ()
    {
        // Start from an empty queue, so that first() may safely be called more than once.
        pq = queue_t ();

        // Seed the queue with the first cell of every non empty pool (pool 0 is never used).
        const size_t nbPools = ref.n_pools;
        for (size_t i = 1; i < nbPools; i++)
        {
            if (cellsInPool (i) == 0)  { continue; }

            pq.push (cellpair_t (static_cast<int> (i), &(ref.tab_pool[i][0])));
        }

        next ();
    }

    /** \copydoc tools::dp::Iterator::next */
    void next ()
    {
        done = pq.empty ();
        if (done)  { return; }

        // The top of the queue is the smallest key among the heads of all pools.
        const cellpair_t current = pq.top ();
        pq.pop ();

        *this->_item = *(current.second);

        // Push the successor of the delivered cell within the same pool, if any;
        // at the end of a pool nothing is pushed and that pool is exhausted.
        const size_t pool     = static_cast<size_t> (current.first);
        const size_t nextCell = static_cast<size_t> (current.second - ref.tab_pool[pool]) + 1;

        if (nextCell < cellsInPool (pool))
        {
            pq.push (cellpair_t (current.first, &(ref.tab_pool[pool][nextCell])));
        }
    }

    /** \copydoc tools::dp::Iterator::isDone */
    bool isDone ()  {  return done; }

    /** \copydoc tools::dp::Iterator::item */
    cell& item ()   { return *this->_item; }

private:

    typedef std::priority_queue<cellpair_t, std::vector<cellpair_t>, sortcellpair> queue_t;

    /** Number of used cells in a pool: TAI_POOL for a full pool, n_cells for the last one. */
    size_t cellsInPool (size_t pool) const
    {
        const size_t lastPool = static_cast<size_t> (ref.n_pools) - 1;
        return (pool == lastPool) ? static_cast<size_t> (ref.n_cells)
                                  : static_cast<size_t> (ref.TAI_POOL);
    }

    queue_t     pq;
    Pool<cell>& ref;
    bool        done;
};
//////
/** Get a simple iterator over all the cells, pool after pool.
 * \return an iterator allocated with `new`
 */
dp::Iterator<cell>* iterator ()
{
    dp::Iterator<cell>* plain = new Iterator (*this);
    return plain;
}
/************************************************************/
// Note translated from the original: "with std::pair? to get Item, value_type"
/** Simple iterator over all the cells of a Pool: pools are visited by increasing index,
 * starting at pool 1, and cells by increasing index inside each pool. Each delivered cell
 * is copied into the item of the iterator. */
class Iterator : public tools::dp::Iterator<cell>
{
public:

    /** Constructor.
     * \param[in] aRef : the pool to iterate
     */
    Iterator (Pool<cell>& aRef) : ref(aRef), done(true)  {}

    /** \copydoc tools::dp::Iterator::first */
    void first ()
    {
        // Iteration starts at the first cell of the first pool (index 1).
        _current_pool = 1;
        _current_cell = 0;

        // Finished right away when the pool being filled holds no cell.
        done = (ref.n_cells < 1);

        if (!done)
        {
            *this->_item = ref.tab_pool[_current_pool][_current_cell];
        }

        // _current_cell always designates the next cell that should be read.
        _current_cell++;
    }

    /** \copydoc tools::dp::Iterator::next */
    void next ()
    {
        if (_current_pool < (ref.n_pools - 1))
        {
            // We are inside a full pool.
            if (_current_cell == ref.TAI_POOL)
            {
                // No cell remaining here: go to the beginning of the next pool.
                _current_pool++;
                _current_cell = 0;
            }
            else if (!(_current_cell < ref.TAI_POOL))
            {
                // Index beyond the pool size: nothing left to deliver.
                done = true;
                return;
            }

            deliverCurrentCell ();
            return;
        }

        if (_current_pool == (ref.n_pools - 1) && _current_cell < ref.n_cells)
        {
            // In the last pool, with cells remaining.
            deliverCurrentCell ();
            return;
        }

        // Last pool and no cell left: done.
        done = true;
    }

    /** \copydoc tools::dp::Iterator::isDone */
    bool isDone ()  {  return done; }

    /** \copydoc tools::dp::Iterator::item */
    cell& item ()   { return *this->_item; }

private:

    /** Copy the cell designated by (_current_pool, _current_cell) into the item, then
     * advance to the next cell index. */
    void deliverCurrentCell ()
    {
        *this->_item = ref.tab_pool[_current_pool][_current_cell];
        _current_cell++;
    }

    unsigned int _current_pool;
    unsigned int _current_cell;
    Pool<cell>&  ref;
    bool         done;
};
private:
/** table de cell, pour usage courant */
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment