Commit 3dacebba authored by Mikael Salson's avatar Mikael Salson Committed by Mathieu Giraud

windows: reads must be sampled depending on their score

The sample used to be built solely from sequence length, which made sense
when the reads were stored according to their length. Now that the scoring
function has changed and focuses on quality, it no longer makes sense
to retrieve the longest reads. We just want a sample of the
best reads (i.e. those of highest quality).

With this approach, if we change our scoring function again, no modification
of the sampling code will be needed.

Note that the second parameter of getBestReads() is not used by any caller yet.
It may be useful to prevent sequences of too low quality from being sampled.

SequenceSampler is not used anymore. It will be removed in a future release.
parent 520a3f55
......@@ -153,6 +153,29 @@ list<Sequence> BinReadStorage::getReads() const {
return results;
}
/**
 * Collect at most max_nb reads, taking them from the highest bins first.
 *
 * Bins are visited from index nb_bins down to the lowest interesting bin,
 * which is the larger of smallest_bin_not_empty and the bin min_score maps to.
 * That lowest bin is taken as a whole, so filtering on min_score is done per
 * bin and not per read.
 * Within a bin, reads are taken in their stored order.
 *
 * @param max_nb: maximal number of reads to return
 * @param min_score: score used to determine the lowest bin to be visited
 * @return the collected reads, those of the highest bins coming first
 */
list<Sequence> BinReadStorage::getBestReads(size_t max_nb, size_t min_score) const {
  list<Sequence> best_reads;
  const size_t lowest_bin = max(smallest_bin_not_empty, scoreToBin(min_score));

  // The counter is kept one above the bin index so that the unsigned
  // countdown cannot wrap around when lowest_bin is 0.
  // We stop as soon as the quota is exhausted: lower bins cannot contribute.
  for (size_t next = nb_bins + 1; next > lowest_bin && max_nb > 0; --next) {
    const size_t bin = next - 1;
    // Copy the whole bin, or only its first max_nb reads if it is too large.
    for (list<Sequence>::const_iterator it = bins[bin].begin();
         max_nb > 0 && it != bins[bin].end(); ++it) {
      best_reads.push_back(*it);
      --max_nb;
    }
  }
  return best_reads;
}
/**
 * @return a copy of the label currently held in `label`
 */
string BinReadStorage::getLabel() const {
return label;
}
......@@ -169,7 +192,7 @@ void BinReadStorage::out_average_scores(ostream &out, bool inversed) {
output_label_average(out, getLabel(), getNbScores(), inversed ? getInvertedAverageScore() : getAverageScore(), inversed ? 3 : 1);
}
size_t BinReadStorage::scoreToBin(float score) {
size_t BinReadStorage::scoreToBin(float score) const{
assert(score >= 0);
if (score > max_score)
return nb_bins;
......
......@@ -55,6 +55,12 @@ class VirtualReadStorage {
* @return all the stored reads
*/
virtual list<Sequence> getReads() const = 0;
/**
* @param max_nb: maximal number of reads to be returned
* @param min_score: minimal score a read should have to be returned (0 by default)
* @return at most max_nb reads whose score >= min_score
*/
virtual list<Sequence> getBestReads(size_t max_nb, size_t min_score=0) const = 0;
};
/**
......@@ -182,6 +188,14 @@ public:
list<Sequence> getReads() const;
/**
* @inherited from VirtualReadStorage
* The implementation does not guarantee that every returned sequence has a score
* of at least min_score: as it relies on bins, a sequence is selected according to
* the bin it belongs to and not according to its own score.
*/
list<Sequence> getBestReads(size_t max_nb, size_t min_score=0) const;
/**
* Set the label of the statistics
*/
......@@ -193,7 +207,7 @@ public:
/**
* @param score: a non-negative score
* @return the bin in which a sequence of the given score must lie
*         (nb_bins when the score is greater than max_score).
*/
size_t scoreToBin(float score);
size_t scoreToBin(float score) const;
/**
* Search for a largest value such that the bin is not empty.
......
......@@ -3,7 +3,6 @@
#include "tools.h"
#include "windows.h"
#include "representative.h"
#include "sequenceSampler.h"
#include "segment.h"
WindowsStorage::WindowsStorage(map<string, string> &labels)
......@@ -88,10 +87,7 @@ KmerRepresentativeComputer WindowsStorage::getRepresentativeComputer(junction wi
/**
 * Sample the reads of a window according to their score.
 *
 * The selection is delegated to getBestReads() on the storage of the window,
 * so that the sample follows whatever scoring function the storage uses
 * rather than the sequence length.
 * NOTE(review): seqs_by_window is presumably a map; if so, operator[] creates
 * an empty entry for an unknown window -- confirm this is acceptable.
 *
 * @param window: the window whose reads are sampled
 * @param nb_sampled: maximal number of reads in the sample
 * @param nb_buckets: unused; kept so that existing callers still compile
 * @return at most nb_sampled reads of the window
 */
list<Sequence> WindowsStorage::getSample(junction window, size_t nb_sampled,
                                         size_t nb_buckets) {
  (void) nb_buckets; // not needed any more: sampling no longer relies on length buckets
  return seqs_by_window[window].getBestReads(nb_sampled);
}
set<Germline *> WindowsStorage::getTopGermlines(size_t top, size_t min_reads) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment