Attention une mise à jour du service Gitlab va être effectuée le mardi 30 novembre entre 17h30 et 18h00. Cette mise à jour va générer une interruption du service dont nous ne maîtrisons pas complètement la durée mais qui ne devrait pas excéder quelques minutes. Cette mise à jour intermédiaire en version 14.0.12 nous permettra de rapidement pouvoir mettre à votre disposition une version plus récente.

Commit a6383d82 authored by Mikaël Salson's avatar Mikaël Salson
Browse files

algo/: Use the same scorer for read filtering and representative computation

This was put in two different places and could lead to inconsistencies. Now the
reads are necessarily selected in the same way.
parent fb575f32
......@@ -81,7 +81,7 @@ KmerRepresentativeComputer::KmerRepresentativeComputer(list<Sequence> &r,
string seed)
:RepresentativeComputer(r),seed(seed),stability_limit(DEFAULT_STABILITY_LIMIT){}
void KmerRepresentativeComputer::compute(bool try_hard) {
void KmerRepresentativeComputer::compute(VirtualReadScore &readScorer, bool try_hard) {
assert(coverage_reference_length > 0);
assert(required.length() > 0);
is_computed = false;
......@@ -115,9 +115,7 @@ void KmerRepresentativeComputer::compute(bool try_hard) {
}
// Create a read chooser to have the sequences sorted on the criteria we want
VirtualReadScore *rlc = new RandomScore();
ReadChooser rc(sequences, *rlc);
delete rlc;
ReadChooser rc(sequences, readScorer);
// Traverse the sequences to get the desired representative
size_t pos_longest_run = 0;
......@@ -219,7 +217,7 @@ void KmerRepresentativeComputer::compute(bool try_hard) {
if (coverage < THRESHOLD_BAD_COVERAGE && ! try_hard) {
compute(true);
compute(readScorer, true);
delete index[0];
if (cover_longest_run)
......
......@@ -5,6 +5,7 @@
#include <list>
#include "bioreader.hpp"
#include "kmerstore.h"
#include "read_score.h"
using namespace std;
......@@ -13,6 +14,8 @@ using namespace std;
#define THRESHOLD_BAD_COVERAGE .5 /* Threshold below which the representatie
coverage is considered bad */
static ReadQualityScore DEFAULT_READ_SCORE;
/**
* Compute a representative sequence from a list of sequences.
* The sequences are supposed to share a common juction.
......@@ -57,7 +60,8 @@ public:
* Compute the representative depending on the parameters set by the functions
* @pre setRequiredSequence() must have been called (with a non-empty string).
*/
virtual void compute(bool try_hard=false) = 0;
virtual void compute(VirtualReadScore & readScorer = DEFAULT_READ_SCORE,
bool try_hard=false) = 0;
/**
* @param min_cover: minimal number of reads supporting each position of the
......@@ -129,7 +133,8 @@ public:
/**
* @pre setCoverageReferenceLength() must have been called previously
*/
void compute(bool try_hard = false);
void compute(VirtualReadScore & readScorer = DEFAULT_READ_SCORE,
bool try_hard = false);
private:
......
......@@ -20,10 +20,12 @@ WindowsStorage *WindowExtractor::extract(OnlineBioReader *reads,
size_t w,
map<string, string> &windows_labels, bool only_labeled_windows,
bool keep_unsegmented_as_clone,
double nb_expected, int nb_reads_for_evalue) {
double nb_expected, int nb_reads_for_evalue,
VirtualReadScore *scorer) {
init_stats();
WindowsStorage *windowsStorage = new WindowsStorage(windows_labels);
windowsStorage->setScorer(scorer);
windowsStorage->setMaximalNbReadsPerWindow(max_reads_per_window);
unsigned long long int bp_total = 0;
......
......@@ -11,6 +11,7 @@
#include "windows.h"
#include "read_storage.h"
#include "bioreader.hpp"
#include "read_score.h"
#define NB_BINS_CLONES 10
#define MAX_VALUE_BINS_CLONES 1000
......@@ -51,6 +52,7 @@ class WindowExtractor {
* @param only_labeled_windows: remember only windows from windows_labels
* @param nb_expected: maximal e-value of the segmentation
* @param nb_reads_for_evalue: number of reads, used for e-value computation. Can be approximate or faked.
* @param scorer: how reads are scored (only the best ones are keeped for large clones)
* @return a pointer to a WindowsStorage that will contain all the windows.
* It is a pointer so that the WindowsStorage is not duplicated.
* @post Statistics on segmentation will be provided through the getSegmentationStats() methods
......@@ -60,7 +62,8 @@ class WindowExtractor {
size_t w,
map<string, string> &windows_labels, bool only_labeled_windows=false,
bool keep_unsegmented_as_clone=false,
double nb_expected = THRESHOLD_NB_EXPECTED, int nb_reads_for_evalue = 1);
double nb_expected = THRESHOLD_NB_EXPECTED, int nb_reads_for_evalue = 1,
VirtualReadScore *scorer = &DEFAULT_READ_SCORE);
/**
* @return the average length of sequences whose segmentation has been classified as seg
......
......@@ -7,7 +7,7 @@
#include "segment.h"
WindowsStorage::WindowsStorage(map<string, string> &labels)
:windows_labels(labels),max_reads_per_window(~0),nb_bins(NB_BINS),max_value_bins(MAX_VALUE_BINS) {}
:windows_labels(labels),max_reads_per_window(~0),scorer(&DEFAULT_READ_SCORE),nb_bins(NB_BINS),max_value_bins(MAX_VALUE_BINS) {}
list<pair <junction, size_t> > &WindowsStorage::getSortedList() {
return sort_all_windows;
......@@ -73,7 +73,7 @@ KmerRepresentativeComputer WindowsStorage::getRepresentativeComputer(junction wi
repComp.setPercentCoverage(percent_cover);
repComp.setRequiredSequence(window);
repComp.setCoverageReferenceLength(getAverageLength(window));
repComp.compute();
repComp.compute(*scorer);
// We should always have a representative, because either the junction is labelled (thus setMinCover(1)), or:
// - there is at least min('min_reads_clone', 'max_auditioned') sequences in auditioned_sequences
......@@ -153,11 +153,15 @@ void WindowsStorage::setIdToAll() {
}
}
void WindowsStorage::setScorer(VirtualReadScore *scorer) {
this->scorer = scorer;
}
void WindowsStorage::add(junction window, Sequence sequence, int status, Germline *germline, list<int> extra_statuses) {
if (! hasWindow(window)) {
// First time we see that window: init
status_by_window[window].resize(STATS_SIZE);
seqs_by_window[window].init(nb_bins, max_value_bins, &scorer);
seqs_by_window[window].init(nb_bins, max_value_bins, scorer);
seqs_by_window[window].setMaxNbReadsStored(getMaximalNbReadsPerWindow());
}
......
......@@ -21,6 +21,7 @@
#include "stats.h"
#include "output.h"
#include "../lib/json_fwd.hpp"
#include "read_score.h"
#define NB_BINS 30
#define MAX_VALUE_BINS 500
......@@ -37,7 +38,7 @@ class WindowsStorage {
list<pair <junction, size_t> > sort_all_windows;
map<junction, int> id_by_window;
size_t max_reads_per_window;
ReadQualityScore scorer;
VirtualReadScore *scorer;
/* Parameters for the read storage */
size_t nb_bins;
......@@ -96,7 +97,7 @@ class WindowsStorage {
*/
KmerRepresentativeComputer getRepresentativeComputer(junction window, string seed, size_t min_cover,
float percent_cover, size_t nb_sampled);
float percent_cover, size_t nb_sampled);
/**
* @return a sample of nb_sampled sequences sharing the same window. The
......@@ -172,6 +173,11 @@ class WindowsStorage {
*/
void setMaximalNbReadsPerWindow(size_t max_reads);
/**
* Define what scorer will be used to keep the sequences and compute the representative
*/
void setScorer(VirtualReadScore *scorer);
/**
* Add a new window with its sequence.
* @param window: the window to add
......
......@@ -1056,7 +1056,8 @@ int main (int argc, char **argv)
WindowsStorage *windowsStorage = we.extract(reads, wmer_size,
windows_labels, only_labeled_windows,
keep_unsegmented_as_clone,
expected_value, nb_reads_for_evalue);
expected_value, nb_reads_for_evalue,
readScorer);
windowsStorage->setIdToAll();
size_t nb_total_reads = we.getNbReads();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment