Commit 94c9f360 authored by Mathieu Giraud's avatar Mathieu Giraud
Browse files

Merge branch 'feature-a/3764-random-score' into 'dev'

Random read sample for representative

Closes #3764

See merge request !422
parents 190759e6 a5694371
Pipeline #66488 failed with stages
in 9 minutes and 31 seconds
......@@ -101,3 +101,15 @@ float ReadQualityScore::getScore(const Sequence &sequence) const {
percent_quality = GOOD_QUALITY;
return percent_quality * sequence.sequence.size() / GOOD_QUALITY;
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////// RandomScore ///////////////////////////////
////////////////////////////////////////////////////////////////////////////////
RandomScore::RandomScore(){srand(1);} // Ensures a deterministic output
RandomScore::~RandomScore(){}
float RandomScore::getScore(const Sequence &sequence) const {
UNUSED(sequence);
return rand() % 500;
}
......@@ -91,4 +91,17 @@ class ReadQualityScore: public VirtualReadScore {
*/
float getScore(const Sequence &sequence) const;
};
/**
* A very simple implementation of VirtualReadScore.
* The score is random.
*/
class RandomScore: public VirtualReadScore {
public:
RandomScore();
~RandomScore();
float getScore(const Sequence &sequence) const;
};
#endif
......@@ -81,7 +81,7 @@ KmerRepresentativeComputer::KmerRepresentativeComputer(list<Sequence> &r,
string seed)
:RepresentativeComputer(r),seed(seed),stability_limit(DEFAULT_STABILITY_LIMIT){}
void KmerRepresentativeComputer::compute(bool try_hard) {
void KmerRepresentativeComputer::compute(VirtualReadScore &readScorer, bool try_hard) {
assert(coverage_reference_length > 0);
assert(required.length() > 0);
is_computed = false;
......@@ -114,10 +114,8 @@ void KmerRepresentativeComputer::compute(bool try_hard) {
index[i]->insert(it->sequence, it->label, false, 0, seeds[i]);
}
// Create a read chooser to have the sequences sorted by length
ReadQualityScore *rlc = new ReadQualityScore();
ReadChooser rc(sequences, *rlc);
delete rlc;
// Create a read chooser to have the sequences sorted on the criteria we want
ReadChooser rc(sequences, readScorer);
// Traverse the sequences to get the desired representative
size_t pos_longest_run = 0;
......@@ -219,7 +217,7 @@ void KmerRepresentativeComputer::compute(bool try_hard) {
if (coverage < THRESHOLD_BAD_COVERAGE && ! try_hard) {
compute(true);
compute(readScorer, true);
delete index[0];
if (cover_longest_run)
......
......@@ -5,6 +5,7 @@
#include <list>
#include "bioreader.hpp"
#include "kmerstore.h"
#include "read_score.h"
using namespace std;
......@@ -13,6 +14,8 @@ using namespace std;
#define THRESHOLD_BAD_COVERAGE .5 /* Threshold below which the representatie
coverage is considered bad */
static ReadQualityScore DEFAULT_READ_SCORE;
/**
* Compute a representative sequence from a list of sequences.
* The sequences are supposed to share a common juction.
......@@ -57,7 +60,8 @@ public:
* Compute the representative depending on the parameters set by the functions
* @pre setRequiredSequence() must have been called (with a non-empty string).
*/
virtual void compute(bool try_hard=false) = 0;
virtual void compute(VirtualReadScore & readScorer = DEFAULT_READ_SCORE,
bool try_hard=false) = 0;
/**
* @param min_cover: minimal number of reads supporting each position of the
......@@ -129,7 +133,8 @@ public:
/**
* @pre setCoverageReferenceLength() must have been called previously
*/
void compute(bool try_hard = false);
void compute(VirtualReadScore & readScorer = DEFAULT_READ_SCORE,
bool try_hard = false);
private:
......
......@@ -20,10 +20,12 @@ WindowsStorage *WindowExtractor::extract(OnlineBioReader *reads,
size_t w,
map<string, string> &windows_labels, bool only_labeled_windows,
bool keep_unsegmented_as_clone,
double nb_expected, int nb_reads_for_evalue) {
double nb_expected, int nb_reads_for_evalue,
VirtualReadScore *scorer) {
init_stats();
WindowsStorage *windowsStorage = new WindowsStorage(windows_labels);
windowsStorage->setScorer(scorer);
windowsStorage->setMaximalNbReadsPerWindow(max_reads_per_window);
unsigned long long int bp_total = 0;
......
......@@ -11,6 +11,7 @@
#include "windows.h"
#include "read_storage.h"
#include "bioreader.hpp"
#include "read_score.h"
#define NB_BINS_CLONES 10
#define MAX_VALUE_BINS_CLONES 1000
......@@ -51,6 +52,7 @@ class WindowExtractor {
* @param only_labeled_windows: remember only windows from windows_labels
* @param nb_expected: maximal e-value of the segmentation
* @param nb_reads_for_evalue: number of reads, used for e-value computation. Can be approximate or faked.
* @param scorer: how reads are scored (only the best ones are keeped for large clones)
* @return a pointer to a WindowsStorage that will contain all the windows.
* It is a pointer so that the WindowsStorage is not duplicated.
* @post Statistics on segmentation will be provided through the getSegmentationStats() methods
......@@ -60,7 +62,8 @@ class WindowExtractor {
size_t w,
map<string, string> &windows_labels, bool only_labeled_windows=false,
bool keep_unsegmented_as_clone=false,
double nb_expected = THRESHOLD_NB_EXPECTED, int nb_reads_for_evalue = 1);
double nb_expected = THRESHOLD_NB_EXPECTED, int nb_reads_for_evalue = 1,
VirtualReadScore *scorer = &DEFAULT_READ_SCORE);
/**
* @return the average length of sequences whose segmentation has been classified as seg
......
......@@ -7,7 +7,7 @@
#include "segment.h"
WindowsStorage::WindowsStorage(map<string, string> &labels)
:windows_labels(labels),max_reads_per_window(~0),nb_bins(NB_BINS),max_value_bins(MAX_VALUE_BINS) {}
:windows_labels(labels),max_reads_per_window(~0),scorer(&DEFAULT_READ_SCORE),nb_bins(NB_BINS),max_value_bins(MAX_VALUE_BINS) {}
list<pair <junction, size_t> > &WindowsStorage::getSortedList() {
return sort_all_windows;
......@@ -73,7 +73,7 @@ KmerRepresentativeComputer WindowsStorage::getRepresentativeComputer(junction wi
repComp.setPercentCoverage(percent_cover);
repComp.setRequiredSequence(window);
repComp.setCoverageReferenceLength(getAverageLength(window));
repComp.compute();
repComp.compute(*scorer);
// We should always have a representative, because either the junction is labelled (thus setMinCover(1)), or:
// - there is at least min('min_reads_clone', 'max_auditioned') sequences in auditioned_sequences
......@@ -153,11 +153,15 @@ void WindowsStorage::setIdToAll() {
}
}
void WindowsStorage::setScorer(VirtualReadScore *scorer) {
this->scorer = scorer;
}
void WindowsStorage::add(junction window, Sequence sequence, int status, Germline *germline, list<int> extra_statuses) {
if (! hasWindow(window)) {
// First time we see that window: init
status_by_window[window].resize(STATS_SIZE);
seqs_by_window[window].init(nb_bins, max_value_bins, &scorer);
seqs_by_window[window].init(nb_bins, max_value_bins, scorer);
seqs_by_window[window].setMaxNbReadsStored(getMaximalNbReadsPerWindow());
}
......
......@@ -21,6 +21,7 @@
#include "stats.h"
#include "output.h"
#include "../lib/json_fwd.hpp"
#include "read_score.h"
#define NB_BINS 30
#define MAX_VALUE_BINS 500
......@@ -37,7 +38,7 @@ class WindowsStorage {
list<pair <junction, size_t> > sort_all_windows;
map<junction, int> id_by_window;
size_t max_reads_per_window;
ReadQualityScore scorer;
VirtualReadScore *scorer;
/* Parameters for the read storage */
size_t nb_bins;
......@@ -96,7 +97,7 @@ class WindowsStorage {
*/
KmerRepresentativeComputer getRepresentativeComputer(junction window, string seed, size_t min_cover,
float percent_cover, size_t nb_sampled);
float percent_cover, size_t nb_sampled);
/**
* @return a sample of nb_sampled sequences sharing the same window. The
......@@ -172,6 +173,11 @@ class WindowsStorage {
*/
void setMaximalNbReadsPerWindow(size_t max_reads);
/**
* Define what scorer will be used to keep the sequences and compute the representative
*/
void setScorer(VirtualReadScore *scorer);
/**
* Add a new window with its sequence.
* @param window: the window to add
......
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -w 20 -g $VIDJIL_DIR/germline/homo-sapiens.g:TRG $VIDJIL_DATA/test-random-consensus.fa.gz > consensus-default.log
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -w 20 -g $VIDJIL_DIR/germline/homo-sapiens.g:TRG --consensus-on-random-sample $VIDJIL_DATA/test-random-consensus.fa.gz > consensus-random.log
!LAUNCH: diff consensus-default.log consensus-random.log
!EXIT_CODE: 1
$ Output should differ: default has a consensus of 52bp (with the spurious insertion)
# Appears twice in the header of the consensus sequence and in the similarity matrix
2:^< .* 52 bp
1:^< CTTTT
$ With random read sample the consensus should not have the spurious insertion (49 bp)
2:^> .* 49 bp
......@@ -453,6 +453,14 @@ int main (int argc, char **argv)
"maximal number of reads to process ('" NO_LIMIT "': no limit, default), sampled reads")
-> group(group) -> transform(string_NO_LIMIT);
VirtualReadScore *readScorer = &DEFAULT_READ_SCORE;
RandomScore randomScore;
app.add_flag_function("--consensus-on-random-sample",
[&readScorer, &randomScore](size_t n) {
UNUSED(n);
readScorer = &randomScore;
}, "for large clones, use a random sample of reads to compute the consensus sequence (instead of a sample of the longest and highest quality reads)")
->group(group) -> level();
// ----------------------------------------------------------------------------------------------------------------------
group = "Clone analysis (second pass)";
......@@ -1056,7 +1064,8 @@ int main (int argc, char **argv)
WindowsStorage *windowsStorage = we.extract(reads, wmer_size,
windows_labels, only_labeled_windows,
keep_unsegmented_as_clone,
expected_value, nb_reads_for_evalue);
expected_value, nb_reads_for_evalue,
readScorer);
windowsStorage->setIdToAll();
size_t nb_total_reads = we.getNbReads();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment