Commit eedefd37 authored by Thonier Florian's avatar Thonier Florian

Merge branch 'dev' of gitlab.inria.fr:vidjil/vidjil into feature-s/3555-erreur-si-clones-null

parents f11ae76e 1fcf5a1c
Pipeline #72285 passed with stages
in 9 minutes and 6 seconds
......@@ -223,7 +223,7 @@ test_browser-functional:
- make -C browser
- source /etc/profile.d/rvm.sh
- rvm use 2.6.1
- HEADLESS=1 make -C browser/test functional
- HEADLESS=1 make -C browser/test functional BROWSERS=--browsers-from-file
artifacts:
paths:
- browser/
......
# Becomes ../Makefile in a release
# algo/Makefile.algo (git)
# /Makefile (in a release)
.PHONY: all germline vidjil-algo demo test
......
......@@ -61,12 +61,13 @@ int KmerAffectAnalyser::count(const KmerAffect &affect) const{
int KmerAffectAnalyser::minimize(const KmerAffect &affect, int margin, int width) const {
int i = margin ;
int i_stop = MIN(affectations.size() - margin - kms.getS(), seq.length() - width);
uint64_t val_max = 0 ;
int i_max = NO_MINIMIZING_POSITION ;
for (vector<KmerAffect>::const_iterator it = affectations.begin() + margin;
it < affectations.end() - margin && i <= (int) seq.length() - width;
i <= i_stop;
it++, i++) {
......@@ -83,7 +84,7 @@ int KmerAffectAnalyser::minimize(const KmerAffect &affect, int margin, int width
if (i_max == NO_MINIMIZING_POSITION)
return i_max ;
return i_max + (seq.length() - affectations.size() + 1) / 2;
return i_max + kms.getS() / 2;
}
......@@ -217,10 +218,16 @@ affect_infos KmerAffectAnalyser::getMaximum(const KmerAffect &before,
results.nb_before_right++;
}
left_evalue = kms.getProbabilityAtLeastOrAbove(before,
KmerAffect left_affect = before;
KmerAffect right_affect = after;
if (kms.multiple_in_one) {
left_affect = AFFECT_NOT_UNKNOWN;
right_affect = AFFECT_NOT_UNKNOWN;
}
left_evalue = kms.getProbabilityAtLeastOrAbove(left_affect,
results.nb_before_left,
1 + results.last_pos_max);
right_evalue = kms.getProbabilityAtLeastOrAbove(after,
right_evalue = kms.getProbabilityAtLeastOrAbove(right_affect,
results.nb_after_right,
seq.size() - 1 - results.first_pos_max);
......@@ -244,7 +251,11 @@ affect_infos KmerAffectAnalyser::getMaximum(const KmerAffect &before,
double KmerAffectAnalyser::getProbabilityAtLeastOrAbove(const KmerAffect &kmer, int at_least) const {
return kms.getProbabilityAtLeastOrAbove(kmer, at_least, seq.size());
KmerAffect affect = kmer;
if (kms.multiple_in_one) {
affect = AFFECT_NOT_UNKNOWN;
}
return kms.getProbabilityAtLeastOrAbove(affect, at_least, seq.size());
}
pair <double, double> KmerAffectAnalyser::getLeftRightProbabilityAtLeastOrAbove() const {
......
......@@ -169,12 +169,14 @@ void PointerACAutomaton<Info>::insert(const seqtype &seq, Info info) {
pointer_state<Info> *state = getInitialState();
size_t seq_length = seq.length();
size_t i;
bool existing_final = true;
for (i = 0; i < seq_length && state->transition(seq[i]) != NULL; i++) {
state = state->transition(seq[i]);
}
if (i < seq_length) {
existing_final = false;
// Need to create more states
for (; i < seq_length; i++) {
pointer_state<Info> *new_state = new pointer_state<Info>();
......@@ -183,16 +185,14 @@ void PointerACAutomaton<Info>::insert(const seqtype &seq, Info info) {
}
}
state->is_final = true;
if (state->informations.front().isNull()) {
if (! existing_final) {
this->nb_kmers_inserted++;
this->kmers_inserted[info]++;
state->informations.front() += info;
} else {
if (this->multiple_info)
state->informations.push_back(info);
else
state->informations.front() += info;
}
if (state->informations.front().isNull() || ! this->multiple_info)
state->informations.front() += info;
else
state->informations.push_back(info);
}
template <class Info>
......
/*
This file is part of Vidjil <http://www.vidjil.org>
Copyright (C) 2011-2017 by Bonsai bioinformatics
Copyright (C) 2011-2019 by VidjilNet consortium and Bonsai bioinformatics
at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
Contributors:
Mathieu Giraud <mathieu.giraud@vidjil.org>
......
/*
This file is part of Vidjil <http://www.vidjil.org>
Copyright (C) 2011-2017 by Bonsai bioinformatics
Copyright (C) 2011-2019 by VidjilNet consortium and Bonsai bioinformatics
at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
Contributors:
Mathieu Giraud <mathieu.giraud@vidjil.org>
......
/*
This file is part of Vidjil <http://www.vidjil.org>
Copyright (C) 2011-2017 by Bonsai bioinformatics
Copyright (C) 2011-2019 by VidjilNet consortium and Bonsai bioinformatics
at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
Contributors:
Mathieu Giraud <mathieu.giraud@vidjil.org>
......
/*
This file is part of Vidjil <http://www.vidjil.org>
Copyright (C) 2011-2017 by Bonsai bioinformatics
Copyright (C) 2011-2019 by VidjilNet consortium and Bonsai bioinformatics
at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
Contributors:
Mathieu Giraud <mathieu.giraud@vidjil.org>
......
/*
This file is part of Vidjil <http://www.vidjil.org>
Copyright (C) 2011-2017 by Bonsai bioinformatics
Copyright (C) 2011-2019 by VidjilNet consortium and Bonsai bioinformatics
at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
Contributors:
Mathieu Giraud <mathieu.giraud@vidjil.org>
......
/*
This file is part of Vidjil <http://www.vidjil.org>
Copyright (C) 2011-2017 by Bonsai bioinformatics
Copyright (C) 2011-2019 by VidjilNet consortium and Bonsai bioinformatics
at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
Contributors:
Mathieu Giraud <mathieu.giraud@vidjil.org>
......
/*
This file is part of Vidjil <http://www.vidjil.org>
Copyright (C) 2011-2017 by Bonsai bioinformatics
Copyright (C) 2011-2019 by VidjilNet consortium and Bonsai bioinformatics
at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
Contributors:
Mathieu Giraud <mathieu.giraud@vidjil.org>
......
......@@ -387,6 +387,7 @@ void MultiGermline::build_with_one_index(string seed, bool set_index)
index = KmerStoreFactory<KmerAffect>::createIndex(indexType, expand_seed(seed), rc);
index->refs = 1;
insert_in_one_index(index, set_index);
index->multiple_in_one = true;
}
void MultiGermline::finish() {
......
/*
This file is part of Vidjil <http://www.vidjil.org>
Copyright (C) 2011-2017 by Bonsai bioinformatics
Copyright (C) 2011-2019 by VidjilNet consortium and Bonsai bioinformatics
at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
Contributors:
Mathieu Giraud <mathieu.giraud@vidjil.org>
......
/*
This file is part of Vidjil <http://www.vidjil.org>
Copyright (C) 2011-2017 by Bonsai bioinformatics
Copyright (C) 2011-2019 by VidjilNet consortium and Bonsai bioinformatics
at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
Contributors:
Mathieu Giraud <mathieu.giraud@vidjil.org>
......
......@@ -82,6 +82,7 @@ public:
static int last_id;
int id; // id of this index
int refs; // number of germlines using this index
bool multiple_in_one;
list< pair <T, BioReader> > labels;
......@@ -205,6 +206,7 @@ IKmerStore<T>::IKmerStore() {
id = ++last_id;
refs = 0;
finished_building = false;
multiple_in_one = false;
}
template<class T> int IKmerStore<T>::last_id = 0;
......
......@@ -4,6 +4,35 @@
#include <cstdlib>
#include "tools.h"
#include "lib/json.hpp"
using nlohmann::json;
json load_into_map_from_json(map <string, string> &the_map, string json_file)
{
if (!json_file.size())
return {};
cout << " <== " << json_file << endl ;
std::ifstream json_file_stream(json_file);
json j;
json_file_stream >> j;
json jj = j["config"]["labels"] ;
int n = 0;
for(json::iterator label = jj.begin(); label != jj.end(); ++label) {
string name = (*label)["name"].get<std::string>();
string sequence = (*label)["sequence"].get<std::string>();
the_map[sequence] = name;
n++ ;
}
cout << " ==> " << n << " labels" << endl;
return jj;
}
void load_into_map(map <string, string> &the_map, string map_file, string default_value)
{
......
......@@ -7,4 +7,4 @@
#include "bioreader.hpp"
void load_into_map(map <string, string> &the_map, string map_file, string default_value);
json load_into_map_from_json(map <string, string> &the_map, string json_file);
......@@ -105,6 +105,7 @@ SampleOutput::~SampleOutput()
void SampleOutput::out(ostream &s)
{
UNUSED(s);
}
void SampleOutput::addClone(junction junction, CloneOutput *clone)
......
......@@ -101,3 +101,15 @@ float ReadQualityScore::getScore(const Sequence &sequence) const {
percent_quality = GOOD_QUALITY;
return percent_quality * sequence.sequence.size() / GOOD_QUALITY;
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////// RandomScore ///////////////////////////////
////////////////////////////////////////////////////////////////////////////////
RandomScore::RandomScore(){srand(1);} // Ensures a deterministic output
RandomScore::~RandomScore(){}
float RandomScore::getScore(const Sequence &sequence) const {
UNUSED(sequence);
return rand() % 500;
}
......@@ -91,4 +91,17 @@ class ReadQualityScore: public VirtualReadScore {
*/
float getScore(const Sequence &sequence) const;
};
/**
* A very simple implementation of VirtualReadScore.
* The score is random.
*/
class RandomScore: public VirtualReadScore {
public:
RandomScore();
~RandomScore();
float getScore(const Sequence &sequence) const;
};
#endif
......@@ -81,7 +81,7 @@ KmerRepresentativeComputer::KmerRepresentativeComputer(list<Sequence> &r,
string seed)
:RepresentativeComputer(r),seed(seed),stability_limit(DEFAULT_STABILITY_LIMIT){}
void KmerRepresentativeComputer::compute(bool try_hard) {
void KmerRepresentativeComputer::compute(VirtualReadScore &readScorer, bool try_hard) {
assert(coverage_reference_length > 0);
assert(required.length() > 0);
is_computed = false;
......@@ -114,10 +114,8 @@ void KmerRepresentativeComputer::compute(bool try_hard) {
index[i]->insert(it->sequence, it->label, false, 0, seeds[i]);
}
// Create a read chooser to have the sequences sorted by length
ReadQualityScore *rlc = new ReadQualityScore();
ReadChooser rc(sequences, *rlc);
delete rlc;
// Create a read chooser to have the sequences sorted on the criteria we want
ReadChooser rc(sequences, readScorer);
// Traverse the sequences to get the desired representative
size_t pos_longest_run = 0;
......@@ -219,7 +217,7 @@ void KmerRepresentativeComputer::compute(bool try_hard) {
if (coverage < THRESHOLD_BAD_COVERAGE && ! try_hard) {
compute(true);
compute(readScorer, true);
delete index[0];
if (cover_longest_run)
......
......@@ -5,6 +5,7 @@
#include <list>
#include "bioreader.hpp"
#include "kmerstore.h"
#include "read_score.h"
using namespace std;
......@@ -13,6 +14,8 @@ using namespace std;
#define THRESHOLD_BAD_COVERAGE .5 /* Threshold below which the representatie
coverage is considered bad */
static ReadQualityScore DEFAULT_READ_SCORE;
/**
* Compute a representative sequence from a list of sequences.
* The sequences are supposed to share a common juction.
......@@ -57,7 +60,8 @@ public:
* Compute the representative depending on the parameters set by the functions
* @pre setRequiredSequence() must have been called (with a non-empty string).
*/
virtual void compute(bool try_hard=false) = 0;
virtual void compute(VirtualReadScore & readScorer = DEFAULT_READ_SCORE,
bool try_hard=false) = 0;
/**
* @param min_cover: minimal number of reads supporting each position of the
......@@ -129,7 +133,8 @@ public:
/**
* @pre setCoverageReferenceLength() must have been called previously
*/
void compute(bool try_hard = false);
void compute(VirtualReadScore & readScorer = DEFAULT_READ_SCORE,
bool try_hard = false);
private:
......
/*
This file is part of Vidjil <http://www.vidjil.org>
Copyright (C) 2011-2017 by Bonsai bioinformatics
Copyright (C) 2011-2019 by VidjilNet consortium and Bonsai bioinformatics
at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
Contributors:
Mathieu Giraud <mathieu.giraud@vidjil.org>
......@@ -361,7 +361,7 @@ string Segmenter::getInfoLine() const
{
string s = "" ;
s += (segmented ? "" : "! ") + info ;
s += (segmented ? "" : "\t ! ") + info ;
s += " " + info_extra ;
s += " " + segmented_germline->code ;
s += " " + string(segmented_mesg[because]) ;
......@@ -759,7 +759,7 @@ void Segmenter::setSegmentationStatus(int status) {
string check_and_resolve_overlap(string seq, int seq_begin, int seq_end,
AlignBox *box_left, AlignBox *box_right,
Cost segment_cost)
Cost segment_cost, bool reverse_V, bool reverse_J)
{
// Overlap size
int overlap = box_left->end - box_right->start + 1;
......@@ -773,7 +773,7 @@ string check_and_resolve_overlap(string seq, int seq_begin, int seq_end,
int score_l[overlap+1];
//LEFT
DynProg dp_l = DynProg(seq_left, box_left->ref,
DynProg dp_l = DynProg(seq_left, revcomp(box_left->ref, reverse_V),
DynProg::Local, segment_cost);
score_l[0] = dp_l.compute();
......@@ -781,6 +781,7 @@ string check_and_resolve_overlap(string seq, int seq_begin, int seq_end,
//RIGHT
// reverse right sequence
string ref_right=string(box_right->ref.rbegin(), box_right->ref.rend());
ref_right = revcomp(ref_right, reverse_J);
seq_right=string(seq_right.rbegin(), seq_right.rend());
......@@ -801,11 +802,15 @@ string check_and_resolve_overlap(string seq, int seq_begin, int seq_end,
// #define DEBUG_OVERLAP
#ifdef DEBUG_OVERLAP
cout << dp_l ;
cout << dp_r ;
cout << "=== check_and_resolve_overlap" << endl;
cout << seq << endl;
cout << "boxes: " << *box_left << "/" << *box_right << endl ;
// cout << dp_l ;
// cout << dp_r ;
cout << "seq:" << seq_left << "\t\t" << seq_right << endl;
cout << "ref:" << ref_left << "\t\t" << ref_right << endl;
cout << "ref:" << box_left->ref << "\t\t" << ref_right << endl;
for(int i=0; i<=overlap; i++)
cout << i << " left: " << score_l[i] << "/" << trim_l[i] << " right: " << score_r[i] << "/" << trim_r[i] << endl;
#endif
......@@ -839,6 +844,7 @@ string check_and_resolve_overlap(string seq, int seq_begin, int seq_end,
<< " left: " << best_i << "-" << box_left->del_right << " @" << box_left->end
<< " right:" << best_j << "-" << box_right->del_left << " @" << box_right->start
<< endl;
cout << "boxes: " << *box_left << " / " << *box_right << endl ;
#endif
} // end if (overlap > 0)
......@@ -1113,7 +1119,7 @@ FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c,
//overlap VJ
seg_N = check_and_resolve_overlap(sequence_or_rc, 0, sequence_or_rc.length(),
box_V, box_J, segment_cost);
box_V, box_J, segment_cost, reverse_V, reverse_J);
// Reset extreme positions
box_V->start = 0;
......
......@@ -148,6 +148,7 @@ ostream &operator<<(ostream &out, const AlignBox &box);
* @param seq_begin, seq_end: the positions to consider on 'seq' for the two sequences that may overlap
* @param *box_left, *box_right the two boxes
* @param segment_cost: the cost used by the dynamic programing
* @param reverse_V, reverse_J should we revcomp the sequence on 5' or 3'?
*
* @post box_left->del_left and box_right->del_right are set to the best number of nucleotides to trim in order to remove the overlap.
* box_left->end and box_right->start are shifted by the good number of nucleotides
......@@ -157,7 +158,8 @@ ostream &operator<<(ostream &out, const AlignBox &box);
string check_and_resolve_overlap(string seq, int seq_begin, int seq_end,
AlignBox *box_left, AlignBox *box_right,
Cost segment_cost);
Cost segment_cost, bool reverse_V = false,
bool reverse_J = false);
class Segmenter {
protected:
......
/*
This file is part of Vidjil <http://www.vidjil.org>
Copyright (C) 2011-2017 by Bonsai bioinformatics
Copyright (C) 2011-2019 by VidjilNet consortium and Bonsai bioinformatics
at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
Contributors:
Mathieu Giraud <mathieu.giraud@vidjil.org>
......
......@@ -20,10 +20,12 @@ WindowsStorage *WindowExtractor::extract(OnlineBioReader *reads,
size_t w,
map<string, string> &windows_labels, bool only_labeled_windows,
bool keep_unsegmented_as_clone,
double nb_expected, int nb_reads_for_evalue) {
double nb_expected, int nb_reads_for_evalue,
VirtualReadScore *scorer) {
init_stats();
WindowsStorage *windowsStorage = new WindowsStorage(windows_labels);
windowsStorage->setScorer(scorer);
windowsStorage->setMaximalNbReadsPerWindow(max_reads_per_window);
unsigned long long int bp_total = 0;
......
......@@ -11,6 +11,7 @@
#include "windows.h"
#include "read_storage.h"
#include "bioreader.hpp"
#include "read_score.h"
#define NB_BINS_CLONES 10
#define MAX_VALUE_BINS_CLONES 1000
......@@ -51,6 +52,7 @@ class WindowExtractor {
* @param only_labeled_windows: remember only windows from windows_labels
* @param nb_expected: maximal e-value of the segmentation
* @param nb_reads_for_evalue: number of reads, used for e-value computation. Can be approximate or faked.
* @param scorer: how reads are scored (only the best ones are keeped for large clones)
* @return a pointer to a WindowsStorage that will contain all the windows.
* It is a pointer so that the WindowsStorage is not duplicated.
* @post Statistics on segmentation will be provided through the getSegmentationStats() methods
......@@ -60,7 +62,8 @@ class WindowExtractor {
size_t w,
map<string, string> &windows_labels, bool only_labeled_windows=false,
bool keep_unsegmented_as_clone=false,
double nb_expected = THRESHOLD_NB_EXPECTED, int nb_reads_for_evalue = 1);
double nb_expected = THRESHOLD_NB_EXPECTED, int nb_reads_for_evalue = 1,
VirtualReadScore *scorer = &DEFAULT_READ_SCORE);
/**
* @return the average length of sequences whose segmentation has been classified as seg
......
......@@ -7,7 +7,7 @@
#include "segment.h"
WindowsStorage::WindowsStorage(map<string, string> &labels)
:windows_labels(labels),max_reads_per_window(~0),nb_bins(NB_BINS),max_value_bins(MAX_VALUE_BINS) {}
:windows_labels(labels),max_reads_per_window(~0),scorer(&DEFAULT_READ_SCORE),nb_bins(NB_BINS),max_value_bins(MAX_VALUE_BINS) {}
list<pair <junction, size_t> > &WindowsStorage::getSortedList() {
return sort_all_windows;
......@@ -73,7 +73,7 @@ KmerRepresentativeComputer WindowsStorage::getRepresentativeComputer(junction wi
repComp.setPercentCoverage(percent_cover);
repComp.setRequiredSequence(window);
repComp.setCoverageReferenceLength(getAverageLength(window));
repComp.compute();
repComp.compute(*scorer);
// We should always have a representative, because either the junction is labelled (thus setMinCover(1)), or:
// - there is at least min('min_reads_clone', 'max_auditioned') sequences in auditioned_sequences
......@@ -153,11 +153,15 @@ void WindowsStorage::setIdToAll() {
}
}
void WindowsStorage::setScorer(VirtualReadScore *scorer) {
this->scorer = scorer;
}
void WindowsStorage::add(junction window, Sequence sequence, int status, Germline *germline, list<int> extra_statuses) {
if (! hasWindow(window)) {
// First time we see that window: init
status_by_window[window].resize(STATS_SIZE);
seqs_by_window[window].init(nb_bins, max_value_bins, &scorer);
seqs_by_window[window].init(nb_bins, max_value_bins, scorer);
seqs_by_window[window].setMaxNbReadsStored(getMaximalNbReadsPerWindow());
}
......
......@@ -21,6 +21,7 @@
#include "stats.h"
#include "output.h"
#include "../lib/json_fwd.hpp"
#include "read_score.h"
#define NB_BINS 30
#define MAX_VALUE_BINS 500
......@@ -37,7 +38,7 @@ class WindowsStorage {
list<pair <junction, size_t> > sort_all_windows;
map<junction, int> id_by_window;
size_t max_reads_per_window;
ReadQualityScore scorer;
VirtualReadScore *scorer;
/* Parameters for the read storage */
size_t nb_bins;
......@@ -96,7 +97,7 @@ class WindowsStorage {
*/
KmerRepresentativeComputer getRepresentativeComputer(junction window, string seed, size_t min_cover,
float percent_cover, size_t nb_sampled);
float percent_cover, size_t nb_sampled);
/**
* @return a sample of nb_sampled sequences sharing the same window. The
......@@ -172,6 +173,11 @@ class WindowsStorage {
*/
void setMaximalNbReadsPerWindow(size_t max_reads);
/**
* Define what scorer will be used to keep the sequences and compute the representative
*/
void setScorer(VirtualReadScore *scorer);
/**
* Add a new window with its sequence.
* @param window: the window to add
......
This source diff could not be displayed because it is too large. You can view the blob instead.
// From CLI11 examples
#include "CLI11.hpp"
#include "json.hpp"
// This example is only built on GCC 7 on Travis due to mismatch in stdlib
// for clang (CLI11 is forgiving about mismatches, json.hpp is not)
using nlohmann::json;
class ConfigJSON : public CLI::Config {
public:
std::string to_config(const CLI::App *app, bool default_also, bool, std::string) const override {
json j;
for(const CLI::Option *opt : app->get_options({})) {
// Only process option with a long-name and configurable
if(!opt->get_lnames().empty() && opt->get_configurable()) {
std::string name = opt->get_lnames()[0];
// Non-flags
if(opt->get_type_size() != 0) {
// If the option was found on command line
if(opt->count() == 1)
j[name] = opt->results().at(0);
else if(opt->count() > 1)
j[name] = opt->results();
// If the option has a default and is requested by optional argument
else if(default_also && !opt->get_defaultval().empty())
j[name] = opt->get_defaultval();
// Flag, one passed
} else if(opt->count() == 1) {
j[name] = true;
// Flag, multiple passed
} else if(opt->count() > 1) {
j[name] = opt->count();
// Flag, not present
} else if(opt->count() == 0 && default_also) {
j[name] = false;
}
}
}
for(const CLI::App *subcom : app->get_subcommands({}))
j[subcom->get_name()] = json(to_config(subcom, default_also, false, ""));
return j.dump(4);
}
std::vector<CLI::ConfigItem> from_config(std::istream &input) const override {
json j;
input >> j;
return _from_config(j["config"]);
}
std::vector<CLI::ConfigItem>
_from_config(json j, std::string name = "", std::vector<std::string> prefix = {}) const {
std::vector<CLI::ConfigItem> results;
if(j.is_object()) {
for(json::iterator item = j.begin(); item != j.end(); ++item) {
auto copy_prefix = prefix;
if(!name.empty())