Commit 17de34f6 authored by flothoni's avatar flothoni
Browse files

Merge branch 'dev' of gitlab.inria.fr:vidjil/vidjil into feature-s/4171-parametres-flash

parents c835e3e2 39dd01f7
Pipeline #149740 passed with stages
in 8 minutes and 28 seconds
......@@ -56,6 +56,7 @@ test_germlines:
include:
- local: '/doc/.gitlab-ci.yml'
- local: 'algo/.gitlab-ci-compilers.yml' # Stage multiple_tests
- local: 'algo/.gitlab-ci.yml' # Vidjil-algo pipelines
# Algorithm
......@@ -372,12 +373,13 @@ ff45-server-functional:
benchmark_algo:
image: gcc:6.3
before_script:
- apt-get update
- apt-get install -y time valgrind python3 wget tar
extends: .install-algo-dependencies
stage: benchmark
script:
- cd algo/tests ; python3 benchmark-releases.py -bic
- cd algo/tests ; python3 benchmark-releases.py -r 3 -bIc
artifacts:
paths:
- algo/tests/benchmark.log
when: manual
only:
- /^feature-.*a.*\/.*$/
......
.install-algo-dependencies:
before_script:
- apt-get update
- apt-get install -y time valgrind python3 wget tar
.testing-compilers:
extends: .install-algo-dependencies
stage: multiple_tests
tags:
- cidocker
before_script:
- apt-get update
- apt-get install -y time valgrind python3 wget tar
script:
- $CXX --version
- make demo data germline
......
profiling_algo:
stage: benchmark
image: gcc:9
before_script:
- apt-get update
- apt-get install -y wget python3 tar libgoogle-perftools4 libgoogle-perftools-dev google-perftools graphviz
script:
- make demo data germline
- make DEBUG="-g"
- LIB_PROFILE=$(find /usr/lib -name libprofiler.so)
- CPUPROFILE=vidjil.cpu LD_PRELOAD="$LIB_PROFILE" ./vidjil-algo -g germline -r 1 demo/LIL-L4.fastq.gz
- LIB_MALLOC=$(find /usr/lib -name libtcmalloc.so)
- HEAPPROFILE=vidjil.mem LD_PRELOAD="$LIB_MALLOC" ./vidjil-algo -g germline -r 1 demo/LIL-L4.fastq.gz
- google-pprof --lines --text vidjil-algo vidjil.cpu
- echo "###########################################"
- google-pprof --lines --text vidjil-algo vidjil.mem.*
- google-pprof --lines --pdf vidjil-algo vidjil.cpu > vidjil-cpu.pdf
- google-pprof --lines --pdf vidjil-algo vidjil.mem.* > vidjil-mem.pdf
artifacts:
paths:
- vidjil-*.pdf
- vidjil.cpu
- vidjil.mem.*
when: manual
only:
- /^feature-.*a.*\/.*$/
tags:
- cidocker
#ifndef AUTOMATON_H
#define AUTOMATON_H
#include <list>
#include <vector>
#include <cctype>
#include "kmerstore.h"
#include <cstdlib>
......@@ -53,7 +53,7 @@ public:
/**
* @return the information stored for this state
*/
virtual list<Info> &getInfo(void *state) = 0;
virtual vector<Info> &getInfo(void *state) = 0;
/**
* @param starting_state: the starting state for traversing the automate.
......@@ -104,7 +104,7 @@ class pointer_state {
public:
pointer_state<Info> *transitions[NB_TRANSITIONS]; /* Transitions to the 5 nt */
bool is_final;
list<Info> informations; /* != NULL when is_final */
vector<Info> informations; /* != NULL when is_final */
pointer_state():is_final(false),informations() {
for (size_t i = 0; i < NB_TRANSITIONS; i++)
......@@ -164,7 +164,7 @@ public:
/**
* @return the information stored for this state
*/
list<Info> &getInfo(void *state);
vector<Info> &getInfo(void *state);
/**
* @return the automaton initial state
......
......@@ -4,6 +4,7 @@
#include "automaton.h"
#include <stack>
#include <set>
#include <list>
//////////////////// IMPLEMENTATIONS ////////////////////
template <class Info>
......@@ -160,7 +161,7 @@ void PointerACAutomaton<Info>::build_failure_functions() {
}
template <class Info>
list<Info> &PointerACAutomaton<Info>::getInfo(void *state) {
vector<Info> &PointerACAutomaton<Info>::getInfo(void *state) {
return ((pointer_state<Info> *)state)->informations;
}
......
......@@ -178,7 +178,7 @@ namespace std {
template <>
struct hash<KmerAffect> {
size_t operator()(const KmerAffect &affect) const {
return (affect.getLabel()[0] << 8) | affect.getLength();
return (((unsigned char) affect.affect.c << 8) | (affect.getLength()));
}
};
}
......
......@@ -46,6 +46,10 @@ string Kmer::getLabel() const{
return "";
}
int Kmer::getStrand() const{
return 1;
}
size_t Kmer::getLength() const{
return 10;
}
......
......@@ -9,13 +9,13 @@
#include <math.h>
#include "bioreader.hpp"
#include "tools.h"
#include "proba.h"
using namespace std;
typedef
enum { KMER_INDEX, AC_AUTOMATON } IndexTypes;
#define MAX_PRECOMPUTED_PROBA 500 /* Precompute 500 probabilities for each index load */
class Kmer {
public:
unsigned int count;
......@@ -57,6 +57,11 @@ public:
* it only there for a reason of compatibility with KmerAffect)
*/
bool isAmbiguous() const;
/**
* @return 1 (only there for compatibility reasons with KmerAffect)
*/
int getStrand() const;
} ;
ostream &operator<<(ostream &os, const Kmer &kmer);
bool operator==(const Kmer &k1, const Kmer &k2);
......@@ -80,9 +85,8 @@ protected:
size_t nb_kmers_inserted;
size_t max_size_indexing;
bool finished_building;
map<float, vector<double> > precomputed_proba_with_system,
precomputed_proba_without_system;
ProbaPrecomputer proba;
public:
virtual ~IKmerStore();
......@@ -377,61 +381,20 @@ int IKmerStore<T>::atMostMaxSizeIndexing(int n) const {
return max_size_indexing ;
}
template<class T>
void IKmerStore<T>::precompute_proba(float index_load) {
precomputed_proba_with_system[index_load] = vector<double>(MAX_PRECOMPUTED_PROBA);
precomputed_proba_without_system[index_load] = vector<double>(MAX_PRECOMPUTED_PROBA);
vector<double> &pproba_with = precomputed_proba_with_system[index_load];
vector<double> &pproba_without = precomputed_proba_without_system[index_load];
pproba_with[0] = 1;
pproba_without[0] = 1;
for (int i = 1; i < MAX_PRECOMPUTED_PROBA; i++) {
pproba_with[i] = pproba_with[i - 1] * index_load;
pproba_without[i] = pproba_without[i - 1] * (1 - index_load);
}
}
template<class T>
double IKmerStore<T>::getProbabilityAtLeastOrAbove(const T kmer, int at_least, int length) {
if (at_least == 0) return 1.0; // even if 'length' is very small
// n: number of kmers in the sequence
int n = length - getS() + 1;
float index_load = getIndexLoad(kmer) ;
if (! precomputed_proba_without_system.count(index_load)) {
precompute_proba(index_load);
}
double proba = 0;
double probability_not_having_system;
double probability_having_system;
if (precomputed_proba_with_system.at(index_load).size() > (size_t)at_least)
probability_having_system = precomputed_proba_with_system.at(index_load)[at_least];
else
probability_having_system = pow(index_load, at_least);
if (precomputed_proba_without_system.at(index_load).size() > (size_t)n - at_least)
probability_not_having_system = precomputed_proba_without_system.at(index_load)[n-at_least];
else
probability_not_having_system = pow(1 - index_load, n - at_least);
for (int i=at_least; i<=n; i++) {
proba += nChoosek(n, i) * probability_having_system * probability_not_having_system;
probability_having_system *= index_load;
probability_not_having_system /= (1 - index_load);
}
#ifdef DEBUG_KMS_EVALUE
cerr << "e-value:\tindex_load=" << index_load << ",\tat_least=" << at_least << ",\tlength=" << length <<",\tp-value=" << proba << endl;
#endif
return proba;
int kmer_length = kmer.getLength();
if (kmer.isUnknown() || kmer.isAmbiguous() || kmer == AFFECT_NOT_UNKNOWN
|| kmer.getLength() == ~0)
kmer_length = getS();
int n = max(0, length - kmer_length + 1);
at_least = min(at_least, n);
return proba.getProba(index_load, at_least, n);
}
template<class T>
int IKmerStore<T>::getK() const {
return k;
......
#include "proba.h"
#include <math.h>
#include "tools.h"
// ProbaPrecomputer::ProbaPrecomputer() {}
void ProbaPrecomputer::precomputeSystemProba(float index_load) {
precomputed_proba_with_system[index_load] = std::vector<double>(MAX_PRECOMPUTED_PROBA);
precomputed_proba_without_system[index_load] = std::vector<double>(MAX_PRECOMPUTED_PROBA);
std::vector<double> &pproba_with = precomputed_proba_with_system[index_load];
std::vector<double> &pproba_without = precomputed_proba_without_system[index_load];
pproba_with[0] = 1;
pproba_without[0] = 1;
for (int i = 1; i < MAX_PRECOMPUTED_PROBA; i++) {
pproba_with[i] = pproba_with[i - 1] * index_load;
pproba_without[i] = pproba_without[i - 1] * (1 - index_load);
}
precomputed_proba[index_load] = std::vector<std::vector<double> >(MAX_PRECOMPUTED_PROBA);
}
void ProbaPrecomputer::precomputeProba(float index_load, int length) {
if (length < MAX_PRECOMPUTED_PROBA) {
// By definition of MAX_PRECOMPUTED_PROBA they exist
double probability_having_system = precomputed_proba_with_system.at(index_load)[length];
double probability_not_having_system = 1;
precomputed_proba[index_load][length] = std::vector<double>(length+1);
std::vector<double> &precomp_proba = precomputed_proba[index_load][length];
double proba = 0;
precomp_proba[0] = proba;
double Cnk = 1; // nChoosek(length, length);
for (int i=length; i>=1; i--) {
proba += probabilityPreviousIteration(i, length, Cnk, probability_having_system,
probability_not_having_system, index_load);
precomp_proba[i] = proba;
}
}
}
double ProbaPrecomputer::probabilityPreviousIteration(int iteration, int length, double &Cnk, double &proba_with, double &proba_without, float index_load) {
double proba = Cnk * proba_with * proba_without;
proba_with = getProbaWith(index_load, iteration-1); // Otherwise when probability are too low they are stored as 0 and we are f*****
proba_without *= (1 - index_load);
Cnk *= iteration*1. / (length-iteration+1);
return proba;
}
double ProbaPrecomputer::getProbaWith(double p, int nb) {
if (nb < MAX_PRECOMPUTED_PROBA)
return precomputed_proba_with_system.at(p)[nb];
return pow(p, nb);
}
double ProbaPrecomputer::getProba(float index_load, int at_least, int length) {
if (at_least == 0) return 1.0; // even if 'length' is very small
if (! precomputed_proba_without_system.count(index_load)) {
precomputeSystemProba(index_load);
}
if (length < MAX_PRECOMPUTED_PROBA) {
if (! precomputed_proba[index_load][length].size())
precomputeProba(index_load, length);
if (precomputed_proba[index_load][length].size() > 0) {
return precomputed_proba[index_load][length][at_least];
}
}
double proba = 0;
double probability_not_having_system;
double probability_having_system;
probability_having_system = pow(index_load, length);
probability_not_having_system = 1;
double Cnk = 1;
for (int i=length; i >= at_least; i--) {
proba += probabilityPreviousIteration(i, length, Cnk, probability_having_system,
probability_not_having_system, index_load);
}
#ifdef DEBUG_KMS_EVALUE
cerr << "e-value:\tindex_load=" << index_load << ",\tat_least=" << at_least << ",\tlength=" << length <<",\tp-value=" << proba << endl;
#endif
return proba;
}
#ifndef PROBA_H
#define PROBA_H
#include <map>
#include <vector>
#define MAX_PRECOMPUTED_PROBA 500 /* Precompute 500 probabilities for each index load */
/**
* Store probabilities for sequences whose length is < MAX_PRECOMPUTED_PROBA
*/
class ProbaPrecomputer {
private:
// Store for each index load, the probability that have been computed for
// each possiible at_least and length values. To be used as
// precomputed_proba[index_load][length][at_least]
std::map<float, std::vector<std::vector<double> > > precomputed_proba;
std::map<float, std::vector<double> > precomputed_proba_with_system, precomputed_proba_without_system;
/**
* Precompute index_load^n and (1-index_load)^n (for n=0...MAX_PRECOMPUTED_PROBA)
*/
void precomputeSystemProba(float index_load);
/**
* Precompute the results of getProba (if length < MAX_PRECOMPUTED_PROBA)
* for every possible values of at_least between 1 and length
*/
void precomputeProba(float index_load, int length);
/**
* Commpute the previous iteration for the probability (and updates the values Cnk, proba_with,
* proba_without.
*/
inline double probabilityPreviousIteration(int iteration, int length, double &Cnk, double &proba_with, double &proba_without, float index_load);
public:
/**
* @return probability that the number of kmers (with index load 'index_load')
* is 'at_least' or more in a sequence of length 'length'
*/
double getProba(float index_load, int at_least, int length);
/**
* @return p^nb (potentially pre-computed)
*/
double getProbaWith(double p, int nb);
friend void testProba1();
};
#endif
......@@ -895,12 +895,9 @@ void align_against_collection(string &read, BioReader &rep, int forbidden_rep_id
int score = dp.compute(onlyBottomTriangle, BOTTOM_TRIANGLE_SHIFT);
if (local==true){
dp.backtrack();
}
if (score > best_score)
{
{
dp.backtrack();
best_score = score ;
best_best_i = dp.best_i ;
best_best_j = dp.best_j ;
......@@ -908,9 +905,6 @@ void align_against_collection(string &read, BioReader &rep, int forbidden_rep_id
best_first_j = dp.first_j ;
box->ref_nb = r ;
box->ref_label = rep.label(r) ;
if (!local)
dp.backtrack();
box->marked_pos = dp.marked_pos_i ;
}
......
......@@ -457,6 +457,20 @@ void json_add_warning(json &clone, string code, string msg, string level)
clone["warn"] += { {"code", code}, {"level", level}, {"msg", msg} } ;
}
// Signal handling
bool global_interrupted;
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
void sigintHandler(int sig_num)
{
signal(SIGINT, sigintHandler);
global_interrupted = true;
}
#pragma GCC diagnostic pop
/*
Return the part of label before the star
For example:
......
......@@ -41,6 +41,7 @@ typedef string junction ;
#include <iomanip>
#include <string>
#include <cassert>
#include <signal.h>
#include <vector>
#include "bioreader.hpp"
#include "../lib/gzstream.h"
......@@ -96,6 +97,13 @@ inline int spaced_int(int *input, const string &seed) {
}
/* Signal handling */
extern bool global_interrupted;
void sigintHandler(int sig_num);
/*
Extract the gene name from a label. This take the whole part
before the star and returns it. If there is no star in the
......
......@@ -18,7 +18,8 @@ WindowsStorage *WindowExtractor::extract(OnlineBioReader *reads,
map<string, string> &windows_labels, bool only_labeled_windows,
bool keep_unsegmented_as_clone,
double nb_expected, int nb_reads_for_evalue,
VirtualReadScore *scorer) {
VirtualReadScore *scorer,
SampleOutput *output) {
init_stats();
WindowsStorage *windowsStorage = new WindowsStorage(windows_labels);
......@@ -27,8 +28,19 @@ WindowsStorage *WindowExtractor::extract(OnlineBioReader *reads,
unsigned long long int bp_total = 0;
global_interrupted = false ;
signal(SIGINT, sigintHandler);
while (reads->hasNext()) {
if (global_interrupted)
{
string msg = "Interrupted after processing " + string_of_int(nb_reads) + " reads" ;
if (output) output->add_warning("W09", msg, LEVEL_WARN);
cout << WARNING_STRING << msg << endl ;
break;
}
try {
reads->next();
}
......@@ -119,6 +131,7 @@ WindowsStorage *WindowExtractor::extract(OnlineBioReader *reads,
cout.flush() ;
}
}
signal(SIGINT, SIG_DFL);
cout << endl ;
......
......@@ -12,6 +12,7 @@
#include "read_storage.h"
#include "bioreader.hpp"
#include "read_score.h"
#include "output.h"
#define NB_BINS_CLONES 10
#define MAX_VALUE_BINS_CLONES 1000
......@@ -53,6 +54,7 @@ class WindowExtractor {
* @param nb_expected: maximal e-value of the segmentation
* @param nb_reads_for_evalue: number of reads, used for e-value computation. Can be approximate or faked.
* @param scorer: how reads are scored (only the best ones are keeped for large clones)
* @param output: global output, used here for warnings
* @return a pointer to a WindowsStorage that will contain all the windows.
* It is a pointer so that the WindowsStorage is not duplicated.
* @post Statistics on segmentation will be provided through the getSegmentationStats() methods
......@@ -63,7 +65,8 @@ class WindowExtractor {
map<string, string> &windows_labels, bool only_labeled_windows=false,
bool keep_unsegmented_as_clone=false,
double nb_expected = THRESHOLD_NB_EXPECTED, int nb_reads_for_evalue = 1,
VirtualReadScore *scorer = &DEFAULT_READ_SCORE);
VirtualReadScore *scorer = &DEFAULT_READ_SCORE,
SampleOutput *output = NULL);
/**
* @return the average length of sequences whose segmentation has been classified as seg
......
......@@ -5,10 +5,14 @@ SRC = DEST + 'src/'
BIN = DEST + 'bin/'
RUN = DEST + 'run/'
OUT = 'benchmark.log'
CURRENT = 'HEAD'
#####
WARN_RATIO = 0.10
LIMIT1e5 = '-x 100000 '
LIMIT1e4 = '-x 10000 '
LIMIT1e3 = '-x 1000 '
......@@ -23,23 +27,56 @@ CONSENSUS_NO = '-y 0 -z 0 '
CONSENSUS_ALL = '-y all -z 0 '
DESIGNATIONS = '-c designations '
BENCHS = {
'init': '-x 1 ' + MULTI + L4 + CONSENSUS_NO,
'germ': LIMIT1e5 + MULTI + L4 + '-c germlines ',
from collections import OrderedDict
'multi-0': LIMIT1e5 + MULTI + L4 + CONSENSUS_NO,
'multi-1': LIMIT1e5 + MULTI + L4 + CONSENSUS_ALL,
'multi-a': LIMIT1e3 + MULTI + L4 + DESIGNATIONS + '-z 1000',
BENCHS = OrderedDict([
('init', '-x 1 ' + MULTI + L4 + CONSENSUS_NO),
('germ', LIMIT1e5 + MULTI + L4 + '-c germlines '),
'igh-0': LIMIT1e5 + IGH + S22 + CONSENSUS_NO,
'igh-1': LIMIT1e5 + IGH + S22 + CONSENSUS_ALL,
'igh-a': LIMIT1e3 + IGH + S22 + DESIGNATIONS,
}
('multi-0', LIMIT1e5 + MULTI + L4 + CONSENSUS_NO),
('multi-1', LIMIT1e5 + MULTI + L4 + CONSENSUS_ALL),
('multi-a', LIMIT1e3 + MULTI + L4 + DESIGNATIONS + '-z 1000'),
('igh-0', LIMIT1e5 + IGH + S22 + CONSENSUS_NO),
('igh-1', LIMIT1e5 + IGH + S22 + CONSENSUS_ALL),
('igh-a', LIMIT1e3 + IGH + S22 + DESIGNATIONS),
])
COMPATIBILITY = [
('2019.03', '-c designations', '-c segment'),
]
# Notable changes that may affect speed/memory
INFOS = {
'2019.03': 'Aho by default',
'2018.07': '--analysis-filter (always 3)',
'2018.10': '--analysis-filter 1',
'2020.04': '#4287',
}
# Simple colored output
CSIm = '\033[%sm'
class ANSI:
RESET = 0
BRIGHT = 1
BLACK = 30
RED = 31
GREEN = 32
YELLOW = 33
BLUE = 34
MAGENTA = 35