Commit 34bbfe5e authored by flothoni's avatar flothoni
Browse files

Merge branch 'feature-c/4034-genescan-cluster' of...

Merge branch 'feature-c/4034-genescan-cluster' of gitlab.inria.fr:vidjil/vidjil into feature-c/4034-genescan-cluster
parents df2098c2 163f7b53
Pipeline #150413 failed with stages
in 9 seconds
......@@ -23,6 +23,8 @@ profiling_algo:
- vidjil.cpu
- vidjil.mem.*
when: manual
only:
- /^feature-.*a.*\/.*$/
tags:
- cidocker
#ifndef AUTOMATON_H
#define AUTOMATON_H
#include <list>
#include <vector>
#include <cctype>
#include "kmerstore.h"
#include <cstdlib>
......@@ -53,7 +53,7 @@ public:
/**
* @return the information stored for this state
*/
virtual list<Info> &getInfo(void *state) = 0;
virtual vector<Info> &getInfo(void *state) = 0;
/**
* @param starting_state: the starting state for traversing the automate.
......@@ -104,7 +104,7 @@ class pointer_state {
public:
pointer_state<Info> *transitions[NB_TRANSITIONS]; /* Transitions to the 5 nt */
bool is_final;
list<Info> informations; /* != NULL when is_final */
vector<Info> informations; /* != NULL when is_final */
pointer_state():is_final(false),informations() {
for (size_t i = 0; i < NB_TRANSITIONS; i++)
......@@ -164,7 +164,7 @@ public:
/**
* @return the information stored for this state
*/
list<Info> &getInfo(void *state);
vector<Info> &getInfo(void *state);
/**
* @return the automaton initial state
......
......@@ -4,6 +4,7 @@
#include "automaton.h"
#include <stack>
#include <set>
#include <list>
//////////////////// IMPLEMENTATIONS ////////////////////
template <class Info>
......@@ -160,7 +161,7 @@ void PointerACAutomaton<Info>::build_failure_functions() {
}
template <class Info>
list<Info> &PointerACAutomaton<Info>::getInfo(void *state) {
vector<Info> &PointerACAutomaton<Info>::getInfo(void *state) {
return ((pointer_state<Info> *)state)->informations;
}
......
......@@ -178,7 +178,7 @@ namespace std {
template <>
struct hash<KmerAffect> {
size_t operator()(const KmerAffect &affect) const {
return (affect.getLabel()[0] << 8) | affect.getLength();
return (((unsigned char) affect.affect.c << 8) | (affect.getLength()));
}
};
}
......
......@@ -46,6 +46,10 @@ string Kmer::getLabel() const{
return "";
}
int Kmer::getStrand() const{
return 1;
}
size_t Kmer::getLength() const{
return 10;
}
......
......@@ -9,13 +9,13 @@
#include <math.h>
#include "bioreader.hpp"
#include "tools.h"
#include "proba.h"
using namespace std;
typedef
enum { KMER_INDEX, AC_AUTOMATON } IndexTypes;
#define MAX_PRECOMPUTED_PROBA 500 /* Precompute 500 probabilities for each index load */
class Kmer {
public:
unsigned int count;
......@@ -57,6 +57,11 @@ public:
* it only there for a reason of compatibility with KmerAffect)
*/
bool isAmbiguous() const;
/**
* @return 1 (only there for compatibility reasons with KmerAffect)
*/
int getStrand() const;
} ;
ostream &operator<<(ostream &os, const Kmer &kmer);
bool operator==(const Kmer &k1, const Kmer &k2);
......@@ -80,9 +85,8 @@ protected:
size_t nb_kmers_inserted;
size_t max_size_indexing;
bool finished_building;
map<float, vector<double> > precomputed_proba_with_system,
precomputed_proba_without_system;
ProbaPrecomputer proba;
public:
virtual ~IKmerStore();
......@@ -377,61 +381,20 @@ int IKmerStore<T>::atMostMaxSizeIndexing(int n) const {
return max_size_indexing ;
}
template<class T>
void IKmerStore<T>::precompute_proba(float index_load) {
precomputed_proba_with_system[index_load] = vector<double>(MAX_PRECOMPUTED_PROBA);
precomputed_proba_without_system[index_load] = vector<double>(MAX_PRECOMPUTED_PROBA);
vector<double> &pproba_with = precomputed_proba_with_system[index_load];
vector<double> &pproba_without = precomputed_proba_without_system[index_load];
pproba_with[0] = 1;
pproba_without[0] = 1;
for (int i = 1; i < MAX_PRECOMPUTED_PROBA; i++) {
pproba_with[i] = pproba_with[i - 1] * index_load;
pproba_without[i] = pproba_without[i - 1] * (1 - index_load);
}
}
template<class T>
double IKmerStore<T>::getProbabilityAtLeastOrAbove(const T kmer, int at_least, int length) {
if (at_least == 0) return 1.0; // even if 'length' is very small
// n: number of kmers in the sequence
int n = length - getS() + 1;
float index_load = getIndexLoad(kmer) ;
if (! precomputed_proba_without_system.count(index_load)) {
precompute_proba(index_load);
}
double proba = 0;
double probability_not_having_system;
double probability_having_system;
if (precomputed_proba_with_system.at(index_load).size() > (size_t)at_least)
probability_having_system = precomputed_proba_with_system.at(index_load)[at_least];
else
probability_having_system = pow(index_load, at_least);
if (precomputed_proba_without_system.at(index_load).size() > (size_t)n - at_least)
probability_not_having_system = precomputed_proba_without_system.at(index_load)[n-at_least];
else
probability_not_having_system = pow(1 - index_load, n - at_least);
for (int i=at_least; i<=n; i++) {
proba += nChoosek(n, i) * probability_having_system * probability_not_having_system;
probability_having_system *= index_load;
probability_not_having_system /= (1 - index_load);
}
#ifdef DEBUG_KMS_EVALUE
cerr << "e-value:\tindex_load=" << index_load << ",\tat_least=" << at_least << ",\tlength=" << length <<",\tp-value=" << proba << endl;
#endif
return proba;
int kmer_length = kmer.getLength();
if (kmer.isUnknown() || kmer.isAmbiguous() || kmer == AFFECT_NOT_UNKNOWN
|| kmer.getLength() == ~0)
kmer_length = getS();
int n = max(0, length - kmer_length + 1);
at_least = min(at_least, n);
return proba.getProba(index_load, at_least, n);
}
template<class T>
int IKmerStore<T>::getK() const {
return k;
......
#include "proba.h"
#include <math.h>
#include "tools.h"
// ProbaPrecomputer::ProbaPrecomputer() {}
void ProbaPrecomputer::precomputeSystemProba(float index_load) {
precomputed_proba_with_system[index_load] = std::vector<double>(MAX_PRECOMPUTED_PROBA);
precomputed_proba_without_system[index_load] = std::vector<double>(MAX_PRECOMPUTED_PROBA);
std::vector<double> &pproba_with = precomputed_proba_with_system[index_load];
std::vector<double> &pproba_without = precomputed_proba_without_system[index_load];
pproba_with[0] = 1;
pproba_without[0] = 1;
for (int i = 1; i < MAX_PRECOMPUTED_PROBA; i++) {
pproba_with[i] = pproba_with[i - 1] * index_load;
pproba_without[i] = pproba_without[i - 1] * (1 - index_load);
}
precomputed_proba[index_load] = std::vector<std::vector<double> >(MAX_PRECOMPUTED_PROBA);
}
void ProbaPrecomputer::precomputeProba(float index_load, int length) {
if (length < MAX_PRECOMPUTED_PROBA) {
// By definition of MAX_PRECOMPUTED_PROBA they exist
double probability_having_system = precomputed_proba_with_system.at(index_load)[length];
double probability_not_having_system = 1;
precomputed_proba[index_load][length] = std::vector<double>(length+1);
std::vector<double> &precomp_proba = precomputed_proba[index_load][length];
double proba = 0;
precomp_proba[0] = proba;
double Cnk = 1; // nChoosek(length, length);
for (int i=length; i>=1; i--) {
proba += probabilityPreviousIteration(i, length, Cnk, probability_having_system,
probability_not_having_system, index_load);
precomp_proba[i] = proba;
}
}
}
double ProbaPrecomputer::probabilityPreviousIteration(int iteration, int length, double &Cnk, double &proba_with, double &proba_without, float index_load) {
double proba = Cnk * proba_with * proba_without;
proba_with = getProbaWith(index_load, iteration-1); // Otherwise when probability are too low they are stored as 0 and we are f*****
proba_without *= (1 - index_load);
Cnk *= iteration*1. / (length-iteration+1);
return proba;
}
double ProbaPrecomputer::getProbaWith(double p, int nb) {
if (nb < MAX_PRECOMPUTED_PROBA)
return precomputed_proba_with_system.at(p)[nb];
return pow(p, nb);
}
double ProbaPrecomputer::getProba(float index_load, int at_least, int length) {
if (at_least == 0) return 1.0; // even if 'length' is very small
if (! precomputed_proba_without_system.count(index_load)) {
precomputeSystemProba(index_load);
}
if (length < MAX_PRECOMPUTED_PROBA) {
if (! precomputed_proba[index_load][length].size())
precomputeProba(index_load, length);
if (precomputed_proba[index_load][length].size() > 0) {
return precomputed_proba[index_load][length][at_least];
}
}
double proba = 0;
double probability_not_having_system;
double probability_having_system;
probability_having_system = pow(index_load, length);
probability_not_having_system = 1;
double Cnk = 1;
for (int i=length; i >= at_least; i--) {
proba += probabilityPreviousIteration(i, length, Cnk, probability_having_system,
probability_not_having_system, index_load);
}
#ifdef DEBUG_KMS_EVALUE
cerr << "e-value:\tindex_load=" << index_load << ",\tat_least=" << at_least << ",\tlength=" << length <<",\tp-value=" << proba << endl;
#endif
return proba;
}
#ifndef PROBA_H
#define PROBA_H
#include <map>
#include <vector>
#define MAX_PRECOMPUTED_PROBA 500 /* Precompute 500 probabilities for each index load */
/**
* Store probabilities for sequences whose length is < MAX_PRECOMPUTED_PROBA
*/
class ProbaPrecomputer {
private:
// Store for each index load, the probability that have been computed for
// each possiible at_least and length values. To be used as
// precomputed_proba[index_load][length][at_least]
std::map<float, std::vector<std::vector<double> > > precomputed_proba;
std::map<float, std::vector<double> > precomputed_proba_with_system, precomputed_proba_without_system;
/**
* Precompute index_load^n and (1-index_load)^n (for n=0...MAX_PRECOMPUTED_PROBA)
*/
void precomputeSystemProba(float index_load);
/**
* Precompute the results of getProba (if length < MAX_PRECOMPUTED_PROBA)
* for every possible values of at_least between 1 and length
*/
void precomputeProba(float index_load, int length);
/**
* Commpute the previous iteration for the probability (and updates the values Cnk, proba_with,
* proba_without.
*/
inline double probabilityPreviousIteration(int iteration, int length, double &Cnk, double &proba_with, double &proba_without, float index_load);
public:
/**
* @return probability that the number of kmers (with index load 'index_load')
* is 'at_least' or more in a sequence of length 'length'
*/
double getProba(float index_load, int at_least, int length);
/**
* @return p^nb (potentially pre-computed)
*/
double getProbaWith(double p, int nb);
friend void testProba1();
};
#endif
......@@ -895,12 +895,9 @@ void align_against_collection(string &read, BioReader &rep, int forbidden_rep_id
int score = dp.compute(onlyBottomTriangle, BOTTOM_TRIANGLE_SHIFT);
if (local==true){
dp.backtrack();
}
if (score > best_score)
{
{
dp.backtrack();
best_score = score ;
best_best_i = dp.best_i ;
best_best_j = dp.best_j ;
......@@ -908,9 +905,6 @@ void align_against_collection(string &read, BioReader &rep, int forbidden_rep_id
best_first_j = dp.first_j ;
box->ref_nb = r ;
box->ref_label = rep.label(r) ;
if (!local)
dp.backtrack();
box->marked_pos = dp.marked_pos_i ;
}
......
......@@ -302,7 +302,7 @@ def header_igblast_results(ff_fasta, ff_igblast):
VIDJIL_FINE = '{directory}/vidjil-algo --header-sep "#" -c designations -2 -3 -d -g {directory}/germline/homo-sapiens.g %s >> %s'
VIDJIL_KMER = '{directory}/vidjil-algo -w 20 --header-sep "#" -b out -c windows -uuuU -2 -g {directory}/germline/homo-sapiens.g %s > /dev/null ; cat out/out.segmented.vdj.fa out/out.unsegmented.vdj.fa >> %s'
VIDJIL_KMER = '{directory}/vidjil-algo -w 20 --header-sep "#" -b out -c windows -uuuU -2 -g {directory}/germline/homo-sapiens.g %s > /dev/null ; cat out/out.detected.vdj.fa out/out.undetected.vdj.fa >> %s'
def should_results_from_vidjil_output(f_log):
'''
......
......@@ -12,10 +12,10 @@ $ Find the good statistics on TRG
$ Find the good length statistics
1: SEG -> .* 84.0
$ Four segmented on forward
$ Four detected on forward
1:SEG_\+ -> 4
$ Three segmented on reverse (Junc#01 doesn't have its rc)
$ Three detected on reverse (Junc#01 doesn't have its rc)
1:SEG_- -> 3
$ No too short window
......
!LAUNCH: $VIDJIL_DIR/$EXEC -r 1 -k 4 -w 20 -z 0 -c clones -V $VIDJIL_DATA/toy_V.fa -J $VIDJIL_DATA/toy_J.fa $VIDJIL_DATA/ambiguous_representative.fa
$ Short reads properly segmented
$ Short reads properly detected
1:SEG_+.* -> .* 4
$ Window found
......
!LAUNCH: $VIDJIL_DIR/$EXEC --header-sep FA -k 16 -z 0 -w 60 -r 5 -o out2 -uuu -U -v -V $VIDJIL_DIR/germline/homo-sapiens/IGHV.fa -J $VIDJIL_DIR/germline/homo-sapiens/IGHJ.fa $VIDJIL_DATA/Stanford_S22.fasta ; tail out2/Stanford_S22.segmented.vdj.fa ; grep UNSEG out2/Stanford_S22.unsegmented.vdj.fa
!LAUNCH: $VIDJIL_DIR/$EXEC --header-sep FA -k 16 -z 0 -w 60 -r 5 -o out2 -uuu -U -v -V $VIDJIL_DIR/germline/homo-sapiens/IGHV.fa -J $VIDJIL_DIR/germline/homo-sapiens/IGHJ.fa $VIDJIL_DATA/Stanford_S22.fasta ; tail out2/Stanford_S22.detected.vdj.fa ; grep UNSEG out2/Stanford_S22.undetected.vdj.fa
# Testing uncommon and debug options
$ verbose (-v)
2:auditioned sequences
$ segmented.fa (-U)
$ detected.fa (-U)
2:>lcl.FLN1.* VJ .* seed custom SEG_[+]
$ unsegmented.fa (-u)
$ undetected.fa (-u)
3:>lcl.FLN1.* UNSEG
$ fasta header stops before FA
......
......@@ -11,5 +11,5 @@ $ With default seeds the J is not found as it is too short
$ With different seeds we don't have the same seed for the IGHV and IGHJ
1: IGH .* ######-###### .* ########
$ The sequence is segmented
$ The sequence is detected
1: SEG -> 1
......@@ -9,7 +9,7 @@ $ From homo-sapiens.g
$ Number of reads
1:"total": \[13153\]
$ Number of segmented reads
$ Number of detected reads
1:"segmented": \[13153\]
$ Most abundant window
......
......@@ -3,13 +3,13 @@
!LAUNCH: diff out/Stanford_S22{,.rc}.vidjil | grep GGG && diff vidjil_s22.log vidjil_s22_rc.log
!EXIT_CODE: 1
$ Same number segmented
0:==> segmented
$ Same number detected
0:==> detected
$ Same number of windows found
0:==> found
$ Same number unsegmented
$ Same number undetected
0:UNSEG.*->
$ Keep the same number of windows
......
......@@ -6,7 +6,7 @@ $ Just one window found
$ Just one clone found
1:==> 1 clones
$ Two reads segmented on TRA+D
$ Two reads detected on TRA+D
1:TRA.D -> 2
$ One clone for the two reads, Vd1/Ja29
......
......@@ -3,6 +3,6 @@
$ Segment 6 reads, thanks to -i
1:junction detected in 6 reads
$ 4 reads are segmented on TRD+ (with -w 10, otherwise the reads are too short)
$ 4 reads are detected on TRD+ (with -w 10, otherwise the reads are too short)
1: TRD[+] .*->.* 4
!LAUNCH: $VIDJIL_DIR/$EXEC -y 0 --trim 1 -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH -x 100 $VIDJIL_DATA/Stanford_S22.fasta
$ No read segmented as we have no germline because of the --trim
$ No read detected as we have no germline because of the --trim
1: UNSEG too few V/J -> 100
!LAUNCH: $VIDJIL_DIR/$EXEC -g $VIDJIL_DIR/germline/homo-sapiens.g:TRA,TRB,TRD,TRG,IGH,IGK,IGL -uuu $VIDJIL_DATA/segmentation-2.fa ; cat out/segmentation-2.unsegmented.vdj.fa
!LAUNCH: $VIDJIL_DIR/$EXEC -g $VIDJIL_DIR/germline/homo-sapiens.g:TRA,TRB,TRD,TRG,IGH,IGK,IGL -uuu $VIDJIL_DATA/segmentation-2.fa ; cat out/segmentation-2.undetected.vdj.fa
$ Only one sequence is segmented, but it is too small for a window (too short w)
$ Only one sequence is detected, but it is too small for a window (too short w)
1: junction detected in 1 reads
1: found 0 windows in 0 reads
......@@ -13,7 +13,7 @@ $ The proper unsegmentation cause is given
1: UNSEG ambiguous -> .* 1
1: UNSEG too short w -> .* 1
$ The proper unsegmentation cause is given in the .unsegmented.vdj.fa file (-uuu)
$ The proper cause is given in the .undetected.vdj.fa file (-uuu)
1: >too_short .* UNSEG too short
1: >strand .* UNSEG strand
3: >too_few_vj-..* UNSEG too few V/J
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment