Commit a710c4ae authored by Mathieu Giraud's avatar Mathieu Giraud

merge: threshold for the FineSegmenter

We check that the score of the V/J regions is at least the one of 10 matches.
This range of commits further prepares the code for an actual p-value computation.
parents 7e632481 d194a558
......@@ -23,6 +23,7 @@
#include "dynprog.h"
#include "tools.h"
#include "segment.h"
#include <cassert>
#include <list>
#include <cstdlib>
......@@ -98,6 +99,12 @@ int Cost::homo2(char xa, char xb, char y)
return ((xa == xb) && (xb == y)) ? homopolymer : MINUS_INF ;
}
double Cost::toPValue(int score)
{
// TODO: compute an actual p-value
return (score <= MIN_MATCHES * match) ? BAD_EVALUE : 1 / (double) score ;
}
DynProg::DynProg(const string &x, const string &y, DynProgMode mode, const Cost& cost, const bool reverse_x, const bool reverse_y)
{
......
......@@ -46,6 +46,11 @@ class Cost
int substitution(char a, char b);
int homo2(char xa, char xb, char y);
/**
* @return p-value of having a random alignment of the given score
*/
double toPValue(int score);
int open_insertion;
int open_deletion;
int extend_insertion;
......
......@@ -79,6 +79,25 @@ bool Segmenter::isDSegmented() const {
return dSegmented;
}
// E-values
void Segmenter::checkLeftRightEvaluesThreshold(double threshold, int strand)
{
if (threshold == NO_LIMIT_VALUE)
return ;
if (evalue_left >= threshold && evalue_right >= threshold)
because = UNSEG_TOO_FEW_ZERO ;
else if ((strand == 1 ? evalue_left : evalue_right) >= threshold)
because = UNSEG_TOO_FEW_V ;
else if ((strand == 1 ? evalue_right : evalue_left) >= threshold)
because = UNSEG_TOO_FEW_J ;
else if (evalue >= threshold) // left and right are <= threshold, but their sum is > threshold
because = UNSEG_TOO_FEW_ZERO ;
}
// Chevauchement
string Segmenter::removeChevauchement()
......@@ -427,26 +446,16 @@ void KmerSegmenter::computeSegmentation(int strand, KmerAffect before, KmerAffec
}
// E-values
pair <double, double> pvalues = kaa->getLeftRightProbabilityAtLeastOrAbove();
evalue_left = pvalues.first * multiplier ;
evalue_right = pvalues.second * multiplier ;
evalue = evalue_left + evalue_right ;
// E-value threshold
if (threshold > NO_LIMIT_VALUE && evalue >= threshold) {
// Detail the unsegmentation cause
if (evalue_left >= threshold && evalue_right >= threshold)
because = UNSEG_TOO_FEW_ZERO ;
else if ((strand == 1 ? evalue_left : evalue_right) >= threshold)
because = UNSEG_TOO_FEW_V ;
else if ((strand == 1 ? evalue_right : evalue_left) >= threshold)
because = UNSEG_TOO_FEW_J ;
else // left and right are <= threshold, but their sum is > threshold
because = UNSEG_TOO_FEW_ZERO ;
checkLeftRightEvaluesThreshold(threshold, strand);
if (because != NOT_PROCESSED)
return ;
}
// There was a good segmentation point
......@@ -458,8 +467,6 @@ void KmerSegmenter::computeSegmentation(int strand, KmerAffect before, KmerAffec
Jstart = tmp;
}
assert(because == NOT_PROCESSED);
// Yes, it is segmented
segmented = true;
reversed = (strand == -1);
......@@ -621,7 +628,7 @@ string format_del(int deletions)
return deletions ? *"(" + string_of_int(deletions) + " del)" : "" ;
}
FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c)
FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c, double threshold, int multiplier)
{
segmented = false;
dSegmented = false;
......@@ -731,33 +738,39 @@ FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c)
score_J=score_minus_J;
}
segmented = (Vend != (int) string::npos) && (Jstart != (int) string::npos) &&
(Jstart - Vend >= germline->delta_min);
if (!segmented)
/* E-values */
evalue_left = multiplier * segment_cost.toPValue(score_V[0].first);
evalue_right = multiplier * segment_cost.toPValue(score_J[0].first);
evalue = evalue_left + evalue_right ;
/* Unsegmentation causes */
if (Jstart - Vend < germline->delta_min)
{
because = NOT_PROCESSED;
info = " @" + string_of_int (Vend + FIRST_POS) + " @" + string_of_int(Jstart + FIRST_POS) ;
if (Jstart - Vend < germline->delta_min)
{
because = UNSEG_BAD_DELTA_MIN ;
}
because = UNSEG_BAD_DELTA_MIN ;
}
if (Vend == (int) string::npos)
{
because = UNSEG_TOO_FEW_V ;
}
if (Jstart == (int) string::npos)
{
because = UNSEG_TOO_FEW_J ;
}
if (Vend == (int) string::npos)
{
evalue_left = BAD_EVALUE ;
}
if (Jstart == (int) string::npos)
{
evalue_right = BAD_EVALUE ;
}
checkLeftRightEvaluesThreshold(threshold, reversed ? -1 : 1);
if (because != NOT_PROCESSED)
{
segmented = false;
info = " @" + string_of_int (Vend + FIRST_POS) + " @" + string_of_int(Jstart + FIRST_POS) ;
return ;
}
because = reversed ? SEG_MINUS : SEG_PLUS ;
/* The sequence is segmented */
segmented = true ;
because = reversed ? SEG_MINUS : SEG_PLUS ;
//overlap VJ
if(Jstart-Vend <=0){
......
......@@ -16,6 +16,7 @@
#define EXTEND_D_ZONE 5
#define MIN_D_LENGTH 5 /* If a D-REGION is smaller than this threshold, it is not output */
#define MIN_MATCHES 10 /* If a V/J-REGION does not give an alignment score with at least this number of matches, the FineSegmenter does not segment the sequence */
#define RATIO_STRAND 2 /* The ratio between the affectations in one
strand and the other, to safely attribute a
......@@ -32,6 +33,7 @@
#define JSON_REMEMBER_BEST 4 /* The number of V/D/J predictions to keep */
#define NO_LIMIT_VALUE -1
#define BAD_EVALUE 1e10
#define THRESHOLD_NB_EXPECTED 1.0 /* Threshold of the accepted expected value for number of found k-mers */
......@@ -67,6 +69,12 @@ protected:
bool reversed, segmented, dSegmented;
int because;
/**
* Compares evalue_left, evalue_right and evalue against the provided threshold
* @post some evalue is above the threshold ==> because is set to UNSEG_TOO_FEW_ZERO, UNSEG_TOO_FEW_V or UNSEG_TOO_FEW_J
*/
void checkLeftRightEvaluesThreshold(double threshold, int strand);
string removeChevauchement();
bool finishSegmentation();
bool finishSegmentationD();
......@@ -239,7 +247,8 @@ class FineSegmenter : public Segmenter
* @param seq: An object read from a FASTA/FASTQ file
* @param germline: germline used
*/
FineSegmenter(Sequence seq, Germline *germline, Cost segment_cost);
FineSegmenter(Sequence seq, Germline *germline, Cost segment_cost,
double threshold = THRESHOLD_NB_EXPECTED, int multiplier=1);
/**
* extend segmentation from VJ to VDJ
......
!LAUNCH: ../../vidjil -c segment -g ../../germline ../../data/multi-tiny.fa
$ Do not segment any of the seven reads
7:UNSEG
!LAUNCH: ../../vidjil -r 1 -k 4 -w 20 -c clones -V ../../data/toy_V.fa -J ../../data/toy_J.fa ../../data/no_representative.fa
!LAUNCH: ../../vidjil -r 1 -k 4 -w 20 -z 0 -c clones -V ../../data/toy_V.fa -J ../../data/toy_J.fa ../../data/no_representative.fa
$ Short reads properly segmented
1:SEG_+.* 4
1:SEG_+.* -> .* 4
$ Window found
1:^CACCCCCCCCCTTTTTTTCT$
2:^CACCCCCCCCCTTTTTTTCT$
$ Representative computation
1:>clone-001--custom--0000004--100%--seq.-\\[11,30\\]
>TRDD2*01_0//0_TRDJ1*01 TRD+
TATACTGATGTGTTTCATTGTG
ccttcctac
acaccgataaactcatctttggaaaaggaacccgtgtgactgtggaaccaa
>TRDD2*01_0//0_TRDD3*01 TRD+
TATACTGATGTGTTTCATTGTG
ccttcctac
actgggggatacg
CACAGTGCTACAAAACCTACAGAGACCTGTAC
>TRDV3*01_0//0_TRDD3*01 TRD+
atctctccagtaaggactgaagacagtgccacttactactgtgcctttag
actgggggatacg
CACAGTGCTACAAAACCTACAGAGACCTGTAC
!LAUNCH: $LAUNCHER ../../vidjil -z 1 -k 9 -G ../../germline/IGH -% 0.001 -r 2 -x 1000 -y 1 -c clones ../../data/Stanford_S22.fasta | sed 's/--IGH--.*VDJ\\(.*\\).$/\\1/' > vidjil_s22.log && $LAUNCHER ../../vidjil -z 1 -k 9 -G ../../germline/IGH -% 0.001 -r 2 -x 1000 -y 1 -c clones ../../data/Stanford_S22.rc.fasta | sed 's/--IGH--.*VDJ\\(.*\\).$/\\1/' > vidjil_s22_rc.log && diff vidjil_s22.log vidjil_s22_rc.log
!LAUNCH: $LAUNCHER ../../vidjil -z 1 -k 9 -G ../../germline/IGH -% 0.001 -r 2 -x 1000 -y 1 -c clones ../../data/Stanford_S22.fasta | sed 's/--IGH--.*VDJ\\(.*\\).$/\\1/' | sed 's/IGH SEG_./IGH SEG_X/' > vidjil_s22.log && $LAUNCHER ../../vidjil -z 1 -k 9 -G ../../germline/IGH -% 0.001 -r 2 -x 1000 -y 1 -c clones ../../data/Stanford_S22.rc.fasta | sed 's/--IGH--.*VDJ\\(.*\\).$/\\1/' | sed 's/IGH SEG_./IGH SEG_X/' > vidjil_s22_rc.log && diff vidjil_s22.log vidjil_s22_rc.log
!EXIT_CODE: 1
$ Same number segmented
......
......@@ -1474,7 +1474,7 @@ int main (int argc, char **argv)
KmerSegmenter *seg = kmseg.the_kseg ;
Germline *germline = seg->segmented_germline ;
FineSegmenter s(seq, germline, segment_cost);
FineSegmenter s(seq, germline, segment_cost, expected_value, nb_reads_for_evalue);
if (s.isSegmented())
{
......
>TRA--V1-1*01--J1*01
ctgtgagaga
gtatgaaagt
>TRB--V1*01--D1*01--J1-1*01
gcagccaaga
gggacagggggc
tgaacactga
>TRG--V1*01--J1*01
ctgggacagg
gaattattat
>TRD--V1*01--D1*01--J1*01
ttggggaact
gaaatagt
acaccgataa
>IGH--V1-18*01--D1-1*01--J1*01
gtgcgagaga
ggtacaactggaacgac
gctgaatact
>IGK--V1-12*01--J1*01
gtttccctcc
gtggacgttc
>IGL--V1-36*01--J1*01
tgaatggtcc
ttatgtcttc
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment