Commit da83fb01 authored by Mathieu Giraud's avatar Mathieu Giraud
Browse files

Merge branch 'feature-a/3594-updated-evalue-computations-kmer-fine' into 'dev'

Update e-value multipliers and options

Closes #3594

See merge request !538
parents 4a8d91df d891b6dd
Pipeline #115518 passed with stages
in 11 minutes and 10 seconds
# exact sequences
>TRDV1--DD3+d-exact
ttacagctagaagattcagcaaagtacttttgtgctc
CACAGTGCTACAAAACCTACAGAGACCTGTACAAA
# 3 mutations, no 9-mer
>TRDV1--DD3+d-no-9-mer
ttacagctagaagattcagcaaagtacttttgtgctc
CACAGTGCaACAAAACCcACAGAGACgTGTACAAA
# X X X
# 3 mutations, only one 9-mer
>TRDV1--DD3+d-one-9-mer
ttacagctagaagattcagcaaagtacttttgtgctc
CACAGTGCaACAAAACCTcCAGAGACCaGTACAAA
# X====9====X X
# random J sequence but with one spurious 9-mer
>TRDV1--random
ttacagctagaagattcagcaaagtacttttgtgctc
gctgatcataACAAAACCTctgtcagtcatgagctg
!LAUNCH: $VIDJIL_DIR/$EXEC -c designations -3 -E 1.0 -g $VIDJIL_DIR/germline ../should-vdj-tests/Demo-X5.should-vdj.fa
!LAUNCH: $VIDJIL_DIR/$EXEC -c designations -3 -g $VIDJIL_DIR/germline ../should-vdj-tests/Demo-X5.should-vdj.fa
$ Detects a CDR3 on regular V(D)J recombinations
1: IGH SEG.* [{].*[}]
......
......@@ -3,7 +3,7 @@
# We launch three times a sequence of interest (buggy-D.fa) with a various
# number of distinct reads (mutations randomly inserted in the middle).
# The segmentation of one sequence should not depend on the number of the
# The designation of one sequence should not depend on the number of the
# other reads. This is what is tested, we first put 10 sequences, then 5 and
# finally just the sequence of interest alone.
......@@ -14,5 +14,5 @@
$ Three times the same window
3: TGTGCGGGATCTTCGTCCTCTTATCATAATAATGGTTTTTTGGCGGGGGAGTCATGGGGC
$ Three times the same segmentation
f3: IGHV4-34.*IGHD.*IGHJ4
$ Three times the same designation
3: IGHV4-34.*IGHD.*IGHJ4
### Testing -E (e-value threshold for D detection)
!LAUNCH: $VIDJIL_DIR/$EXEC -c designations --first-reads 1 -g $VIDJIL_DIR/germline/homo-sapiens.g:TRB ../should-vdj-tests/0000-nck-TRB.should-vdj.fa
$ Default -E value, no D here
1: TRBV6-1.* 7/AGGTGAGTCCC/2 TRBJ2-7
0: TRBD1
!LAUNCH: $VIDJIL_DIR/$EXEC -c designations --first-reads 1 -e 1e6 -g $VIDJIL_DIR/germline/homo-sapiens.g:TRB ../should-vdj-tests/0000-nck-TRB.should-vdj.fa
$ A large -e has no consequence on the default -E value, still no D here
1: TRBV6-1.* 7/AGGTGAGTCCC/2 TRBJ2-7
0: TRBD1
!LAUNCH: $VIDJIL_DIR/$EXEC -c designations --first-reads 1 -E 100 -g $VIDJIL_DIR/germline/homo-sapiens.g:TRB ../should-vdj-tests/0000-nck-TRB.should-vdj.fa
$ With -E 100, a D is detected
1: TRBV6-1.* TRBD1.* TRBJ2-7
###
!LAUNCH: $VIDJIL_DIR/$EXEC --all -g $VIDJIL_DIR/germline/homo-sapiens.g:TRD+ $VIDJIL_DATA/hard-kmers.fa
$ Detect 1 read
1:junction detected in 1 read
$ Detect the exact recombination
1:>clone.*exact
### -e-value 1e12
!LAUNCH: $VIDJIL_DIR/$EXEC --all --e-value 1e12 -g $VIDJIL_DIR/germline/homo-sapiens.g:TRD+ $VIDJIL_DATA/hard-kmers.fa
$ Detect three sequences
1:junction detected in 3 reads
$ Detect the exact recombination
1:>clone.*exact
$ Detect the recombination with only one k-mer
1:>clone.*one-9-mer
$ Sadly, detect and designate the read with a random J but with a spurious k-mer
1:>clone.*random
1:>clone.*random.*TRDV1.* 10/.*/22 TRDD3
### --e-value-kmer 1e12
!LAUNCH: $VIDJIL_DIR/$EXEC --all --e-value-kmer 1e12 -g $VIDJIL_DIR/germline/homo-sapiens.g:TRD+ $VIDJIL_DATA/hard-kmers.fa
$ Detect three sequences
1:junction detected in 3 reads
$ Detect the exact recombination
1:>clone.*exact
$ Detect the recombination with only one k-mer
1:>clone.*one-9-mer
$ Sadly, detect the read with a random J but with a spurious k-mer
1:>clone.*random
$ but... do not designate it, as the global --e-value is still the default 1.0
1:>clone.*random.*UNSEG
### --seed 12s
!LAUNCH: $VIDJIL_DIR/$EXEC -K --all --seed 12s -g $VIDJIL_DIR/germline/homo-sapiens.g:TRD+ $VIDJIL_DATA/hard-kmers.fa
$ With another seed, detect three sequences
1:junction detected in 3 reads
$ Do not detect the read with a random J
0:>clone.*random
......@@ -15,7 +15,7 @@ $ Do not display advanced options
0: custom Cost
$ Correct number of regular options
24:^..-
25:^..-
!NO_LAUNCHER:
......@@ -37,4 +37,4 @@ $ Display advanced options
: custom Cost
$ Correct number of options
54:^..-
55:^..-
!LAUNCH: $VIDJIL_DIR/$EXEC -c designations -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH,IGK,IGL $VIDJIL_DATA/multi-complete.fa ; cat out/multi-complete.vidjil | python $VIDJIL_DIR/tools/format_json.py -1
### Focusing on Ig recombinations with -g:IGH,IGK,IGL
!LAUNCH: $VIDJIL_DIR/$EXEC -c designations -e 0.1 -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH,IGK,IGL $VIDJIL_DATA/multi-complete.fa ; cat out/multi-complete.vidjil | python $VIDJIL_DIR/tools/format_json.py -1
$ Segment the Ig recombinations
$ Detect the Ig recombinations
1:IGH SEG
1:IGK SEG
1:IGL SEG
$ Do not segment the TR recombinations
$ Do not detect the TR recombinations
4:TR.* UNSEG
$ Report the unsegmented sequences in the json output
$ Report the undetected sequences in the json output
1: "germline": "not analyzed", "id": "000001", "name": "TRA
1: "germline": "not analyzed", "id": "000002", "name": "TRB
1: "germline": "not analyzed", "id": "000003", "name": "TRG
1: "germline": "not analyzed", "id": "000004", "name": "TRD
$ Count the unsegmented sequences in the json output
$ Count the undetected sequences in the json output
1: "not analyzed": .4.
$ Count the segmented sequences in the json output
$ Count the detected sequences in the json output
1: "segmented": .3.
!LAUNCH: $VIDJIL_DIR/$EXEC -c designations -e 0.1 -g $VIDJIL_DIR/germline $VIDJIL_DATA/multi-tiny.fa
$ Do not designate recombinations in these reads, they are too small
7:UNSEG
!LAUNCH: $VIDJIL_DIR/$EXEC -g $VIDJIL_DIR/germline $VIDJIL_DATA/multi-complete.fa
$ Segment all the seven reads
$ Detect recombinations in all the seven reads
1:junction detected in 7 reads
$ Segment one read on TRA
$ Detect one read on TRA
1:TRA .* -> .* 1
$ Segment one read on TRB
$ Detect one read on TRB
1:TRB .* -> .* 1
$ Segment one read on TRG
$ Detect one read on TRG
1:TRG .* -> .* 1
$ Segment one read on TRD
$ Detect one read on TRD
1:TRD .* -> .* 1
$ Segment one read on IGH
$ Detect one read on IGH
1:IGH .* -> .* 1
$ Segment one read on IGK
$ Detect one read on IGK
1:IGK .* -> .* 1
$ Segment one read on IGL
$ Detect one read on IGL
1:IGL .* -> .* 1
$ Compute the diversity. All windows have only one read, full diversity.
......@@ -29,31 +29,49 @@ $ Compute the diversity. All windows have only one read, full diversity.
1: E = 1.000
1: Ds = 1.000
### Focusing on Ig recombinations with -g:IGH,IGK,IGL
!LAUNCH: $VIDJIL_DIR/$EXEC -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH,IGK,IGL $VIDJIL_DATA/multi-complete.fa
$ Detect the Ig recombinations
1:IGH .* -> .* 1
1:IGK .* -> .* 1
1:IGL .* -> .* 1
$ Do not detect the TR recombinations
1:junction detected in 3 reads
### All loci, but shorter sequences
!LAUNCH: $VIDJIL_DIR/$EXEC -g $VIDJIL_DIR/germline $VIDJIL_DATA/multi-short.fa
$ Segment all the seven reads
$ Detect recombinations in all the seven reads
1:junction detected in 7 reads
$ Segment one read on TRA
$ Detect one read on TRA
1:TRA .* -> .* 1
$ Segment one read on TRB
$ Detect one read on TRB
1:TRB .* -> .* 1
$ Segment one read on TRG
$ Detect one read on TRG
1:TRG .* -> .* 1
$ Segment one read on TRD
$ Detect one read on TRD
1:TRD .* -> .* 1
$ Segment one read on IGH
$ Detect one read on IGH
1:IGH .* -> .* 1
$ Segment one read on IGK
$ Detect one read on IGK
1:IGK .* -> .* 1
$ Segment one read on IGL
$ Detect one read on IGL
1:IGL .* -> .* 1
### Even shorter sequences
!LAUNCH: $VIDJIL_DIR/$EXEC -g $VIDJIL_DIR/germline $VIDJIL_DATA/multi-tiny.fa
$ Do not detect recombinations in any of the seven reads, they are too small
1:UNSEG too few .* -> .* 7
!LAUNCH: $VIDJIL_DIR/$EXEC -c designations -g $VIDJIL_DIR/germline $VIDJIL_DATA/multi-tiny.fa
$ Do not segment any of the seven reads
7:UNSEG
......@@ -14,7 +14,7 @@ nnnCAAAAAGTGGTCGCTATTCTGTCAACTTCAAGAAAGCAGCGAAATCCGTCGCCTTAACCATTTCAGCCTTACAGCTA
CAAAAAGTGGTCGCTATTCTGTCAACTTCAAGAAAGCAGCGAAATCCGTCGCCTTAACCATTTCAGCCTTACAGCTAGAAGATTCAGCAAAGTACTTTTGTGCTCTTGGGGAACTAAATTTCGTGATTTGGGGGATACCTTGGGACTCATCTTTGGAAAAGGAACCCGTGTGACTGTGGAAC
# A short Dd1 could also be inserted after the TRDV1
>TRDV1*01 10/CTCCCGCAAAATTCA/2 TRDD2*01 0/TGCC/5 TRDD3*01 3/GCACTTTTCCTT/27 TRDJ1*01 [TRD] BUG
>TRDV1*01 10/CTCCCGCAAAATTCA/2 TRDD2*01 0/TGCC/5 TRDD3*01 3/GCACTTTTCCTT/27 TRDJ1*01 [TRD]
nnCAAAAAGTGGTCGCTATTCTGTCAACTTCAAGAAAGCAGCGAAATCCGTCGCCTTAACCATTTCAGCCTTACAGCTAGAAGATTCAGCAAAGTACTTTTGTGCTCCTCCCGCAAAATTCATTCCTACTGCCGGGATGCACTTTTCCTTGAACCCGTGTGACTGTGGAACANNNNNN
>TRDV1*01 1/C/0 TRDD2*01 2//1 TRDD3*01 1/AGGGT/0 TRDJ1*01 [TRD]
......@@ -24,8 +24,8 @@ CAAAAAGTGGTCGCTATTCTGTCAACTTCAAGAAAGCAGCGAAATCCGTCGCCTTAACCATTTCAGCCTTACAGCTAGAA
>TRDV1*01 2/ATGG/4 TRDD2*01 0/AGGA/0 TRDD3*01 5/TCCCTTTGT/0 TRDJ1*01 [TRD] TODO
CAAAAAGTGGTCGCTATTCTGTCAACTTCAAGAAAGCAGCGAAATCCGTCGCCTTAACCATTTCAGCCTTACAGCTAGAAGATTCAGCAAAGTACTTTTGTGCTCTTGGGGAAATGGCCTACAGGAACTGGGGGTCCCTTTGTACACCGATAAACTCATCTTTGGAAAAGGAACCCGTGTGACTGTGGAAC
>TRDV1*01 0/T/1 TRDD1*01 2/CCTTAGGGATGGTCGAGGA/2 TRDD3*01 6//4 TRDJ1*01 [TRD] BUG
>TRDV1*01 0/T/1 TRDD1*01 2/CCTTAGGGATGGTCGAGGA/2 TRDD3*01 6//4 TRDJ1*01 [TRD]
ACAAAAAGTGGTCGCTATTCTGTCAACTTCAAGAAAGCAGCGAAATCCGTCGCCTTAACCATTTCAGCCTTACAGCTAGAAGATTCAGCAAAGTACTTTTGTGCTCTTGGGGAACTTAAATACCTTAGGGATGGTCGAGGATGGGGCGATAAACTCATCTTTGGAAAAGGAACCCGTGTGACTGTGGAAC
>TRDV1*01 1/AATCACCTCTT/1 TRDD2*01 3/CCACGAAAACAT/0 TRDD3*01 2//9 TRDJ1*01 [TRD] BUG
>TRDV1*01 1/AATCACCTCTT/1 TRDD2*01 3/CCACGAAAACAT/0 TRDD3*01 2//9 TRDJ1*01 [TRD]
CAAAAAGTGGTCGCTATTCTGTCAACTTCAAGAAAGCAGCGAAATCCGTCGCCTTAACCATTTCAGCCTTACAGCTAGAAGATTCAGCAAAGTACTTTTGTGCTCTTGGGGAACAATCACCTCTTCTTCCCCACGAAAACATACTGGGGGATAAACTCATCTTTGGAAAAGGAACCCGTGTGACTGTGGAAC
>TRDV1*01 0/0/3 TRDD3 4/11/0 TRDJ1*01 [TRD] BUG
>TRDV1*01 0/0/3 TRDD3 4/11/0 TRDJ1*01 [TRD]
CAAAAAGTGGTCGCTATTCTGTCAACTTCAAGAAAGCAGCGAAATCCGTCGCCTTAACCATTTCAGCCTTACAGCTAGAAGATTCAGCAAAGTACTTTTGTGCTCTTGGGGAACTAGGGGACTGGTCCCCTTACACCGATAAACTCATCTTTGGAAAAGGAACCCGTGTGACTGTGGAAc
>TRAV29/DV5*01 0/9/0 TRDD3 3/9/2 TRDJ1*01 [TRD]
......
#This sequence may be problematic due to a long deletion ( > 100 nt)
#IGHV4-59*01 has as many errors as IGHV4-59*03 but is longer hence less desirable
#but could be ok too
>IGHV4-59*03 (116/GACGC/5, 119/GCAGACGC/5) IGHD5-12*01 2/CTAGGCTGT/4 IGHJ4*02 [IGH]
>IGHV4-59*03 (116/GACGC/5, 119/GCAGACGC/5) IGHD5-12*01 2/CTAGGCTGT/4 IGHJ4*02 [IGH] BUG
AGGTGCAGCTGGTGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGGATTTAGTACTGGAGCTGGATCCGGCAGCCCCCAGGGAGGGGACTGGAGTGGATTGATTAGATCCACTACAGTGGGAGCA
GCAGACGCTATAGGGACTACGATTCTAGGCTGT
CTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGTAAG
......
......@@ -2,6 +2,9 @@
# ||||||||||||||||||||||||||||||||||||||||
# read GGAGGTCCCTAAGACTCCCCTGTGCAGCCTCTGGATTCACTGGTCACCGTCTCCTCAGGTAAG
# |||||||||||||||||||
# IGHJ1*01 GCTGAATACTTCCAGCACTGGGGCCAGGGCACCCTGGTCACCGTCTCCTCAG
# IGHJ4 ACTACTTTGACTACTGGGGCCAAGGAACCCTGGTCACCGTCTCCTCAG
>IGHV3-30 (213//30, 214//29) IGHJ4 [IGH] BUG
# IGHJ5*02 ACAACTGGTTCGACCCCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG
# IGHJ5*01 ACAACTGGTTCGACTCCTGGGGCCAAGGAACCCTGGTCACCGTCTCCTCAG
>(IGHV3-30, IGHV3-30-3, IGHV3-30-5) (213//34 IGHJ1*01, 214//33 IGHJ1*01, 213//30 IGHJ4, 214//29 IGHJ4, 213//33 IGHJ5, 214//32 IGHJ5) [IGH]
CACAGGGGTCCAGTGTCAGGTGTAGGTGGTGGAATCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTAAGACTCCCCTGTGCAGCCTCTGGATTCACTGGTCACCGTCTCCTCAGGTAAG
......@@ -36,7 +36,7 @@ GATGCTTTTGATATCTGGGGCCAAGGGACAATGGTCACCGTCTCCTCAGGT
#target 0808TD
>IGHV2-5*01 0/GTAAAGGCGAACTTAC/16 IGHD2-2*01 7/AGATT/10 IGHJ4*02 [IGH] BUG
>IGHV2-5*01 0/GTAAAGGCGAACTTAC/16 IGHD2-2*01 7/AGATT/10 IGHJ4*02 [IGH]
GTAAAACGACGGCCAGTTCACCTTGAAGGAGTCTGGTCCTACGCTGGTGAAACCCACACAGACCCTCA
CGCTGACCTGCACCTTCTCTGGGTTCTCACTCAGCACTAGTGGAGTGGGTGTGGGCTGGATCCGTCAGCCCC
CAGGAAAGGCCCTGGAGTGGCTTGCACTCATTTATTGGAATGATGATAAGCGCTACAGCCCATCTCTGAAGA
......@@ -255,7 +255,7 @@ AGCCCTATAGTGAGTCGTATTA
#target 1202LH
>IGHV3-15*01 13/TTTTCCCC/15 IGHD3-22*01 8/GCCC/4 IGHJ4*02 [IGH] BUG
>IGHV3-15*01 13/TTTTCCCC/15 IGHD3-22*01 8/GCCC/4 IGHJ4*02 [IGH]
CGCCTGGATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTTGGCCGTATTA
AAAGCAAAACTGATGGTGGGACAACAGACTACGCTGCACCCGTGAAAGGCAGATTCACCATC
TCAAGAGATGATTCAAAAAACACGCTGTATCTGCAAATGAACAGCCTGAAAACCGAGGACAC
......
......@@ -214,7 +214,7 @@ TTTCCCTGGG
#target 1119JC
>TRDV2 7/CAGGG/1 TRDD3 4/GCCC/4 TRAJ29 [TRA+D]
>TRDV2 7/CAGGG/1 TRDD3 4/GCCC/4 TRAJ29 [TRA+D] BUG
TGCAAAGAACCTGGCTGTACTTAAGATACTTGCACCATCAGAGAGAGATGAAGGGTCTTACTACTGTGCCCA
GGGCTGGGGGAGCCCTTCAGGAAACACACCTCTTGTCTTTGGAAAGGGCACAAGACTTTCTGTGATTGCAAG
TAAGTGTTTCTAGCCA
......
......@@ -7,7 +7,7 @@ GGTGCAGATCTCCTGCAGGCTTCTGGATACACCTTCACCGGCTACTATATGCACTGGGTGCGACAGGCCCCTGGACAAGG
GCCTGGGATCATTTAGCAGCTATGCCATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAGCTATTAGTGGTAGTGGTGGTAGCACATACTACGCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCCGTATACGTCGCCCTTTAGCAGCTCGTAGGATCTTTTTGACTACTGGGACCAGGGAACCCTGGTCACCGTCTCCTCAGGTA
#1473 vh3(l)
>IGHV3-53*01 2/9/10 IGHD3-10*01 4/8/1 IGHJ6*02 [IGH]
>IGHV3-53*01 2/9/10 IGHD3-10*01 4/8/1 IGHJ6*02 [IGH] BUG
GCGGGTCGTCAGTAGCACTACATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAGTTATTTATAGCGGTGGTAGCACATACTACGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTATCTTCAAATGAACAGCCTGAGAGCCGAGGACACGGCCGTGTATTACTGTGCGAGACGTTTGTTGGGTTCGGGGAGTTATTAACCTTGGGTTACTACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAGGTA
#1473 vh4
......@@ -152,7 +152,7 @@ GCGGGATCCTTCGTAGCTACGACATGCACTGGGTCCGCCAAGCTACAGGAAAAGGTCTGGAGTGGGTCTCAGCTATTGGT
AGCTCTGCTGATTTGTGGGCACTTATGAACCCGAAAGGACATGGCCATGGGGTGGGTAGGGACATAGGGACAGATGCCAGCCTGAGGTGGAGCCTCAGGACACAGGTGGGCACGGACACTATCCACATAAGCGAGGGATAGACCCGAGTGTCCCCACAGCAGACCTGAGAGCGCTGGGCCCACAGCCTCCCCTCAGAGCCCTGCTGCCTCCTCCGGTCAGCCCTGGACATCCCAGGTTTCCCCAGGCCTGCCGGTAGGTTTAGAATGAGGTCTGTGTCACTGTGGTATTACGATATTTTGACTGGTTATTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGTA
#1349 vh3
>(IGHV3-23*01, IGHV3-23D*01) 1/15/5 IGHD3-10*01 0/2/0 IGHJ3*02 [IGH]
>(IGHV3-23*01, IGHV3-23D*01) 1/15/5 IGHD3-10*01 0/2/0 IGHJ3*02 [IGH] BUG
CGCCCCACTTGATATTCCTTTTAGCCAGCTATGCCATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAGCTATTAGTGGTAGTGGTGGTAGCACATACTACGCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCCGTATATTACTGTGCGAAAGGGTGGGAGGTATTAAACTATGGTTCGGGGAGTTATTATAACGTTGATGCTTTTGATATCTGGGGCCAAGGGACAATGGTCACCGTCTCCTCAGGTAACTC
#1347 vh2
......
......@@ -359,10 +359,9 @@ int main (int argc, char **argv)
"w-mer size used for the length of the extracted window ('" NO_LIMIT "': use all the read, no window clustering)")
-> group(group) -> level() -> transform(string_NO_LIMIT);
double expected_value = THRESHOLD_NB_EXPECTED;
app.add_option("--e-value,-e", expected_value,
"maximal e-value for determining if a V-J segmentation can be trusted", true)
double expected_value_kmer = NO_LIMIT_VALUE;
app.add_option("--e-value-kmer", expected_value_kmer,
"maximal e-value for the k-mer heuristics ('" NO_LIMIT "': use same value than '-e')", true)
-> group(group) -> level() -> transform(string_NO_LIMIT);
int trim_sequences = DEFAULT_TRIM;
......@@ -483,6 +482,11 @@ int main (int argc, char **argv)
// ----------------------------------------------------------------------------------------------------------------------
group = "Clone analysis (second pass)";
double expected_value = THRESHOLD_NB_EXPECTED;
app.add_option("--e-value,-e", expected_value,
"maximal e-value for determining if a V-J segmentation can be trusted", true)
-> group(group) -> transform(string_NO_LIMIT);
Cost segment_cost = DEFAULT_SEGMENT_COST ;
app.add_option("--analysis-cost",
[&segment_cost](CLI::results_t res) {
......@@ -784,7 +788,7 @@ int main (int argc, char **argv)
if (command == CMD_SEGMENT)
{
cout << endl
<< "* WARNING: " << PROGNAME << " was run with '-c" COMMAND_SEGMENT "' option" << endl ;
<< "* WARNING: " << PROGNAME << " was run with '-c " COMMAND_SEGMENT "' option" << endl ;
}
if (max_clones == NO_LIMIT_VALUE || max_clones > WARN_MAX_CLONES || command == CMD_SEGMENT)
......@@ -920,6 +924,10 @@ int main (int argc, char **argv)
// Number of reads for e-value computation
unsigned long long nb_reads_for_evalue = (expected_value > NO_LIMIT_VALUE) ? nb_sequences_in_file(f_reads, true) : 1 ;
if (expected_value_kmer == NO_LIMIT_VALUE)
{
expected_value_kmer = expected_value;
}
//////////////////////////////////
//$$ Read sequence files
......@@ -1108,7 +1116,7 @@ int main (int argc, char **argv)
WindowsStorage *windowsStorage = we.extract(reads, wmer_size,
windows_labels, only_labeled_windows,
keep_unsegmented_as_clone,
expected_value, nb_reads_for_evalue,
expected_value_kmer, nb_reads_for_evalue,
readScorer);
windowsStorage->setIdToAll();
size_t nb_total_reads = we.getNbReads();
......@@ -1396,7 +1404,7 @@ int main (int argc, char **argv)
// Re-launch also a KmerMultiSegmenter, for control purposes (affectations, evalue)
KmerMultiSegmenter kmseg(representative, multigermline, 0, expected_value, nb_reads_for_evalue);
KmerMultiSegmenter kmseg(representative, multigermline, 0, expected_value_kmer, nb_reads_for_evalue);
KmerSegmenter *kseg = kmseg.the_kseg ;
if (verbose)
cout << "KmerSegmenter: " << kseg->getInfoLine() << endl;
......@@ -1440,10 +1448,15 @@ int main (int argc, char **argv)
// FineSegmenter
size_t nb_fine_segmented = (size_t) max_clones; // When -1, it will become the max value.
nb_fine_segmented = MIN(nb_fine_segmented, sort_clones.size());
FineSegmenter seg(representative, segmented_germline, segment_cost, expected_value, nb_fine_segmented, kmer_threshold, alternative_genes);
// The multiplier takes into account the expected_value_kmer.
// When --e-value-kmer is not set, the multiplier is 1.0. See #3594.
double fine_evalue_multiplier = MIN(expected_value_kmer, nb_fine_segmented);
FineSegmenter seg(representative, segmented_germline, segment_cost, expected_value, fine_evalue_multiplier, kmer_threshold, alternative_genes);
if (segmented_germline->seg_method == SEG_METHOD_543)
seg.FineSegmentD(segmented_germline, several_D, expected_value_D, nb_fine_segmented);
seg.FineSegmentD(segmented_germline, several_D, expected_value_D, fine_evalue_multiplier);
if (detect_CDR3)
seg.findCDR3();
......@@ -1643,6 +1656,9 @@ int main (int argc, char **argv)
Germline *not_segmented = new Germline(PSEUDO_NOT_ANALYZED, PSEUDO_NOT_ANALYZED_CODE);
// Multiplier is 1.0, we expect that the sequences are actual recombinations. See #3594.
double fine_evalue_multiplier = 1.0 ;
while (reads->hasNext())
{
nb++;
......@@ -1653,7 +1669,7 @@ int main (int argc, char **argv)
KmerSegmenter *seg = kmseg.the_kseg ;
Germline *germline = seg->segmented_germline ;
FineSegmenter s(seq, germline, segment_cost, expected_value, nb_reads_for_evalue, kmer_threshold, alternative_genes);
FineSegmenter s(seq, germline, segment_cost, expected_value, fine_evalue_multiplier, kmer_threshold, alternative_genes);
string id = string_of_int(nb, 6);
CloneOutput *clone = new CloneOutput();
......@@ -1670,7 +1686,7 @@ int main (int argc, char **argv)
nb_segmented++ ;
if (germline->seg_method == SEG_METHOD_543)
s.FineSegmentD(germline, several_D, expected_value_D, nb_reads_for_evalue);
s.FineSegmentD(germline, several_D, expected_value_D, fine_evalue_multiplier);
if (detect_CDR3)
s.findCDR3();
......
......@@ -92,6 +92,6 @@ GAAtcGAtAttGCAAAGAACCtGGCtGtACttAAGatACttGCACCAtCAGaGaGaGAtGAAGgGtCttACtACtGtGCC
# 0444-lil-TRA+D
>TRDV1*01 5//3 TRDD2*01 1//7 TRAJ29*01 [TRA+D] BUG
>TRDV1*01 5//3 TRDD2*01 1//7 TRAJ29*01 [TRA+D]
AGCGGGTGGTGATGGCAAATGCCAAGGAAGGGAAAAGGAAGAAGAGGGTTTTTATACTGATGTGTTTCATTGTGGGCACAGTGCTACAAAACCTACAGAGACCTGTACAAAAACTGCAGGGGCAAAAGTGCCATTTCCCTGGGATATCCTCACCCTGGTCCCAATGCAAAAAGTGGTCGCTATTCTGTCAACTTCAAGAAAGCAGCGAAATCCGTCGCCTTAACCATTTCAGCCTTACAGCTAGAAGATTCAG
CAAAGTACTTTTGTGCTCTTGGGTCCTAAGGAAACACACCTCTTGTCTTTGGAAAGGGCACAAGACTTTCTGTGATTGCAAGTAAGTGTTTCTAGCCATCCTTGATTTTGATCAGCAATGGCTTCTTCCCTTGAATTATTTTTCAGTGTACCTAGAATGCTTTTGCC
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment