Commit 01d68fed authored by Mikaël Salson's avatar Mikaël Salson

Merge branch 'feature-a/2138-finesegmenter-start-V-end-J' into 'dev'

Outputs also the start of V and the end of J

Closes #2138

See merge request !705
parents 4123a20b 825730c2
Pipeline #150836 passed with stages
in 15 minutes and 18 seconds
......@@ -81,18 +81,28 @@ string AlignBox::getSequence(string sequence) {
return sequence.substr(start, end-start+1);
}
bool AlignBox::CoverFirstPos()
{
return (start <= 0);
}
bool AlignBox::CoverLastPos()
{
return (end >= seq_length - 1);
}
void AlignBox::addToOutput(CloneOutput *clone, int alternative_genes) {
json j;
j["name"] = ref_label;
if (key != "3") // no end information for J
if (key != "3" || !CoverLastPos()) // end information for J
{
j["stop"] = end + 1;
j["delRight"] = del_right;
}
if (key != "5") // no start information for V
if (key != "5" || !CoverFirstPos()) // start information for V
{
j["start"] = start + 1;
j["delLeft"] = del_left;
......@@ -1121,10 +1131,6 @@ FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c,
seg_N = check_and_resolve_overlap(sequence_or_rc, 0, sequence_or_rc.length(),
box_V, box_J, segment_cost, reverse_V, reverse_J);
// Reset extreme positions
box_V->start = 0;
box_J->end = sequence.length()-1;
// Why could this happen ?
if (box_J->start>=(int) sequence.length())
box_J->start=sequence.length()-1;
......
......@@ -132,6 +132,12 @@ class AlignBox
*/
int getLength();
/**
* Return whether the box cover the first/last position od the reference sequence
*/
bool CoverFirstPos();
bool CoverLastPos();
/**
* Returns the position in the reference string corresponding to the position in the read
* Preliminary implementation, only works for the start of V and J boxes
......
# This synthetic sequence ends after the end of the J gene,
# including a stop codon in all frames in that zone
# end of the J gene @90
>TRGV9*01 0/CAT/0 TRGJP*01 0/17
acctactactgtgccttgtgggaggtgCAT
tgggcaagagttgggcaaaaaaatcaaggtatttggtcccggaacaaagcttatcattacag
cccTAGcTAGcTAGccc
# These two recombinations start before the start of the V gene, including a stop codon in that zone.
# They should be productive
# start of the V gene @133
# end of the J gene @534
>132/0 IGHV1-69*06 2/AACCCCCAACAAAGC/4 IGHD3-3*01 9/CCCATAAGTACCGT/1 IGHJ6*02 40/GGTAAG [IGH]
CTGGAGGTTCCTCTTTGTGGTGGCAGCAGCTACAGGTAAGGGGCTTCCTAGTCCTAAGGCTGAGGAAGGGATCCTGGTTTAGTTAAAGAGGATTTTATTCACCCCTGTGTCCTGTCCACAGGTGTCCAGTCC
CAGGTGCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGTCCTCGGTGAAGGTCTCCTGCAAGGCTTCTGGAGGCACCTTCAGCAGCTATGCTATCAGCTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGGGAGGGATCATCCCTATCTTTGGTACAGCAAACTACGCACAGAAGTTCCAGGGCAGAGTCACGATTACCGCGGACAAATCCACGAGCACAGCCTACATGGAGCTGAGCAGCCTGAGATCTGAGGACACGGCCGTGTATTACTGTGCGAGAAACCCCCAACAAAGCTACGATTTTTGGAGTGGTCCCATAAGTACCGTTTACTACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAG
GTAAG
# start of the V gene @146
>145/0 IGHV3-74*01 0/GACCG/6 IGHD1-7*01 0/GC/10 IGHJ4*02 [IGH]
AGCTGGGTTTTCCTTGTTGCTATTTTAAAAGGTGATTCATGGAGAATTGGAGATGTGGAGTGTGAATGGACATGGGTGAGATAAGCAGTGGATGTGTGTGGCAGTTTCTGACCAGGGTGTCTCTGTGTTTGCAGGTGTCTCGTGT
GAGGTGCTTCTGGTGGAGTCCGGGGGAGGCTTAGTTAAGGCGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAATTACTGGATGCACTGGGTCCGCCAAGTTCCAGGGAAGGGCCTGGAGTGGGTCTCACGTATTAACAGCGATGGGAGCGCCATAAGGTACGCGGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGGACACACTGTATCTGCAAATGAACAGTCTGAGAGCCGAGGACACGGCTGTTTATTATTGTGCAAGAGAGACCGACTGGAATTACGCCTACTGGGGCCAGGGAACCCTGGTC
# actual sequence was continuing after J
# ... ACCGTCTCCT CAGGTAAG
......@@ -17,5 +17,7 @@ $ First sequence, easy segmentation (no error, few deletions at the windows, sma
# Note that a second D (D3-16*01) can be detected (6 common nucleotides, or even 7 with an overlap on the first D)
# This is tested in segment_simul.should-vdj
$ seq1_polyT is the same as seq1 but starts and ends with 10Ts, the recombination is in the middle. start position should not be 0 and end position should not be length-1; See #2138
f1:^>seq1_polyT \\+ VDJ 11 83 88 100 105 150 IGHV5-10-1.0[1-4] 2/CTTC/3 IGHD1-14.01 1/GTTA/5 IGHJ5.01
# seq1_polyT is the same as seq1 but starts and ends with 10Ts, the recombination is in the middle.
$ Sequence with polyT is properly designated
1:^>seq1_polyT \+ VDJ 11 83 88 100 105 150
1:^>seq1_polyT .* IGHV5-10-1.0[1-4] 2/CTTC/3 IGHD1-14.01 1/GTTA/5 IGHJ5.01
## Now a productive sequence, with a stop codon after a TRGJ gene
## We now use -J ../TRGJ.fa, see #3147
!LAUNCH: $VIDJIL_DIR/$EXEC -c designations -3 -V $VIDJIL_DIR/germline/homo-sapiens/TRGV.fa -J $VIDJIL_DIR/germline/homo-sapiens/TRGJ.fa $VIDJIL_DATA/productive_stop_after_J.fa
!OUTPUT_FILE: out/productive_stop_after_J.vidjil
$ Correct number of start positions (no V, but J, cdr3 and junction))
3: "start"
$ Correct stop position of TRGJP*01
1: "stop": 92
$ The sequence is productive
f1: "productive": true
!LAUNCH: $VIDJIL_DIR/$EXEC -c designations -3 -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH $VIDJIL_DATA/productive_stop_before_V.fa
!OUTPUT_FILE: out/productive_stop_before_V.vidjil
$ first clone name
1: "IGHV1-69.06 2/AACCCCCAACAAAGC/4 IGHD3-3.01 9/CCCATAAGTACCGT/1 IGHJ6.02"
$ second clone name
1: "IGHV3-74.01 0/GACCG/6 IGHD1-7.01 0/GC/10 IGHJ4.02"
# There are 10 = 2x5 segments (V, D, J, cdr3 and junction)
$ Correct number of start positions
10: "start"
$ Correct start position of IGHV1-69*06
1: "start": 133
$ Correct stop position of IGHJ6*02
1: "stop": 534
$ Correct start position of IGHV3-74*01
1: "start": 146
$ Correct number of stop positions (10-1, no stop positions for J on the second sequence)
9: "stop"
$ The two sequences are productive
f2: "productive": true
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment