Commit 992d06a7 authored by Marc Duez's avatar Marc Duez
Browse files
parents c9e8bf9f 7aaa26fd
......@@ -166,6 +166,39 @@ void MultiGermline::build_default_set(string path)
}
void MultiGermline::build_incomplete_set(string path)
{
// Should parse 'data/germlines.data'
Germline *germline;
// DH-JH
germline = new Germline("IGH+", 'h', path + "/IGHD.fa", "", path + "/IGHJ.fa", -10, 20);
germline->new_index("######-######");
germlines.push_back(germline);
// DD2-DD3
germline = new Germline("TRD+", 'd', path + "/TRDD2-01.fa", "", path + "/TRDJ.fa", -10, 20);
germline->new_index("#########");
germlines.push_back(germline);
germline = new Germline("TRD+", 'd', path + "/TRDV.fa", "", path + "/TRDD3-01.fa", -10, 20);
germline->new_index("#########");
germlines.push_back(germline);
germline = new Germline("TRD+", 'd', path + "/TRDD2-01.fa", "", path + "/TRDD3-01.fa", -10, 20);
germline->new_index("#########");
germlines.push_back(germline);
// IGK: KDE, INTRON
germline = new Germline("IGK", 'k', path + "/IGKV.fa", "", path + "/IGK-KDE.fa", -10, 80);
germline->new_index("#####-#####");
germlines.push_back(germline);
germline = new Germline("IGK", 'k', path + "/IGK-INTRON.fa", "", path + "/IGK-KDE.fa", -10, 80);
germline->new_index("#####-#####");
germlines.push_back(germline);
}
void MultiGermline::load_standard_set(string path)
{
......
......@@ -86,6 +86,7 @@ class MultiGermline {
void insert(Germline *germline);
void build_default_set(string path);
void build_incomplete_set(string path);
void load_standard_set(string path);
void insert_in_one_index(IKmerStore<KmerAffect> *_index);
......
......@@ -176,6 +176,7 @@ KmerSegmenter::KmerSegmenter(Sequence seq, MultiGermline *multigermline)
for (list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it)
{
Germline *germline = *it ;
segmented_germline = germline;
int s = (size_t)germline->index->getS() ;
int length = sequence.length() ;
......@@ -215,15 +216,12 @@ KmerSegmenter::KmerSegmenter(Sequence seq, MultiGermline *multigermline)
strand = 2;
}
// Are there enoguh V/J to assert that this was the correct germline (and thus that we won't test other ones) ?
detected = (nb_strand[0] + nb_strand[1] >= DETECT_THRESHOLD);
detected = false ;
computeSegmentation(strand, germline);
if (segmented)
{
// Yes, it is segmented
segmented_germline = germline;
germline->stats.insert(length);
reversed = (strand == -1);
......@@ -275,6 +273,11 @@ void KmerSegmenter::computeSegmentation(int strand, Germline* germline) {
max = kaa->getMaximum(KmerAffect(germline->affect_3, -1),
KmerAffect(germline->affect_5, -1));
// We labeled it detected if there were both enough affect_5 and enough affect_3
detected = (max.nb_before_left + max.nb_before_right >= DETECT_THRESHOLD)
&& (max.nb_after_left + max.nb_after_right >= DETECT_THRESHOLD);
if (! max.max_found) {
if ((strand == 1 && max.nb_before_left == 0)
|| (strand == -1 && max.nb_after_right == 0))
......
......@@ -17,7 +17,7 @@
strand and the other, to safely attribute a
segment to a given strand */
#define DETECT_THRESHOLD 10 /* If the number of total V/J affectations
#define DETECT_THRESHOLD 5 /* If the number of both V and J affectations
is above this threshold, then the sequence
will be labeled as 'detected', and, if it
not segmented, the remaining germlines will
......
......@@ -46,7 +46,7 @@ WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, MultiGermline *mult
} else if (out_unsegmented) {
*out_unsegmented << reads->getSequence();
*out_unsegmented << "#" << segmented_mesg[seg.getSegmentationStatus()] << endl;
*out_unsegmented << "#" << segmented_mesg[seg.getSegmentationStatus()] << " " << seg.segmented_germline->code << endl;
if (seg.getSegmentationStatus() != UNSEG_TOO_SHORT) {
*out_unsegmented << seg.getKmerAffectAnalyser()->toString() << endl;
}
......
......@@ -157,6 +157,14 @@ void testSegmentationCause() {
TAP_TEST(! ks.isSegmented(), TEST_KMER_IS_SEGMENTED, "fewJ: " << ks.getKmerAffectAnalyser()->toString());
TAP_TEST(ks.getSegmentationStatus() == UNSEG_TOO_FEW_J, TEST_KMER_SEGMENTATION_CAUSE, "");
nb_checked++;
} else if (data.read(i).label == "seq-fewV2") {
TAP_TEST(! ks.isSegmented(), TEST_KMER_IS_SEGMENTED, "");
TAP_TEST(ks.getSegmentationStatus() == UNSEG_TOO_FEW_V, TEST_KMER_SEGMENTATION_CAUSE, "");
nb_checked++;
} else if (data.read(i).label == "seq-fewJ2") {
TAP_TEST(! ks.isSegmented(), TEST_KMER_IS_SEGMENTED, "fewJ: " << ks.getKmerAffectAnalyser()->toString());
TAP_TEST(ks.getSegmentationStatus() == UNSEG_TOO_FEW_J, TEST_KMER_SEGMENTATION_CAUSE, "");
nb_checked++;
} else if (data.read(i).label == "seq-delta-min-old") {
// This test was a test for delta_min but with the CountKmerAffectAnalyser
// the read is segmented, now. So we keep it, but change the test
......@@ -190,7 +198,7 @@ void testSegmentationCause() {
}
}
TAP_TEST(nb_checked == 11, TEST_KMER_DATA, "");
TAP_TEST(nb_checked == 13, TEST_KMER_DATA, "");
delete multi;
}
......@@ -218,15 +226,15 @@ void testExtractor() {
WindowsStorage *ws = we.extract(&data, multi, 30, labels);
TAP_TEST(we.getNbReads() == 11, TEST_EXTRACTOR_NB_READS, "");
TAP_TEST(we.getNbReads() == 13, TEST_EXTRACTOR_NB_READS, "");
TAP_TEST(we.getNbSegmented(SEG_PLUS) == 4, TEST_EXTRACTOR_NB_SEGMENTED, "segPlus: " << we.getNbSegmented(SEG_PLUS));
TAP_TEST(we.getNbSegmented(SEG_MINUS) == 1, TEST_EXTRACTOR_NB_SEGMENTED, "");
TAP_TEST(we.getNbSegmented(UNSEG_TOO_SHORT) == 1, TEST_EXTRACTOR_NB_SEGMENTED, "");
TAP_TEST(we.getNbSegmented(UNSEG_STRAND_NOT_CONSISTENT) == 1, TEST_EXTRACTOR_NB_SEGMENTED, "");
TAP_TEST(we.getNbSegmented(UNSEG_TOO_FEW_ZERO) == 1, TEST_EXTRACTOR_NB_SEGMENTED, "");
TAP_TEST(we.getNbSegmented(UNSEG_TOO_FEW_V) == 1, TEST_EXTRACTOR_NB_SEGMENTED, "");
TAP_TEST(we.getNbSegmented(UNSEG_TOO_FEW_J) == 1, TEST_EXTRACTOR_NB_SEGMENTED, "");
TAP_TEST(we.getNbSegmented(UNSEG_TOO_FEW_V) == 2, TEST_EXTRACTOR_NB_SEGMENTED, "");
TAP_TEST(we.getNbSegmented(UNSEG_TOO_FEW_J) == 2, TEST_EXTRACTOR_NB_SEGMENTED, "");
TAP_TEST(we.getNbSegmented(UNSEG_BAD_DELTA_MIN) == 0, TEST_EXTRACTOR_NB_SEGMENTED, "");
TAP_TEST(we.getNbSegmented(UNSEG_BAD_DELTA_MAX) == 1, TEST_EXTRACTOR_NB_SEGMENTED, "");
TAP_TEST(we.getNbSegmented(TOTAL_SEG_BUT_TOO_SHORT_FOR_THE_WINDOW) == 2, TEST_EXTRACTOR_NB_SEGMENTED, "");
......@@ -237,8 +245,8 @@ void testExtractor() {
TAP_TEST(we.getAverageSegmentationLength(UNSEG_TOO_SHORT) == 4, TEST_EXTRACTOR_AVG_LENGTH, "");
TAP_TEST(we.getAverageSegmentationLength(UNSEG_STRAND_NOT_CONSISTENT) == 36, TEST_EXTRACTOR_AVG_LENGTH, "");
TAP_TEST(we.getAverageSegmentationLength(UNSEG_TOO_FEW_ZERO) == 36, TEST_EXTRACTOR_AVG_LENGTH, "");
TAP_TEST(we.getAverageSegmentationLength(UNSEG_TOO_FEW_V) == 36, TEST_EXTRACTOR_AVG_LENGTH, "");
TAP_TEST(we.getAverageSegmentationLength(UNSEG_TOO_FEW_J) == 36, TEST_EXTRACTOR_AVG_LENGTH, "");
TAP_TEST(we.getAverageSegmentationLength(UNSEG_TOO_FEW_V) == 51, TEST_EXTRACTOR_AVG_LENGTH, "");
TAP_TEST(we.getAverageSegmentationLength(UNSEG_TOO_FEW_J) == 55, TEST_EXTRACTOR_AVG_LENGTH, "");
TAP_TEST(we.getAverageSegmentationLength(UNSEG_BAD_DELTA_MAX) == 66, TEST_EXTRACTOR_AVG_LENGTH, "");
TAP_TEST(we.getAverageSegmentationLength(TOTAL_SEG_BUT_TOO_SHORT_FOR_THE_WINDOW) == 28.5, TEST_EXTRACTOR_AVG_LENGTH, "");
TAP_TEST(we.getAverageSegmentationLength(TOTAL_SEG_AND_WINDOW) == 48, TEST_EXTRACTOR_AVG_LENGTH, "");
......
!LAUNCH: ../../vidjil -A -g ../../germline ../../data/trd-dd2-dd3.fa
$ Segment only 2 reads, because there is no -i
1:segmented 2 reads .40..
!LAUNCH: ../../vidjil -A -i -g ../../germline ../../data/trd-dd2-dd3.fa
$ Segment all 5 reads, thanks to -i
1:segmented 5 reads .100..
......@@ -149,6 +149,7 @@ void usage(char *progname)
<< " -J <file> J germline multi-fasta file" << endl
<< " -G <prefix> prefix for V (D) and J repertoires (shortcut for -V <prefix>V.fa -D <prefix>D.fa -J <prefix>J.fa) (basename gives germline code)" << endl
<< " -g <path> multiple germlines (in the path <path>, takes TRA, TRB, TRG, TRD, IGH and IGL and sets window prediction parameters)" << endl
<< " -i multiple germlines, also incomplete rearrangements (experimental, must be used with -g)" << endl
<< endl
<< "Window prediction" << endl
......@@ -279,6 +280,7 @@ int main (int argc, char **argv)
bool output_segmented = false;
bool output_unsegmented = false;
bool multi_germline = false;
bool multi_germline_incomplete = false;
string multi_germline_file = DEFAULT_MULTIGERMLINE;
string forced_edges = "" ;
......@@ -294,7 +296,7 @@ int main (int argc, char **argv)
//$$ options: getopt
while ((c = getopt(argc, argv, "Ahag:G:V:D:J:k:r:vw:e:C:f:l:c:m:M:N:s:b:Sn:o:L%:y:z:uU")) != EOF)
while ((c = getopt(argc, argv, "Ahaig:G:V:D:J:k:r:vw:e:C:f:l:c:m:M:N:s:b:Sn:o:L%:y:z:uU")) != EOF)
switch (c)
{
......@@ -341,7 +343,11 @@ int main (int argc, char **argv)
multi_germline_file = string(optarg);
germline_system = "multi" ;
break;
case 'i':
multi_germline_incomplete = true;
break;
case 'G':
germline_system = string(optarg);
f_reps_V.push_back((germline_system + "V.fa").c_str()) ;
......@@ -676,6 +682,8 @@ int main (int argc, char **argv)
if (multi_germline)
{
multigermline->build_default_set(multi_germline_file);
if (multi_germline_incomplete)
multigermline->build_incomplete_set(multi_germline_file);
}
else if (command == CMD_GERMLINES)
{
......
......@@ -10,8 +10,12 @@ tgtgccacctgggacaggAGTTTCTTATAATAATTC
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>seq-fewV
AAAAAAAAAAAAAAAAAAGAATTATTATAAGAAACT
>seq-fewV2
AAAAAAAAAAAAAAAAAAGAATTATTATAAGAAACTCTTTGGCAGTGGAACAACACTGGTTGTCAC
>seq-fewJ
tgtgccacctgggacaggAAAAAAAAAAAAAAAAAA
>seq-fewJ2
tgcaaaatctaattaaaaatgattctgggttctattactgtgccacctgggacaggAAAAAAAAAAAAAAAAAA
>seq-delta-min-old
tgtgccacctgggacaggGAATTATTATAAGAAACTtgtgccacctgggacaggGAATTATTATAAGAAACT
>seq-delta-min
......
......@@ -49,8 +49,8 @@ germline_data = {
"shortcut": "d",
"description": "Human T-cell receptor, delta locus (14q11.2), incomplete Dd2-Dd3 rearrangements",
"follows": "TRD",
"5": ["TRDV.fa", "TRDD-d2.fa"],
"3": ["TRDJ.fa", "TRDD-d3.fa"]
"5": ["TRDV.fa", "TRDD2-01.fa"],
"3": ["TRDJ.fa", "TRDD3-01.fa"]
},
"IGH": {
......@@ -86,8 +86,8 @@ germline_data = {
"shortcut": "k",
"description": "Human immunoglobulin, kappa locus (2p11.2), Vk-KDE and Intron-KDE rearrangements",
"follows": "IGK",
"5": ["IGKV.fa", "INTRON.fa"],
"3": ["KDE.fa"]
"5": ["IGKV.fa", "IGK-INTRON.fa"],
"3": ["IGK-KDE.fa"]
},
"IGL": {
......
......@@ -6,5 +6,8 @@ wget -O - http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fas
python buildBrowserGermline.py *.fa germlines.data ../browser/js/germline.js
wget http://rbx.vidjil.org/browser/germline/IGK-INTRON.fa
wget http://rbx.vidjil.org/browser/germline/IGK-KDE.fa
wget -N http://rbx.vidjil.org/browser/germline/IGK-INTRON.fa
wget -N http://rbx.vidjil.org/browser/germline/IGK-KDE.fa
wget -q -O - 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=M22153&rettype=fasta&retmode=text&to=42' | sed '1s/>gb\|/>gb|TRDD2*01/' > TRDD2-01.fa
wget -q -O - 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=M22152&rettype=fasta&retmode=text&from=214&to=258' | sed '1s/>gb\|/>gb|TRDD3*01/' > TRDD3-01.fa
......@@ -29,8 +29,6 @@ def verbose_open_w(name):
# Create isolated files for some sequences
SPECIAL_SEQUENCES = [
'TRDD2*01',
'TRDD3*01',
]
for l in sys.stdin:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment