Attention une mise à jour du serveur va être effectuée le lundi 17 mai entre 13h et 13h30. Cette mise à jour va générer une interruption du service de quelques minutes.

Commit ddb45bcd authored by Mathieu Giraud's avatar Mathieu Giraud

Merge branch 'feature/species' into 'dev'

Feature/species

Closes #1987

See merge request !4
parents 74fdac65 41096892
......@@ -231,6 +231,7 @@ ostream &operator<<(ostream &out, const Germline &germline)
/**
 * Builds an empty collection of germlines.
 *
 * @param indexType                 kind of k-mer index, stored as-is in the object
 * @param _one_index_per_germline   stored in 'one_index_per_germline'
 *
 * The species fields start with placeholder values; build_from_json()
 * overwrites them with the "species" and "species_taxon_id" entries of the
 * germline file it reads.
 */
MultiGermline::MultiGermline(IndexTypes indexType, bool _one_index_per_germline):indexType(indexType)
{
  species = "custom germlines" ;
  // Give the taxon id a defined value: it is printed by operator<< and copied
  // into the main JSON output even when build_from_json() was never called,
  // and reading an uninitialized int there is undefined behavior.
  species_taxon_id = 0 ;
  index = NULL;
  one_index_per_germline = _one_index_per_germline;
}
......@@ -261,7 +262,12 @@ void MultiGermline::build_from_json(string path, string json_filename, int filte
string content( (std::istreambuf_iterator<char>(germline_data) ),
(std::istreambuf_iterator<char>() ) );
json j = json::parse(content);
json germlines = json::parse(content);
species = germlines["species"].get<std::string>();
species_taxon_id = germlines["species_taxon_id"];
json j = germlines["systems"];
//for each germline
for (json::iterator it = j.begin(); it != j.end(); ++it) {
......@@ -370,6 +376,8 @@ void MultiGermline::mark_cross_germlines_as_ambiguous()
ostream &operator<<(ostream &out, const MultiGermline &multigermline)
{
out << multigermline.species << " (" << multigermline.species_taxon_id << ")" << endl ;
for (list<Germline*>::const_iterator it = multigermline.germlines.begin(); it != multigermline.germlines.end(); ++it)
{
Germline *germline = *it ;
......
......@@ -131,6 +131,9 @@ class MultiGermline {
bool one_index_per_germline;
list <Germline*> germlines;
string species;
int species_taxon_id;
// A unique index can be used
IKmerStore<KmerAffect> *index;
......
!LAUNCH: $VIDJIL_DIR/vidjil $VIDJIL_DEFAULT_OPTIONS -A -g $VIDJIL_DIR/data/chimera-fake-VJ-germlines.data -i $VIDJIL_DIR/data/chimera-fake-VJ.fa
!LAUNCH: $VIDJIL_DIR/vidjil $VIDJIL_DEFAULT_OPTIONS -A -g $VIDJIL_DIR/data/chimera-fake-VJ.g -i $VIDJIL_DIR/data/chimera-fake-VJ.fa
# Testing a custom (fake) germlines.data
$ Report the species
1: Fake .123.
$ Loads from 'chimera-fake-VJ-germlines.data' a custom germline
1: Y-.* l14 k13
......
!LAUNCH: $VIDJIL_DIR/vidjil $VIDJIL_DEFAULT_OPTIONS -g $VIDJIL_DIR/germline -g $VIDJIL_DIR/germline/isotypes.data -i $VIDJIL_DIR/data/isotypes.fa
!LAUNCH: $VIDJIL_DIR/vidjil $VIDJIL_DEFAULT_OPTIONS -g $VIDJIL_DIR/germline -g $VIDJIL_DIR/germline/isotypes.g -i $VIDJIL_DIR/data/isotypes.fa
$ Report the correct species
1: Homo sapiens .9606.
$ Segment the three reads
1: junction detected in 3 reads
......
!REQUIRES: python $VIDJIL_DIR/tools/check_python_version.py
!LAUNCH: $VIDJIL_DIR/vidjil $VIDJIL_DEFAULT_OPTIONS -3 -z 1 -G $VIDJIL_DIR/germline/IGH -w 60 -r 5 -e 10 -b data $VIDJIL_DIR/data/Stanford_S22.fasta > /dev/null ; cat out/data.vidjil | python $VIDJIL_DIR/tools/format_json.py -1
$ Custom germlines
1:"species": "custom germlines"
$ Number of reads
e1:"total": [13153]
......
!LAUNCH: $VIDJIL_DIR/vidjil $VIDJIL_DEFAULT_OPTIONS -e 10 -z 0 -V $VIDJIL_DIR/germline/IGHV.fa -D $VIDJIL_DIR/germline/IGHD.fa -J $VIDJIL_DIR/germline/IGHJ.fa -s \\\\#\\\\#\\\\#\\\\#\\\\#\\\\#-\\\\#\\\\#\\\\#\\\\#\\\\#\\\\# $VIDJIL_DIR/data/Stanford_S22.fasta
$ Germlines are custom
1: custom germlines
$ Parses IGHV.fa germline
1: 102221 bp in 350 sequences
......
!LAUNCH: $VIDJIL_DIR/vidjil $VIDJIL_DEFAULT_OPTIONS -g $VIDJIL_DIR/germline/Makefile 2>&1
!EXIT_CODE: 1
$ Error, incorrect *.g
1:error.* cannot properly read
......@@ -69,7 +69,7 @@
//$$ #define (mainly default options)
#define DEFAULT_MULTI_GERMLINE_PATH "germline/"
#define DEFAULT_MULTI_GERMLINE_FILE "germlines.data"
#define DEFAULT_MULTI_GERMLINE_FILE "homo-sapiens.g"
#define DEFAULT_READ_HEADER_SEPARATOR " "
#define DEFAULT_READS "./data/Stanford_S22.fasta"
......@@ -163,9 +163,9 @@ void usage(char *progname, bool advanced)
<< " -D <file> D germline multi-fasta file (and resets -m and -w options), will segment into V(D)J components" << endl
<< " -J <file> J germline multi-fasta file" << endl
<< " -G <prefix> prefix for V (D) and J repertoires (shortcut for -V <prefix>V.fa -D <prefix>D.fa -J <prefix>J.fa) (basename gives germline code)" << endl
<< " -g <path> multiple locus/germlines. In the path <path>, takes 'germlines.data' to select locus and parameters" << endl
<< " Selecting '-g germline' processes TRA, TRB, TRG, TRD, IGH, IGK and IGL locus, possibly with some incomplete/unusal recombinations" << endl
<< " A different 'germlines.data' file can also be provided with -g <file>" << endl
<< " -g <path> multiple locus/germlines. In the path <path>, takes '" << DEFAULT_MULTI_GERMLINE_FILE << "' to select locus and parameters" << endl
<< " Selecting '-g germline' processes human TRA, TRB, TRG, TRD, IGH, IGK and IGL locus, possibly with some incomplete/unusal recombinations" << endl
<< " Files different than '" << DEFAULT_MULTI_GERMLINE_FILE << "', for example for other species, can also be provided with -g <file>" << endl
<< endl
<< "Locus/recombinations" << endl
......@@ -874,7 +874,14 @@ int main (int argc, char **argv)
if (multi_germline)
{
for (pair <string, string> path_file: multi_germline_paths_and_files)
multigermline->build_from_json(path_file.first, path_file.second, GERMLINES_REGULAR, trim_sequences);
{
try {
multigermline->build_from_json(path_file.first, path_file.second, GERMLINES_REGULAR, trim_sequences);
} catch (std::exception& e) {
cerr << ERROR_STRING << "Vidjil cannot properly read " << path_file.first << "/" << path_file.second << ": " << e.what() << endl;
exit(1);
}
}
}
else
{
......@@ -930,7 +937,7 @@ int main (int argc, char **argv)
multigermline->mark_cross_germlines_as_ambiguous();
multigermline->finish();
cout << "Germlines loaded" << endl ;
cout << "Germlines loaded: " ;
cout << *multigermline ;
cout << endl ;
......@@ -1570,6 +1577,9 @@ int main (int argc, char **argv)
// Complete main json output
j["species"] = multigermline->species ;
j["species_taxon_id"] = multigermline->species_taxon_id ;
j["diversity"] = jsonDiversity ;
j["samples"]["log"] = { stream_segmentation_info.str() } ;
j["reads"] = {
......
{
"species": "Fake",
"species_taxon_id": 123,
"systems": {
"Y-Vb/Jg": {
"shortcut": "y",
"description": "Fake chimera locus",
......@@ -26,4 +30,5 @@
"seed": "12s"
}
}
}
}
\ No newline at end of file
......@@ -228,9 +228,9 @@ Germline databases (at least one -V/(-D)/-J, or -G, or -g option must be given f
-D <file> D germline multi-fasta file (and resets -m and -w options), will segment into V(D)J components
-J <file> J germline multi-fasta file
-G <prefix> prefix for V (D) and J repertoires (shortcut for -V <prefix>V.fa -D <prefix>D.fa -J <prefix>J.fa) (basename gives germline code)
-g <path> multiple locus/germlines. In the path <path>, takes 'germlines.data' to select locus and parameters
Selecting '-g germline' processes TRA, TRB, TRG, TRD, IGH, IGK and IGL locus, possibly with some incomplete/unusal recombinations
A different 'germlines.data' file can also be provided with -g <file>
-g <path> multiple locus/germlines. In the path <path>, takes 'homo-sapiens.g' to select locus and parameters
Selecting '-g germline' processes human TRA, TRB, TRG, TRD, IGH, IGK and IGL locus, possibly with some incomplete/unusal recombinations
Files different than 'homo-sapiens.g', for example for other species, can also be provided with -g <file>
Locus/recombinations
-d try to detect several D (experimental)
......@@ -245,14 +245,14 @@ Locus/recombinations
Using =-g germline/ -i= also tests some incomplete and unusual recombinations (loci with a =+= in their name),
and using =-g germline/ -i -2= further tests unexpected recombinations (tagged as =xxx=).
See [[http://git.vidjil.org/blob/master/doc/locus.org][locus.org]] for information on the analyzable locus.
- Analyzed locus and parameters are configured through the =germline/germlines.data= file.
- Analyzed locus and parameters are configured through the =germline/homo-sapiens.g= file.
A =germline/isotypes.g= file is provided to look for sequences with, on one side, IGHJ (or even IGHV) genes,
and, on the other side, an IGH constant chain.
To select a custom set of TR or Ig locus, you may copy =germline/germlines.data= into a new file,
as for example =germline/custom-germlines.data=, and run Vidjil with =-g germline/custom-germlines.data -i -2=.
- Several =-g= options can be used, as for instance =-g germline -g germline/isotypes.data=.
To select a custom set of TR or Ig locus, you may copy =germline/homo-sapiens.g= into a new file,
as for example =germline/custom.g=, and run Vidjil with =-g germline/custom.g -i -2=.
- Several =-g= options can be used, as for instance =-g germline -g germline/isotypes.g=.
- One can use other germline sequences possibly by defining another
=germlines.data= file that would refer to an alternative germline set or by
=.g= file that would refer to an alternative germline set or by
overwriting the existing germline sequences (in the FASTA file).
** Main algorithm parameters
......
{
"TRA": {
"shortcut": "A",
"color" : "#268bd2",
"description": "Human T-cell receptor, alpha locus (14q11.2)",
"recombinations": [ {
"5": ["TRAV.fa"],
"3": ["TRAJ.fa"]
} ],
"parameters": {
"seed": "13s"
}
},
"TRB": {
"shortcut": "B",
"color" : "#cb4b16",
"description": "Human T-cell receptor, beta locus (7q34)",
"recombinations": [ {
"5": ["TRBV.fa"],
"4": ["TRBD.fa"],
"3": ["TRBJ.fa"]
} ],
"parameters": {
"seed": "12s"
}
},
"TRB+": {
"shortcut": "b",
"color" : "#eb6b36",
"description": "Human T-cell receptor, beta locus (7q34), incomplete Db-Jb recombinations",
"follows": "TRB",
"recombinations": [ {
"5": ["TRBD_upstream.fa"],
"3": ["TRBJ.fa"]
} ],
"parameters": {
"seed": "12s"
}
},
"TRG": {
"shortcut": "G",
"color" : "#dc322f",
"description": "Human T-cell receptor, gamma locus (7p14)",
"recombinations": [ {
"5": ["TRGV.fa"],
"3": ["TRGJ.fa"]
} ],
"parameters": {
"seed": "10s"
}
},
"TRD": {
"shortcut": "D",
"color" : "#b58900",
"description": "Human T-cell receptor, delta locus (14q11.2)",
"recombinations": [ {
"5": ["TRDV.fa"],
"4": ["TRDD.fa"],
"3": ["TRDJ.fa"]
} ],
"parameters": {
"seed": "10s"
}
},
"TRA+D": {
"shortcut": "a",
"color" : "#46abf2",
"description": "Human T-cell receptor, alpha/delta locus (14q11.2)",
"recombinations": [ {
"5": ["TRDV.fa"],
"4": ["TRDD.fa"],
"3": ["TRAJ.fa"]
}, {
"5": ["TRDD_upstream.fa"],
"3": ["TRAJ.fa"]
} ],
"parameters": {
"seed": "13s"
}
},
"TRD+": {
"shortcut": "d",
"color" : "#d5a920",
"description": "Human T-cell receptor, delta locus (14q11.2), incomplete Dd2-Dd3 recombinations",
"follows": "TRD",
"recombinations": [ {
"5": ["TRDV.fa"],
"3": ["TRDD3_downstream.fa"]
}, {
"5": ["TRDD2_upstream.fa"],
"4": ["TRDD.fa"],
"3": ["TRDJ.fa"]
}, {
"5": ["TRDD2_upstream.fa"],
"3": ["TRDD3_downstream.fa"]
} ],
"parameters": {
"seed": "9s"
}
},
"IGH": {
"shortcut": "H",
"color" : "#6c71c4",
"description": "Human immunoglobulin, heavy locus (14q32.33)",
"recombinations": [ {
"5": ["IGHV.fa"],
"4": ["IGHD.fa"],
"3": ["IGHJ.fa"]
} ],
"parameters": {
"seed": "12s"
}
},
"IGH+": {
"shortcut": "h",
"color" : "#8c91e4",
"description": "Human immunoglobulin, heavy locus (14q32.33), incomplete Dh-Jh recombinations",
"follows": "IGH",
"recombinations": [ {
"5": ["IGHD_upstream.fa"],
"3": ["IGHJ.fa"]
} ],
"parameters": {
"seed": "12s"
}
},
"IGK": {
"shortcut": "K",
"color" : "#2aa198",
"description": "Human immunoglobulin, kappa locus (2p11.2)",
"recombinations": [ {
"5": ["IGKV.fa"],
"3": ["IGKJ.fa"]
} ],
"parameters": {
"seed": "10s"
}
},
"IGK+": {
"shortcut": "k",
"color" : "#4ac1a8",
"description": "Human immunoglobulin, kappa locus (2p11.2), Vk-KDE and Intron-KDE recombinations",
"follows": "IGK",
"recombinations": [ {
"5": ["IGKV.fa", "IGK-INTRON.fa"],
"3": ["IGK-KDE.fa"]
} ],
"parameters": {
"seed": "10s"
}
},
"IGL": {
"shortcut": "L",
"color" : "#d33682",
"description": "Human immunoglobulin, lambda locus (22q11.2)",
"recombinations": [ {
"5": ["IGLV.fa"],
"3": ["IGLJ.fa"]
} ],
"parameters": {
"seed": "10s"
}
}
}
{
"species": "Homo sapiens",
"species_taxon_id": 9606,
"systems": {
"TRA": {
"shortcut": "A",
"color" : "#268bd2",
"description": "Human T-cell receptor, alpha locus (14q11.2)",
"recombinations": [ {
"5": ["TRAV.fa"],
"3": ["TRAJ.fa"]
} ],
"parameters": {
"seed": "13s"
}
},
"TRB": {
"shortcut": "B",
"color" : "#cb4b16",
"description": "Human T-cell receptor, beta locus (7q34)",
"recombinations": [ {
"5": ["TRBV.fa"],
"4": ["TRBD.fa"],
"3": ["TRBJ.fa"]
} ],
"parameters": {
"seed": "12s"
}
},
"TRB+": {
"shortcut": "b",
"color" : "#eb6b36",
"description": "Human T-cell receptor, beta locus (7q34), incomplete Db-Jb recombinations",
"follows": "TRB",
"recombinations": [ {
"5": ["TRBD_upstream.fa"],
"3": ["TRBJ.fa"]
} ],
"parameters": {
"seed": "12s"
}
},
"TRG": {
"shortcut": "G",
"color" : "#dc322f",
"description": "Human T-cell receptor, gamma locus (7p14)",
"recombinations": [ {
"5": ["TRGV.fa"],
"3": ["TRGJ.fa"]
} ],
"parameters": {
"seed": "10s"
}
},
"TRD": {
"shortcut": "D",
"color" : "#b58900",
"description": "Human T-cell receptor, delta locus (14q11.2)",
"recombinations": [ {
"5": ["TRDV.fa"],
"4": ["TRDD.fa"],
"3": ["TRDJ.fa"]
} ],
"parameters": {
"seed": "10s"
}
},
"TRA+D": {
"shortcut": "a",
"color" : "#46abf2",
"description": "Human T-cell receptor, alpha/delta locus (14q11.2)",
"recombinations": [ {
"5": ["TRDV.fa"],
"4": ["TRDD.fa"],
"3": ["TRAJ.fa"]
}, {
"5": ["TRDD_upstream.fa"],
"3": ["TRAJ.fa"]
} ],
"parameters": {
"seed": "13s"
}
},
"TRD+": {
"shortcut": "d",
"color" : "#d5a920",
"description": "Human T-cell receptor, delta locus (14q11.2), incomplete Dd2-Dd3 recombinations",
"follows": "TRD",
"recombinations": [ {
"5": ["TRDV.fa"],
"3": ["TRDD3_downstream.fa"]
}, {
"5": ["TRDD2_upstream.fa"],
"4": ["TRDD.fa"],
"3": ["TRDJ.fa"]
}, {
"5": ["TRDD2_upstream.fa"],
"3": ["TRDD3_downstream.fa"]
} ],
"parameters": {
"seed": "9s"
}
},
"IGH": {
"shortcut": "H",
"color" : "#6c71c4",
"description": "Human immunoglobulin, heavy locus (14q32.33)",
"recombinations": [ {
"5": ["IGHV.fa"],
"4": ["IGHD.fa"],
"3": ["IGHJ.fa"]
} ],
"parameters": {
"seed": "12s"
}
},
"IGH+": {
"shortcut": "h",
"color" : "#8c91e4",
"description": "Human immunoglobulin, heavy locus (14q32.33), incomplete Dh-Jh recombinations",
"follows": "IGH",
"recombinations": [ {
"5": ["IGHD_upstream.fa"],
"3": ["IGHJ.fa"]
} ],
"parameters": {
"seed": "12s"
}
},
"IGK": {
"shortcut": "K",
"color" : "#2aa198",
"description": "Human immunoglobulin, kappa locus (2p11.2)",
"recombinations": [ {
"5": ["IGKV.fa"],
"3": ["IGKJ.fa"]
} ],
"parameters": {
"seed": "10s"
}
},
"IGK+": {
"shortcut": "k",
"color" : "#4ac1a8",
"description": "Human immunoglobulin, kappa locus (2p11.2), Vk-KDE and Intron-KDE recombinations",
"follows": "IGK",
"recombinations": [ {
"5": ["IGKV.fa", "IGK-INTRON.fa"],
"3": ["IGK-KDE.fa"]
} ],
"parameters": {
"seed": "10s"
}
},
"IGL": {
"shortcut": "L",
"color" : "#d33682",
"description": "Human immunoglobulin, lambda locus (22q11.2)",
"recombinations": [ {
"5": ["IGLV.fa"],
"3": ["IGLJ.fa"]
} ],
"parameters": {
"seed": "10s"
}
}
}
}
{
"IgVC": {
"shortcut": "C",
"color" : "#6c71c4",
"description": "Human immunoglobulin, heavy locus (14q32.33), with constant heavy chains",
"recombinations": [ {
"5": ["IGHV.fa"],
"4": ["IGHJ.fa"],
"3": ["IGHC=M.fa",
"IGHC=D.fa",
"IGHC=G3.fa",
"IGHC=G1.fa",
"IGHC=A1.fa",
"IGHC=G2.fa",
"IGHC=G4.fa",
"IGHC=E.fa",
"IGHC=A2.fa",
"IGHC=GP.fa"
]
} ],
"parameters": {
"seed": "12s"
}
},
"IgJC": {
"shortcut": "c",
"color" : "#8c91e4",
"description": "Human immunoglobulin, heavy locus (14q32.33), with constant heavy chains",
"recombinations": [ {
"5": ["IGHJ.fa"],
"3": ["IGHC=M.fa",
"IGHC=D.fa",
"IGHC=G3.fa",
"IGHC=G1.fa",
"IGHC=A1.fa",
"IGHC=G2.fa",
"IGHC=G4.fa",
"IGHC=E.fa",
"IGHC=A2.fa",
"IGHC=GP.fa"
]
} ],
"parameters": {
"seed": "12s"
}
}
}
{
"species": "Homo sapiens",
"species_taxon_id": 9606,
"systems": {
"IgVC": {
"shortcut": "C",
"color" : "#6c71c4",
"description": "Human immunoglobulin, heavy locus (14q32.33), with constant heavy chains",
"recombinations": [ {
"5": ["IGHV.fa"],
"4": ["IGHJ.fa"],
"3": ["IGHC=M.fa",
"IGHC=D.fa",
"IGHC=G3.fa",
"IGHC=G1.fa",
"IGHC=A1.fa",
"IGHC=G2.fa",
"IGHC=G4.fa",
"IGHC=E.fa",
"IGHC=A2.fa",
"IGHC=GP.fa"
]
} ],
"parameters": {
"seed": "12s"
}
},
"IgJC": {
"shortcut": "c",
"color" : "#8c91e4",
"description": "Human immunoglobulin, heavy locus (14q32.33), with constant heavy chains",
"recombinations": [ {
"5": ["IGHJ.fa"],
"3": ["IGHC=M.fa",
"IGHC=D.fa",
"IGHC=G3.fa",
"IGHC=G1.fa",
"IGHC=A1.fa",
"IGHC=G2.fa",
"IGHC=G4.fa",
"IGHC=E.fa",