Commit 7113d43e authored by Mikaël Salson's avatar Mikaël Salson
Browse files

Merge branch 'feature-a/4271-update-germlines' into 'dev'

Update germlines to 2021-01-21

Closes #4660, #4725, and #4271

See merge request !885
parents fc18f0aa e5fa98b5
Pipeline #233239 failed with stages
in 3 minutes and 36 seconds
#include "filter.h" #include "filter.h"
#include "math.hpp" #include "math.hpp"
FilterWithACAutomaton::FilterWithACAutomaton(BioReader &origin, string seed) : originalBioReader(origin){ FilterWithACAutomaton::FilterWithACAutomaton(BioReader &origin, string seed, float keys_compress) : originalBioReader(origin){
this->filtered_sequences_nb = 0; this->filtered_sequences_nb = 0;
this->filtered_sequences_calls = 0; this->filtered_sequences_calls = 0;
buildACAutomatonToFilterBioReader(seed); buildACAutomatonToFilterBioReader(seed, keys_compress);
} }
FilterWithACAutomaton::~FilterWithACAutomaton(){ FilterWithACAutomaton::~FilterWithACAutomaton(){
...@@ -16,7 +16,7 @@ FilterWithACAutomaton::~FilterWithACAutomaton(){ ...@@ -16,7 +16,7 @@ FilterWithACAutomaton::~FilterWithACAutomaton(){
} }
} }
void FilterWithACAutomaton::buildACAutomatonToFilterBioReader(string seed){ void FilterWithACAutomaton::buildACAutomatonToFilterBioReader(string seed, float keys_compress){
char asciiChar; char asciiChar;
int asciiNumber; int asciiNumber;
string currentLabel; string currentLabel;
...@@ -32,12 +32,22 @@ void FilterWithACAutomaton::buildACAutomatonToFilterBioReader(string seed){ ...@@ -32,12 +32,22 @@ void FilterWithACAutomaton::buildACAutomatonToFilterBioReader(string seed){
asciiNumber = SPECIFIC_KMERS_NUMBER; asciiNumber = SPECIFIC_KMERS_NUMBER;
automaton->insert(originalBioReader.sequence(0),std::string("") + char(asciiNumber), true, 0, seed); automaton->insert(originalBioReader.sequence(0),std::string("") + char(asciiNumber), true, 0, seed);
indexes->push_back(0); indexes->push_back(0);
int previousAsciiNumber = asciiNumber;
int rawNumber = 0;
previousLabel = extractGeneName(originalBioReader.label(0)); previousLabel = extractGeneName(originalBioReader.label(0));
for(int i = 1;i < originalBioReader.size(); ++i){ for(int i = 1;i < originalBioReader.size(); ++i){
currentLabel = extractGeneName(originalBioReader.label(i)); currentLabel = extractGeneName(originalBioReader.label(i));
if(currentLabel != previousLabel){ if(currentLabel != previousLabel){
asciiNumber = SPECIFIC_KMERS_NUMBER + 1 + (int) rawNumber / keys_compress;
rawNumber++;
}
if (asciiNumber > previousAsciiNumber)
{
indexes->push_back(i); indexes->push_back(i);
asciiNumber++; previousAsciiNumber = asciiNumber;
} }
if(asciiNumber > 127){ if(asciiNumber > 127){
cerr << WARNING_STRING << "Pre-filtering disabled" << endl; cerr << WARNING_STRING << "Pre-filtering disabled" << endl;
......
...@@ -19,7 +19,7 @@ class FilterWithACAutomaton { ...@@ -19,7 +19,7 @@ class FilterWithACAutomaton {
/* The size of the BioReader returned after filtering.*/ /* The size of the BioReader returned after filtering.*/
int filtered_sequences_nb; int filtered_sequences_nb;
FilterWithACAutomaton(BioReader &origin, string seed); FilterWithACAutomaton(BioReader &origin, string seed, float keys_compress=1.0);
~FilterWithACAutomaton(); ~FilterWithACAutomaton();
...@@ -93,7 +93,7 @@ class FilterWithACAutomaton { ...@@ -93,7 +93,7 @@ class FilterWithACAutomaton {
The param "seed" is used while inserting sequences in the automaton. By default The param "seed" is used while inserting sequences in the automaton. By default
the seed has a size of 10. the seed has a size of 10.
*/ */
void buildACAutomatonToFilterBioReader(string seed); void buildACAutomatonToFilterBioReader(string seed, float keys_compress);
/** /**
* Return the vector of indexes used while building the automaton. * Return the vector of indexes used while building the automaton.
......
...@@ -26,7 +26,7 @@ void Germline::init(string _code, char _shortcut, ...@@ -26,7 +26,7 @@ void Germline::init(string _code, char _shortcut,
affect_5 = string(1, toupper(shortcut)) + "-" + code + "V"; affect_5 = string(1, toupper(shortcut)) + "-" + code + "V";
affect_4 = string(1, 14 + shortcut) + "-" + code + "D"; affect_4 = string(1, 14 + shortcut) + "-" + code + "D";
affect_3 = string(1, tolower(shortcut)) + "-" + code + "J"; affect_3 = string(1, tolower(shortcut)) + "-" + code + "J";
filter_5 = build_automaton ? new FilterWithACAutomaton(rep_5, this->seed_5) : nullptr; filter_5 = build_automaton ? new FilterWithACAutomaton(rep_5, this->seed_5, KEYS_COMPRESS) : nullptr;
} }
......
...@@ -33,6 +33,7 @@ enum SEGMENTATION_METHODS { ...@@ -33,6 +33,7 @@ enum SEGMENTATION_METHODS {
#define PSEUDO_NOT_ANALYZED "not analyzed" #define PSEUDO_NOT_ANALYZED "not analyzed"
#define PSEUDO_NOT_ANALYZED_CODE 'z' #define PSEUDO_NOT_ANALYZED_CODE 'z'
#define KEYS_COMPRESS 1.65 // enough for ~208 *01 genes (191 IGHV + ...) / 127
using namespace std; using namespace std;
using json = nlohmann::json; using json = nlohmann::json;
......
...@@ -470,16 +470,14 @@ void sigintHandler(int sig_num) ...@@ -470,16 +470,14 @@ void sigintHandler(int sig_num)
} }
#pragma GCC diagnostic pop #pragma GCC diagnostic pop
/*
Return the part of label before the star
For example:
IGHV5-51*01 -> IGHV5-51
If there is no star in the name, the whole label is returned.
IGHV10-40 -> IGHV10-40
*/
string extractGeneName(string label){ string extractGeneName(string label){
string result; string result;
size_t pipe_pos = label.find("|");
if (pipe_pos != string::npos) {
label = label.substr(pipe_pos+1);
}
size_t star_pos; size_t star_pos;
star_pos = label.rfind("*"); star_pos = label.rfind("*");
if(star_pos != string::npos){ if(star_pos != string::npos){
......
...@@ -108,9 +108,10 @@ extern bool global_interrupted; ...@@ -108,9 +108,10 @@ extern bool global_interrupted;
void sigintHandler(int sig_num); void sigintHandler(int sig_num);
/* /*
Extract the gene name from a label. This take the whole part Extract the gene name from a label.
before the star and returns it. If there is no star in the If there is a pipe '|', consider only what is after the (first) pipe.
name the whole label is returned. If there is a star '*', consider only what is before the star
M99686|IGHV5-51*01|Homo sapiens|... -> IGHV5-51
IGHV-01*05 -> IGHV-01 IGHV-01*05 -> IGHV-01
IGHV-7500AB -> IGHV-7500AB IGHV-7500AB -> IGHV-7500AB
*/ */
......
...@@ -5,7 +5,8 @@ ...@@ -5,7 +5,8 @@
# Most sequences are not reported as 'only V', but as 'too few V/J' # Most sequences are not reported as 'only V', but as 'too few V/J'
1: UNSEG only V/.* 1 1: UNSEG only V/.* 1
1: UNSEG too few V/J .* 11 # see #4730
f1: UNSEG too few V/J .* 11
1: UNSEG too few V/J .* 9
# upstream (19 bp) + IGHD7-27*01 (11 bp) + 92 bp + start of IGHJ1*01 (39 bp)
>NC_000014.9:c105865580-105865420 Homo sapiens chromosome 14, GRCh38.p12 Primary Assembly >NC_000014.9:c105865580-105865420 Homo sapiens chromosome 14, GRCh38.p12 Primary Assembly
TGAGCTGAGAACCACTGTGCTAACTGGGGACACAGTGATTGGCAGCTCTACAAAAACCATGCTCCCCCGG
GACCCCGGGCTGTGGGTTTCTGTAGCCCCTGGCTCAGGGCTGACTCACCGTGGCTGAATACTTCCAGCAC TGAGCTGAGAACCACTGTG
TGGGGCCAGGGCACCCTGGTC ctaactgggga
CACAGTGATTGGCAGCTCTACAAAAACCATGCTCCCCCGG
GACCCCGGGCTGTGGGTTTCTGTAGCCCCTGGCTCAGGGC
TGACTCACCGTG
gctgaatacttccagcactggggccagggcaccctggtc
...@@ -3,9 +3,9 @@ ...@@ -3,9 +3,9 @@
# Sequences outside any V(D)J locus # Sequences outside any V(D)J locus
>too_few_vj-1 >too_few_vj-1
CTAGGCATGGCTCCTCTCCACAGGAAAACTCCACTCCAGTGCTCAGCTTGCACCCTGGCACAGGCCAGCAGTTGCTGGAAGTCAGACACCTGTGAAGAAC CTAGGCATGGCTCCTCTCCACAGGAAAACTCCACTCCAGTGCTCAGCTTGCAaCCTGGCACAGGCCAGCAGTTGaTGGAAGTCcGACACCTGTGAAGAAC
>too_few_vj-2 >too_few_vj-2 #4728
GCCTCAGGCCAGCCTTCCGCTCCTTGAAGCTGGTCTCCGCACAGTGCTGGTTCCGTCACCCCCACCCAGGGAAGCAGGTCTGAGCAGCTTGTCCTGGCTG GCCTCAGGCCAGCCTTCCGCTCCTTGAAGCTGGTCTCCGCACAGTGCTGGTTCCGTCACCCCCACCCAGGGAAGCAGGTCTGAGCAGCTTGTCCTGGCTG
>too_few_vj-3 >too_few_vj-3
......
...@@ -37,4 +37,4 @@ cttgggcagttggaacaacACTTGTTGTCACAG ...@@ -37,4 +37,4 @@ cttgggcagttggaacaacACTTGTTGTCACAG
gaattattataagaaactctttggcagtggaacaacactggttgtcac gaattattataagaaactctttggcagtggaacaacactggttgtcac
>seq-seg-no-window >seq-seg-no-window
ctgggacaggGAATTATTAT ctgggacaggGAATTATTATA
...@@ -3,21 +3,33 @@ gcaccatcagagagagatgaagggtcttactactgtgcctgtgacacc ...@@ -3,21 +3,33 @@ gcaccatcagagagagatgaagggtcttactactgtgcctgtgacacc
acaccgataaactcatctttggaaaaggaacccgtgtgactgtggaaccaa acaccgataaactcatctttggaaaaggaacccgtgtgactgtggaaccaa
# without DD2 upstream, the recombination can not be detected, 'TODO' is expected here
>TRDD2*01_0//0_TRDJ1*01 [TRD+] TODO >TRDD2*01_0//0_TRDJ1*01 [TRD+] TODO
ccttcctac ccttcctac
acaccgataaactcatctttggaaaaggaacccgtgtgactgtggaaccaa acaccgataaactcatctttggaaaaggaacccgtgtgactgtggaaccaa
# same sequence, with DD2 upstream # same sequence, with some DD2 upstream
>TRDD2*01_0//0_TRDJ1*01 [TRD+]
TGTGccttcctac
acaccgataaactcatctttggaaaaggaacccgtgtgactgtggaaccaa
# same sequence, with more DD2 upstream
>TRDD2*01_0//0_TRDJ1*01 [TRD+] >TRDD2*01_0//0_TRDJ1*01 [TRD+]
TGTGTTTCATTGTGccttcctac TGTGTTTCATTGTGccttcctac
acaccgataaactcatctttggaaaaggaacccgtgtgactgtggaaccaa acaccgataaactcatctttggaaaaggaacccgtgtgactgtggaaccaa
# without DD2 upstream, the recombination can not be detected, 'TODO' is expected here
>TRDD2*01_0//0_TRDD3*01 [TRD+] TODO >TRDD2*01_0//0_TRDD3*01 [TRD+] TODO
ccttcctac ccttcctac
actgggggatacg actgggggatacg
# same sequence, with DD2 upstream and DD3 downstream # same sequence, with some DD2 upstream
>TRDD2*01_0//0_TRDD3*01 [TRD+]
GTGccttcctac
actgggggatacg
# same sequence, with more DD2 upstream and DD3 downstream
>TRDD2*01_0//0_TRDD3*01 [TRD+] >TRDD2*01_0//0_TRDD3*01 [TRD+]
TGTGTTTCATTGTGccttcctac TGTGTTTCATTGTGccttcctac
actgggggatacgCACAGTGCTA actgggggatacgCACAGTGCTA
......
...@@ -2,94 +2,117 @@ ...@@ -2,94 +2,117 @@
!LAUNCH: (cd $VIDJIL_DIR/germline ; md5sum */*.fa || md5 -r */*.fa) !LAUNCH: (cd $VIDJIL_DIR/germline ; md5sum */*.fa || md5 -r */*.fa)
$ Check md5 in germline/, sequences split and processed from germline and other databases $ Check md5 in germline/, sequences split and processed from germline and other databases
1:b64be21f03b290c6850ce2cb2f1d6f02 homo-sapiens/IGHD.fa 1:e749633dffa7c87299b46de3c81d8cfe gallus-gallus/IGHD.fa
1:6a64f90ea2d19721d410c545e9d0bb9e homo-sapiens/IGHJ.fa 1:c3738ee3e9fcc3c4cd242fc66df00299 gallus-gallus/IGHJ.fa
1:73e6adab4faeb48b4fc2904e3a6e90f8 homo-sapiens/IGHV.fa 1:1ddfe402c2f539ba297992a51dce322e gallus-gallus/IGHV.fa
1:84f820d3e567520266a1a5f0d28b37fc gallus-gallus/IGLJ.fa
1:d6b431de0803ab21db803f87fc838bf2 gallus-gallus/IGLV.fa
1:709721d770caefe417763c919ee2eeb1 homo-sapiens/IGHD.fa
1:b3f1ca96a21616dc5544584a7a7d8e14 homo-sapiens/IGHJ.fa
1:d70f82b45cce4e4571c8c7ef2a3c873e homo-sapiens/IGHV.fa
1:0367825e404f753f0890b8f52aec7502 homo-sapiens/IGKJ.fa 1:0367825e404f753f0890b8f52aec7502 homo-sapiens/IGKJ.fa
1:820188b335764f0eb04578ea35bbb143 homo-sapiens/IGKV.fa 1:5792d273ae377b1af2e69a3a8b47b14b homo-sapiens/IGKV.fa
1:af257e110cf1ec6c38457af82a8118aa homo-sapiens/IGLJ.fa 1:af257e110cf1ec6c38457af82a8118aa homo-sapiens/IGLJ.fa
1:d6fa96cbc4de984729154c9d2217e3d2 homo-sapiens/IGLV.fa 1:ebbe9236648bb903e9fec44752227a8a homo-sapiens/IGLV.fa
1:f0c43d7b0074e155aef411ea7353c23e homo-sapiens/TRAJ.fa 1:d50f2b2289fe2ca2e7139a47f6c68d3c homo-sapiens/TRAJ.fa
1:935d588445e94d575c412cb3699b6bff homo-sapiens/TRAV.fa 1:e2545c70e6b16c4076581becac5147be homo-sapiens/TRAV.fa
1:5b74170b9c45b9243558941bf07666ff homo-sapiens/TRBD.fa 1:5b74170b9c45b9243558941bf07666ff homo-sapiens/TRBD.fa
1:b9f8390d1d18a9ef5db9ab6875e196f1 homo-sapiens/TRBJ.fa 1:34015ed76474e4e5539ab4a76abee4c1 homo-sapiens/TRBJ.fa
1:6540dc6a0e4f84de208a8b1f5d9fa981 homo-sapiens/TRBV.fa 1:be0a91d9e0a571c1cef3cca3ebc38bd4 homo-sapiens/TRBV.fa
1:7f9fe8eaf781cf87453c157a771d5aaf homo-sapiens/TRDD.fa 1:7f9fe8eaf781cf87453c157a771d5aaf homo-sapiens/TRDD.fa
1:e50fc3c2f786f0b5a2b6fb5834dd3814 homo-sapiens/TRDJ.fa 1:e50fc3c2f786f0b5a2b6fb5834dd3814 homo-sapiens/TRDJ.fa
1:36dbb85d696634b8ee6ba0dc7500af85 homo-sapiens/TRDV.fa 1:bb5815c6edb747f593edd9414ed1deb6 homo-sapiens/TRDV.fa
1:552cbc4883f3524fdc871fc156ecfdde homo-sapiens/TRGJ.fa 1:62651e35ba6725757e30113647b5ab5f homo-sapiens/TRGJ.fa
1:e50df5cf337648190bc59609fe5fb5e0 homo-sapiens/TRGV.fa 1:ee4053a508b3ed4820799135cc50e554 homo-sapiens/TRGV.fa
1:6f585362d727b243dd284ede15118670 mus-musculus/IGHD.fa 1:b3bca64b9f9a9891b70590a32aed9baa mus-musculus/IGHD.fa
1:218632e3e3c4fcd4c9ca7281e8976e3a mus-musculus/IGHJ.fa 1:218632e3e3c4fcd4c9ca7281e8976e3a mus-musculus/IGHJ.fa
1:d389ee1e204699942ec1185fad8cd3df mus-musculus/IGHV.fa 1:85d7fbbe3372dbd0d1238089fe555a7c mus-musculus/IGHV.fa
1:47568ad1ee648410b734c3b33f4a9eea mus-musculus/IGKJ.fa 1:47568ad1ee648410b734c3b33f4a9eea mus-musculus/IGKJ.fa
1:e2774dbad5e73f3b0c1e46aeab285baa mus-musculus/IGKV.fa 1:b1225a12bb1be697517b6a45e039df2b mus-musculus/IGKV.fa
1:750276a78c3b55f378449ecdcb9c3f78 mus-musculus/IGLJ.fa 1:750276a78c3b55f378449ecdcb9c3f78 mus-musculus/IGLJ.fa
1:4569e14c967a09a4313532f98b7c23f2 mus-musculus/IGLV.fa 1:4569e14c967a09a4313532f98b7c23f2 mus-musculus/IGLV.fa
1:2254199cc1a79035e9e0960cc054e9f5 mus-musculus/TRAJ.fa 1:ade032044c85866d7ef93499729e74e7 mus-musculus/TRAJ.fa
1:f32b1dea13dfeb372719d5754d7a1e38 mus-musculus/TRAV.fa 1:5b29aa356acb97c45d2c433d4eda3798 mus-musculus/TRAV.fa
1:28f7e79cf365e2e761990104a6f9f008 mus-musculus/TRBD.fa 1:28f7e79cf365e2e761990104a6f9f008 mus-musculus/TRBD.fa
1:b5130bb3d46fe1b8e0d04d79f9a706d4 mus-musculus/TRBJ.fa 1:b5130bb3d46fe1b8e0d04d79f9a706d4 mus-musculus/TRBJ.fa
1:839fcda41f0217c27c6e283616044a88 mus-musculus/TRBV.fa 1:d11e3c9b8c51d285fb73ca69abae9c0c mus-musculus/TRBV.fa
1:9d78372e69a7717a25265ed63b31789d mus-musculus/TRDD.fa 1:9d78372e69a7717a25265ed63b31789d mus-musculus/TRDD.fa
1:f030e2b8844f0751b8026d82a0efe361 mus-musculus/TRDJ.fa 1:f030e2b8844f0751b8026d82a0efe361 mus-musculus/TRDJ.fa
1:21a12a2e97ce204f1197e8337e910c62 mus-musculus/TRDV.fa 1:5a1f5e3b7062dd1344b51f87cd1aea3d mus-musculus/TRDV.fa
1:d382cbae8cfa86239b781f2e67034eb9 mus-musculus/TRGJ.fa 1:d382cbae8cfa86239b781f2e67034eb9 mus-musculus/TRGJ.fa
1:982c0fc1208d066bc028621e94d2b466 mus-musculus/TRGV.fa 1:982c0fc1208d066bc028621e94d2b466 mus-musculus/TRGV.fa
1:d55f4acf266d3bae4a7f9b3aa1881abc rattus-norvegicus/IGHD.fa 1:d55f4acf266d3bae4a7f9b3aa1881abc rattus-norvegicus/IGHD.fa
1:07fa1dbe7a70f34c9a4e42ba9e9d7ca1 rattus-norvegicus/IGHJ.fa 1:07fa1dbe7a70f34c9a4e42ba9e9d7ca1 rattus-norvegicus/IGHJ.fa
1:877b03603ec7e2a99d10184c3636f635 rattus-norvegicus/IGHV.fa 1:9f0bb196fda4073e1e69d8b2799e0fb1 rattus-norvegicus/IGHV.fa
1:c5ca90bea438f929308c5d57dfb1dc6b rattus-norvegicus/IGKV.fa 1:c5ca90bea438f929308c5d57dfb1dc6b rattus-norvegicus/IGKV.fa
1:96bc9d75e6b072e5d643c195ed562497 rattus-norvegicus/IGLJ.fa 1:96bc9d75e6b072e5d643c195ed562497 rattus-norvegicus/IGLJ.fa
1:7e5c54685597270f333b41b70aef74ad rattus-norvegicus/IGLV.fa 1:7e5c54685597270f333b41b70aef74ad rattus-norvegicus/IGLV.fa
1:7f142ad18501018b200f3ad23996aea4 sus-scrofa/IGHD.fa
1:517a9fbb2b504af7daa51298e6653d84 sus-scrofa/IGHJ.fa
1:c4c54e9f779e85a8991d0bd837e047b7 sus-scrofa/IGHV.fa
1:f332aec9ffc0cf4726fae65a7f457a0e sus-scrofa/IGKJ.fa
1:f6207c360c4d527238931d22de781693 sus-scrofa/IGKV.fa
1:5c87360043c5827f3f3d2f3675799aa8 sus-scrofa/IGLJ.fa
1:3963266e4e77b54beb791e6b223bc07c sus-scrofa/IGLV.fa
1:9500ddf75661750847ba78c35997bdba sus-scrofa/TRBD.fa
1:bdfaa02564c514606719d895ee5d9e55 sus-scrofa/TRBJ.fa
1:09b9c7883fa22098410dad284c0bc0f2 sus-scrofa/TRBV.fa
$ Check md5 in germline/, other sequences $ Check md5 in germline/, other sequences
1:957b46da4114a1ed66f2e9b5d06aff2a homo-sapiens/CD.fa 1:8f6428463ffd8ba9bbc44117e6c8f44f gallus-gallus/IGHC=A.fa
1:9112d6975669ccb59970fa79ceef599d homo-sapiens/CD-sorting.fa 1:209836b81e476af167093bd211ed8f8a gallus-gallus/IGHC=M.fa
1:eb32e780af5a4b8c0d1e9d780bacac43 homo-sapiens/IGHC=A1.fa 1:4761d471e62fa09fcad4cd4733849bdd gallus-gallus/IGHC=Y.fa
1:e749633dffa7c87299b46de3c81d8cfe gallus-gallus/IGHD.up.fa
1:c3738ee3e9fcc3c4cd242fc66df00299 gallus-gallus/IGHJ.down.fa
1:84f820d3e567520266a1a5f0d28b37fc gallus-gallus/IGLJ.down.fa
1:af41c26c0b8f7703e620bde67dc9071f homo-sapiens/CD.fa
1:f8590ae92973e11ffbfffc7c4821eb96 homo-sapiens/CD-sorting.fa
1:ae554d1a1a8fdc1e19ed6b84c28fc4d1 homo-sapiens/IGHC=A1.fa
1:b1ea36c4255c63d775ecdb03967ec89e homo-sapiens/IGHC=A2.fa 1:b1ea36c4255c63d775ecdb03967ec89e homo-sapiens/IGHC=A2.fa
1:43c54f3ddedfde87f70b0e0bea2d6d5f homo-sapiens/IGHC=D.fa 1:43c54f3ddedfde87f70b0e0bea2d6d5f homo-sapiens/IGHC=D.fa
1:08eefd2a7ef02bd51fff8cdb8df885a5 homo-sapiens/IGHC=E.fa 1:9801ddd8ac1e92d31b0e5f6bd2a4c81b homo-sapiens/IGHC=E.fa
1:1b35f9976696e50bdd614cdcdaabb3d5 homo-sapiens/IGHC=G1.fa 1:42d0de00d032003a7f81f04ab7083c7d homo-sapiens/IGHC=EP1.fa
1:da53f3bfeabd7f9025cb621d0ce65740 homo-sapiens/IGHC=G2.fa 1:e75afc5d4faef2db85cd1fd162a36f04 homo-sapiens/IGHC=EP2.fa
1:85deb421f44fdd01d50c6979c8c21de1 homo-sapiens/IGHC=G3.fa 1:f823495f04bb21a3c574f8a7bb122731 homo-sapiens/IGHC=G1.fa
1:d5123a856e2dedacfcfcd5fd11ebc2bc homo-sapiens/IGHC=G4.fa 1:c09f24dfdc80354367e7c0b124e3e69f homo-sapiens/IGHC=G2.fa
1:6c1e3e61d5824a03f71a5527ccdbb35c homo-sapiens/IGHC=GP.fa 1:012c6296db4088b500fc1f6304ad6cb9 homo-sapiens/IGHC=G3.fa
1:dd2263230c64abb9a22f5f8b7a979275 homo-sapiens/IGHC=M.fa 1:2e77b8d0211efa73056e544851292299 homo-sapiens/IGHC=G4.fa
1:f42358bad835968bdb934f3269d6a059 homo-sapiens/IGHD.up.fa 1:7895e53db66d62a3c8ab4dbe22badf0f homo-sapiens/IGHC=GP.fa
1:d130c72bbcd819c3967a801b34d72880 homo-sapiens/IGHJ.down.fa 1:1a780d50f3a3a01d658a70711f77cb30 homo-sapiens/IGHC=M.fa
1:c9225cb408d44457ffa55ad94673260c homo-sapiens/IGHD.up.fa
1:248b4d7edf22bbb1cf0cdbb29c40779e homo-sapiens/IGHJ.down.fa
1:7d6247262807ad5478b7a4e52a59c568 homo-sapiens/IGK-INTRON.fa 1:7d6247262807ad5478b7a4e52a59c568 homo-sapiens/IGK-INTRON.fa
1:1db120e1709fdc20f4f87da49c6dbf01 homo-sapiens/IGKJ.down.fa
1:1147a04c4e8a8dd534aae65db1ae13ca homo-sapiens/IGK-KDE.fa 1:1147a04c4e8a8dd534aae65db1ae13ca homo-sapiens/IGK-KDE.fa
1:57796ec5e54fd9e83ab0a21ebd3c5427 homo-sapiens/IGKJ.down.fa 1:6e4f346dcd13c2d6d2ca6d977baf27c7 homo-sapiens/IGLJ.down.fa
1:87bfb83aafaabc066f68e90252acc245 homo-sapiens/IGLJ.down.fa 1:85e5405f33cb06a791823fb118c185bd homo-sapiens/TRAJ.down.fa
1:785430ce269ad6f8b0568815db3520bd homo-sapiens/TRAJ.down.fa 1:425f89eeb5b10e6c43a6188821ccb3cf homo-sapiens/TRBD.up.fa
1:04cecd3f430f41637483f9a39a257930 homo-sapiens/TRBD.up.fa 1:e5336d3a684811c103e98a65dfeceb4b homo-sapiens/TRBJ.down.fa
1:8345f39003090908e8812c632c82faa6 homo-sapiens/TRBJ.down.fa 1:7b05452a3f2ed14ec8aef817d2e72576 homo-sapiens/TRDD2.up.fa
1:fb4303727d2713a17882bcc2518bfa6e homo-sapiens/TRDD.up.fa 1:a9ca08c7c399072ca8061ea590431d67 homo-sapiens/TRDD3.down.fa
1:415f4d57ea81e3bf3c726df6455df0ea homo-sapiens/TRDD2.up.fa 1:0a84d2a0fb4c270e93e9ed125dfce114 homo-sapiens/TRDD.up.fa
1:4b2414f77dd03fc93b84d3e01ca3a6e4 homo-sapiens/TRDD3.down.fa 1:0353a5f20aa88917b49203b5c02b5064 homo-sapiens/TRDJ.down.fa
1:d68b0bf8ef69e57599339ef82654f92e homo-sapiens/TRDJ.down.fa 1:a296dfd39749e405d48a99ded8b632d2 homo-sapiens/TRGJ.down.fa
1:35ad2bde761082b71b5cecb1bdc5560a homo-sapiens/TRGJ.down.fa 1:72ea26b35dbd1170fb4998d03f739ef8 mus-musculus/IGHC=A.fa
1:48f54c5af4f8d748ab2f02b2a6ea3370 mus-musculus/IGHC=A.fa
1:25cb0451d31ca5337d462eb648914fe7 mus-musculus/IGHC=D.fa 1:25cb0451d31ca5337d462eb648914fe7 mus-musculus/IGHC=D.fa
1:8f2ec40bc77a4e7f0c7b19655812ddbc mus-musculus/IGHC=E.fa 1:8f2ec40bc77a4e7f0c7b19655812ddbc mus-musculus/IGHC=E.fa
1:f5a37575fa7003b31dbc0526ede9200d mus-musculus/IGHC=G1.fa 1:8d80b5406bb2857da36576095f8ca702 mus-musculus/IGHC=G1.fa
1:a628bf350fdb667fe35546c8ba8a0ff3 mus-musculus/IGHC=G2A.fa 1:a628bf350fdb667fe35546c8ba8a0ff3 mus-musculus/IGHC=G2A.fa
1:f3bd58c436dd8f4b7e4f11f408a8ce5d mus-musculus/IGHC=G2B.fa 1:f3bd58c436dd8f4b7e4f11f408a8ce5d mus-musculus/IGHC=G2B.fa
1:03cd9b5729ec3de23b302c476a51c9d3 mus-musculus/IGHC=G2C.fa 1:03cd9b5729ec3de23b302c476a51c9d3 mus-musculus/IGHC=G2C.fa
1:744e59a4b5d1fc53beae91a0bee8f34a mus-musculus/IGHC=G3.fa 1:744e59a4b5d1fc53beae91a0bee8f34a mus-musculus/IGHC=G3.fa
1:b72bc2ed53226dd68519bb0cb679f608 mus-musculus/IGHC=M.fa 1:b72bc2ed53226dd68519bb0cb679f608 mus-musculus/IGHC=M.fa
1:6f585362d727b243dd284ede15118670 mus-musculus/IGHD.up.fa 1:b3bca64b9f9a9891b70590a32aed9baa mus-musculus/IGHD.up.fa
1:966da2d97bd49269fbbeab792e104bb6 mus-musculus/IGHJ.down.fa 1:6e1c570e0e7c47b6d970ee7360fd8d6f mus-musculus/IGHJ.down.fa
1:ebd3f089c352f9a095ceef9f6486a958 mus-musculus/IGKJ.down.fa 1:25e80d695e78452da46e398b08c64727 mus-musculus/IGKJ.down.fa
1:750276a78c3b55f378449ecdcb9c3f78 mus-musculus/IGLJ.down.fa 1:750276a78c3b55f378449ecdcb9c3f78 mus-musculus/IGLJ.down.fa
1:833e114425d61a04dbc5064988c5b35b mus-musculus/TRAJ.down.fa 1:aac472f2e1ba49ff63fc19747f82d16c mus-musculus/TRAJ.down.fa
1:28f7e79cf365e2e761990104a6f9f008 mus-musculus/TRBD.up.fa 1:28f7e79cf365e2e761990104a6f9f008 mus-musculus/TRBD.up.fa
1:b5130bb3d46fe1b8e0d04d79f9a706d4 mus-musculus/TRBJ.down.fa 1:b5130bb3d46fe1b8e0d04d79f9a706d4 mus-musculus/TRBJ.down.fa
1:9d78372e69a7717a25265ed63b31789d mus-musculus/TRDD.up.fa
1:07f2157d5e1d88e54404c34aa7cb9510 mus-musculus/TRDD2.up.fa 1:07f2157d5e1d88e54404c34aa7cb9510 mus-musculus/TRDD2.up.fa
1:9d78372e69a7717a25265ed63b31789d mus-musculus/TRDD.up.fa
1:f030e2b8844f0751b8026d82a0efe361 mus-musculus/TRDJ.down.fa 1:f030e2b8844f0751b8026d82a0efe361 mus-musculus/TRDJ.down.fa
1:7a8167084825ea164f97bb2bd5d01241 mus-musculus/TRGJ.down.fa 1:4dcc9211298695bf84dda74dc51eccab mus-musculus/TRGJ.down.fa
1:90075c7d51bd1d08892163848a8f2ecc rattus-norvegicus/IGHC=A.fa 1:90075c7d51bd1d08892163848a8f2ecc rattus-norvegicus/IGHC=A.fa
1:6cf27ae32f98ec8f912e0d07b46a2139 rattus-norvegicus/IGHC=D.fa 1:6cf27ae32f98ec8f912e0d07b46a2139 rattus-norvegicus/IGHC=D.fa
1:8461bb440ad2264eb638d948bda186ab rattus-norvegicus/IGHC=E.fa 1:8461bb440ad2264eb638d948bda186ab rattus-norvegicus/IGHC=E.fa
...@@ -101,3 +124,21 @@ $ Check md5 in germline/, other sequences ...@@ -101,3 +124,21 @@ $ Check md5 in germline/, other sequences
1:d55f4acf266d3bae4a7f9b3aa1881abc rattus-norvegicus/IGHD.up.fa 1:d55f4acf266d3bae4a7f9b3aa1881abc rattus-norvegicus/IGHD.up.fa
1:07fa1dbe7a70f34c9a4e42ba9e9d7ca1 rattus-norvegicus/IGHJ.down.fa 1:07fa1dbe7a70f34c9a4e42ba9e9d7ca1 rattus-norvegicus/IGHJ.down.fa
1:96bc9d75e6b072e5d643c195ed562497 rattus-norvegicus/IGLJ.down.fa 1:96bc9d75e6b072e5d643c195ed562497 rattus-norvegicus/IGLJ.down.fa
1:696957833fb4591506edc89ad8cadb23 sus-scrofa/IGHC=A.fa
1:692132ec79c972b8d3ca7b7a11d3bb62 sus-scrofa/IGHC=D.fa
1:6149d53fe586c3ea88a49761be09d5c4 sus-scrofa/IGHC=E.fa
1:46211082d0ce38d222a62a1bd03b8d41 sus-scrofa/IGHC=G1.fa
1:aa963a502eed1732ac3900c42f68aeb4 sus-scrofa/IGHC=G2.fa
1:a8bbe1fd20ae86ce5e2c9cffe862cfeb sus-scrofa/IGHC=G3.fa
1:3147109ed4864c7dddd6959810266799 sus-scrofa/IGHC=G4.fa
1:801c30b05161dba0566ccd597900eddc sus-scrofa/IGHC=G5-1.fa
1:64c2bbd3bf9879231533628294211781 sus-scrofa/IGHC=G5-2.fa
1:dacb595961680617a5d4092ab9465453 sus-scrofa/IGHC=G6-1.fa
1:f1f0467a568aadc3e7ce4400aa5c27ba sus-scrofa/IGHC=G6-2.fa
1:88018d59946930409050d2cea47ac98b sus-scrofa/IGHC=M.fa
1:7f142ad18501018b200f3ad23996aea4 sus-scrofa/IGHD.up.fa
1:517a9fbb2b504af7daa51298e6653d84 sus-scrofa/IGHJ.down.fa
1:f332aec9ffc0cf4726fae65a7f457a0e sus-scrofa/IGKJ.down.fa
1:5c87360043c5827f3f3d2f3675799aa8 sus-scrofa/IGLJ.down.fa
1:9500ddf75661750847ba78c35997bdba sus-scrofa/TRBD.up.fa
1:bdfaa02564c514606719d895ee5d9e55 sus-scrofa/TRBJ.down.fa
...@@ -7,5 +7,5 @@ ...@@ -7,5 +7,5 @@
$ Same url with "get-saved-germline" (using "germline_id") and inside "homo-sapiens.g" $ Same url with "get-saved-germline" (using "germline_id") and inside "homo-sapiens.g"
1:Diff: 0 1:Diff: 0
$ This url has about 44 characters $ This url has 52 characters
1: 4[45] url-1 1: 52 url-1
!LAUNCH: $VIDJIL_DIR/$EXEC -c clones -z 2 -3 -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH $VIDJIL_DATA/Stanford_S22.fasta > /dev/null ; cat out/Stanford_S22.tsv !LAUNCH: $VIDJIL_DIR/$EXEC -c clones -z 2 -3 -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH $VIDJIL_DATA/Stanford_S22.fasta > /dev/null ; cat out/Stanford_S22.tsv
$ There are four lines, all with tabs $ There are at least four lines, all with tabs
4: >4:
4:\t >4:\t
$ The required AIRR fields are present $ The required AIRR fields are present
:v_call d_call j_call :v_call d_call j_call
...@@ -22,12 +22,12 @@ $ Optional AIRR fields ...@@ -22,12 +22,12 @@ $ Optional AIRR fields
$ Custom AIRR fields $ Custom AIRR fields
:warnings :warnings
$ Three clones on IGH $ At least three clones on IGH
3:IGH >3:IGH
$ One clone has 8 reads, two clones have 5 reads $ One clone has 8 reads, two clones have 5 reads
1:IGH 8 1:IGH 8
2:IGH 5 >2:IGH 5
$ Two clones are productive (productive, vj_in_frame, NOT stop_codon), no clone is labeled as non-productive $ Two clones are productive (productive, vj_in_frame, NOT stop_codon), no clone is labeled as non-productive
2:\tT\tT\tF\t 2:\tT\tT\tF\t
...@@ -50,8 +50,8 @@ bf1: 1 72 91 102 103 147 ...@@ -50,8 +50,8 @@ bf1: 1 72 91 102 103 147
$ cdr3 start/end positions of the first clone $ cdr3 start/end positions of the first clone
b1: 69 137 b1: 69 137
$ V/J e-values $ V/J e-values of the first clone
rb1: 0[.][0-9]*e.00 7[.][0-9]*e[-]76 rb1: 1[.][0-9]*e[-]67 8[.][0-9]*e[-]25
$ The first clone has one warning $ The first clone has one warning
1:TATTACTGTACCCGGGAGGAACAATATAGCAGCTGGTACTTTGACTTCTG .* W69 1:TATTACTGTACCCGGGAGGAACAATATAGCAGCTGGTACTTTGACTTCTG .* W69
......
...@@ -5,22 +5,22 @@ $ Germlines are custom ...@@ -5,22 +5,22 @@ $ Germlines are custom
1: custom germlines 1: custom germlines
$ Parses IGHV.fa germline $ Parses IGHV.fa germline
1: 104369 bp in 357 sequences 1: 156380 bp in 550 sequences
$ Parses IGHD.fa germline $ Parses IGHD.fa germline
1: 1070 bp in 44 sequences 1: 1070 bp in 44 sequences
$ Parses germline/homo-sapiens/IGHJ.fa $ Parses IGHJ.fa germline
1: 701 bp in 13 sequences 1: 1034 bp in 19 sequences
$ Find the good index loads $ Find the good index loads
1:custom .* 0.078% l13 k12 .* 0.002% l13 k12 1:custom .* 0.149% l13 k12 .* 0.003% l13 k12
$ Find approximately the good number of sequences for e-value computation $ Find approximately the good number of sequences for e-value computation
1: approx. 131.. sequences 1: approx. 131.. sequences
$ Find the good number of windows in Stanford S22 $ Find the good number of windows in Stanford S22
1: found 10767 windows in 13153 reads