Commit c8ca20a3 authored by Mathieu Giraud's avatar Mathieu Giraud

Merge branch 'feature-ag/3372-imgt-sequence-differences' into 'dev'

Use same sequences for all germlines

Closes #3372 and #3373

See merge request !250
parents 8694c08c e94203f7
Pipeline #33102 passed with stages
in 49 seconds
>TRDD2 28/CAA/2 TRDD3 [TRD+]
AGCGGGTGGTGATGGCAAAGTGCCAAGGAAAGGGAAAAAGGAAGAAGAGGGTTTTTATCAATGGGGGATACGCACAGTGCTACAAAACCTACAGAGACCTGTACAAAAACTGCAGGGGCAAAAGTGCCATTTCCCTGGGATATCCTCACCCTGGGTCCCA
!LAUNCH: $VIDJIL_DIR/$EXEC -e 1e-5 -g $VIDJIL_DIR/germline -r 1 bug3009.fa
$ Segmented as TRD+
1: TRD\+ -> 1
1: SEG_\+ -> 1
$ VDJ assignation
1:TRDD2.01 28/CAA/2 TRDD3.01
#!/bin/sh
cd $(dirname $0)
wget http://www.imgt.org/download/GENE-DB/IMGTGENEDB-GeneList
wget http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP
python split-from-imgt.py IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP IMGTGENEDB-GeneList
wget -N http://www.imgt.org/download/GENE-DB/IMGTGENEDB-GeneList
wget -N http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP
errors=$(tempfile)
python split-from-imgt.py IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP IMGTGENEDB-GeneList 2> $errors
wget -O IMGT_RELEASE http://www.imgt.org/download/GENE-DB/RELEASE
wget -N -P homo-sapiens http://vidjil.org/germline/IGK-INTRON.fa
wget -N -P homo-sapiens http://vidjil.org/germline/IGK-KDE.fa
cat $errors
rm -f $errors
......@@ -10,7 +10,7 @@ API_EUTILS = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
API_EUTILS += 'api_key='+os.environ['NCBI_KEY']+'&' if 'NCBI_KEY' in os.environ else ''
API_NUCCORE_ID = API_EUTILS + 'db=nuccore&rettype=fasta&retmode=text' + '&id=%s'
API_NUCCORE_ID_FROM_TO = API_EUTILS + 'db=nuccore&rettype=fasta&retmode=text' + '&id=%s' + '&from=%s&to=%s'
API_NUCCORE_ID_FROM_TO = API_EUTILS + 'db=nuccore&rettype=fasta&retmode=text' + '&id=%s' + '&from=%s&to=%s&strand=%d'
API_GENE_ID_XML = API_EUTILS + 'db=gene&retmode=xml&rettype=docsum' + '&id=%s'
......@@ -25,12 +25,19 @@ def get_gene_sequence(gene, other_gene_name, start, end, additional_length):
'''
Return the gene sequences between positions start and end (included).
'''
reversed = False
if end < start:
tmp = end
end = start
start = tmp
reversed = True
if additional_length > 0:
end += additional_length
elif additional_length < 0:
start = max(1, start + additional_length)
fasta_string = urllib.urlopen(API_NUCCORE_ID_FROM_TO % (gene, start, end)).read()
fasta_string = urllib.urlopen(API_NUCCORE_ID_FROM_TO % (gene, start, end, 2 if reversed else 1)).read()
return re.sub('(>\S*) ', r'\1|'+other_gene_name+'|', fasta_string)
def ncbi_and_write(ncbi, additional_header, outs):
......@@ -69,16 +76,9 @@ def get_updownstream_sequences(gene, start, end, additional_length):
end = start - 1 * reversed
start = max(1, start + additional_length * reversed)
if start > end:
tmp = start
start = end
end = tmp
updown_fasta = urllib.urlopen(API_NUCCORE_ID_FROM_TO % (gene, start, end)).read()
updown_fasta = get_gene_sequence(gene, '', start, end, 0)
updown_raw = '\n'.join(updown_fasta.split('\n')[1:]).strip()
if reversed == -1:
updown_raw = fasta.revcomp(updown_raw.upper())
if additional_length > 0:
return ('', updown_raw)
......
......@@ -10,6 +10,8 @@ import re
import ncbi
GENES_SEQ_FROM_NCBI = False
IMGT_LICENSE = '''
# To use the IMGT germline databases (IMGT/GENE-DB), you have to agree to IMGT license:
# academic research only, provided that it is referred to IMGT®,
......@@ -75,7 +77,7 @@ def get_gene_coord(imgt_line):
>>> line = '>X15272|TRGV4*01|Homo sapiens|F|V-REGION|406..705|300 nt|1| | | | |300+0=300| |rev-compl|'
>>> get_gene_coord(line)[0] == 'X15272'
True
>>> get_gene_coord(line)[1] == {'from': 406, 'to': 705, 'imgt_data': 'TRGV4*01|Homo sapiens|F|V-REGION', 'imgt_name': 'TRGV4*01', 'species': 'Homo sapiens'}
>>> get_gene_coord(line)[1] == {'from': 406, 'to': 705, 'imgt_data': 'TRGV4*01|Homo sapiens|F|V-REGION', 'imgt_name': 'TRGV4*01', 'species': 'Homo sapiens', 'seq': ''}
True
'''
elements = imgt_line.split('|')
......@@ -91,7 +93,8 @@ def get_gene_coord(imgt_line):
'to': int(end),
'species': elements[2],
'imgt_name': elements[1],
'imgt_data': '|'.join(elements[1:5])}
'imgt_data': '|'.join(elements[1:5]),
'seq': ''}
def paste_updown_on_fasta(fasta, up, down):
'''
......@@ -107,12 +110,35 @@ def paste_updown_on_fasta(fasta, up, down):
return lines[0]+'\n' + (up+'\n' if up else '') + '\n'.join(filter(None, lines[1:])) + '\n'\
+ (down+'\n' if down else '')
def check_imgt_ncbi_consistency(imgt_info, imgt_data, ncbi_target, ncbi_start, ncbi_end):
if abs(imgt_info['from'] - imgt_info['to']) != abs(ncbi_start - ncbi_end):
print >>sys.stderr,"WARNING: Length for %s differ between IMGT (%d) and NCBI (%d)" % (imgt_info['imgt_name'], abs(imgt_info['from'] - imgt_info['to'])+1, abs(ncbi_start - ncbi_end)+1)
else:
# Check that sequences are identical
ncbi_seq = ncbi.get_gene_sequence(ncbi_target, '', ncbi_start, ncbi_end, 0).split('\n')[1:]
gene_lines = imgt_data.split('\n')[1:]
if gene_lines[0].startswith('#'):
gene_lines = gene_lines[1:]
imgt_seq = ''.join(gene_lines).upper().replace('.', '')
ncbi_seq = ''.join(ncbi_seq).upper()
if imgt_seq != ncbi_seq:
print >>sys.stderr, "WARNING: Sequences for %s differ between IMGT and NCBI\n%s" % (imgt_info['imgt_name'], imgt_seq)
for i, letter in enumerate(ncbi_seq):
if letter == imgt_seq[i]:
sys.stderr.write('.')
else:
sys.stderr.write(letter)
sys.stderr.write('\n')
def store_data_if_updownstream(fasta_header, path, data, genes):
paths = [] # A given sequence can be stored in several files
for gene in gene_matches(fasta_header, genes):
gene_name, gene_coord = get_gene_coord(fasta_header)
if gene_name:
data[path+'/'+gene].append((gene_name, gene_coord))
paths.append(path+'/'+gene)
return paths
def retrieve_genes(f, genes, tag, additional_length, gene_list):
for info in genes:
......@@ -125,16 +151,26 @@ def retrieve_genes(f, genes, tag, additional_length, gene_list):
if gene_id:
try:
(target, start, end) = ncbi.get_gene_positions(gene_id)
print(coord, gene_id, target, start, end)
except KeyError:
print('! No positions for %s (%s: %s)' % (gene_id, gene, str(coord)))
allele_additional_length = additional_length
gene_id = None
# extract from gene
gene_data = ncbi.get_gene_sequence(gene, coord['imgt_data'] + tag, coord['from'], coord['to'], allele_additional_length)
# gene: is the name of the sequence where the VDJ gene was identified according to IMGT. The gene is just a part of the sequence
# gene_id: is the NCBI ID of the VDJ gene
# target: is the NCBI ID of the chromosome
if GENES_SEQ_FROM_NCBI:
gene_data = ncbi.get_gene_sequence(gene, coord['imgt_data'] + tag, coord['from'], coord['to'], allele_additional_length)
else:
# IMGT
gene_data = coord['seq']
if gene_id:
# Check consistency for *01 allele
if coord['imgt_name'].endswith('*01'):
check_imgt_ncbi_consistency(coord, gene_data, target, start, end)
up_down = ncbi.get_updownstream_sequences(target, start, end, additional_length)
# We put the up and downstream data before and after the sequence we retrieved previously
gene_data = paste_updown_on_fasta(gene_data, up_down[0], up_down[1])
......@@ -285,6 +321,7 @@ def split_IMGTGENEDBReferenceSequences(f, gene_list):
if ">" in l:
current_files = []
current_special = None
key_upstream, key_downstream = ([], [])
species = l.split('|')[2].strip()
feature = l.split('|')[4].strip()
......@@ -308,8 +345,8 @@ def split_IMGTGENEDBReferenceSequences(f, gene_list):
if system.startswith('IG') or system.startswith('TR'):
if feature in FEATURES_VDJ:
store_data_if_updownstream(l, path, downstream_data, DOWNSTREAM_REGIONS)
store_data_if_updownstream(l, path, upstream_data, UPSTREAM_REGIONS)
key_downstream = store_data_if_updownstream(l, path, downstream_data, DOWNSTREAM_REGIONS)
key_upstream = store_data_if_updownstream(l, path, upstream_data, UPSTREAM_REGIONS)
systems = get_split_files(seq, SPLIT_SEQUENCES)
if systems:
......@@ -322,6 +359,12 @@ def split_IMGTGENEDBReferenceSequences(f, gene_list):
current_special = verbose_open_w(name)
for key in key_downstream:
downstream_data[key][-1][1]['seq'] += l
for key in key_upstream:
upstream_data[key][-1][1]['seq'] += l
# Possibly gap J_REGION
if '>' not in l and current_files and feature == FEATURE_J_REGION:
......
......@@ -2,4 +2,4 @@
!LAUNCH: (cd $VIDJIL_DIR/germline ; grep -A2 -F 'IGHD2-2*02' homo-sapiens/IGHD+up.fa | tr -d '\n')
$ Correct sequence, with upstream
1:AGGATTTTGTGGGGGCTCGTGTCACTGTGA
i1:AGGATTTTGTGGGGGCTCGTGTCACTGTGA
......@@ -5,4 +5,4 @@ $ Correct full header, with TRDD2*01 identifier between pipes
f1: .*#TRDD2.01#.*Human T-cell receptor germline delta-chain D-region DNA
$ Correct sequence, with upstream
1: AAGAGGGTTTTTATACTGATGTGTTTCATTGTGCCTTCCTAC
i1: AAGAGGGTTTTTATACTGATGTGTTTCATTGTGCCTTCCTAC
......@@ -26,6 +26,9 @@ if not (sys.version_info >= (3, 4)):
print("Python >= 3.4 required")
sys.exit(1)
__version_info__ = ('1','0','0+dev')
__version__ = '.'.join(__version_info__)
import re
import argparse
import subprocess
......@@ -63,6 +66,7 @@ VAR_EXTRA = '$EXTRA'
MOD_TODO = 'f'
MOD_REGEX = 'r'
MOD_COUNT_ALL = 'w'
MOD_IGNORE_CASE = 'i'
MOD_BLANKS = 'b'
MOD_MULTI_LINES = 'l'
MOD_KEEP_LEADING_TRAILING_SPACES = 'z'
......@@ -151,6 +155,7 @@ MODIFIERS = [
(MOD_TODO, 'todo', 'consider that the test should fail'),
(MOD_REGEX, 'regex', 'consider as a regular expression'),
(MOD_COUNT_ALL, 'count-all', 'count all occurrences, even on a same line'),
(MOD_IGNORE_CASE, 'ignore-case', 'ignore case changes'),
(MOD_BLANKS, 'blanks', "ignore whitespace differences as soon as there is at least one space. Implies 'r'"),
(MOD_MULTI_LINES, 'multi-lines', 'search on all the output rather than on every line'),
(MOD_KEEP_LEADING_TRAILING_SPACES, 'ltspaces', 'keep leading and trailing spaces'),
......@@ -204,6 +209,9 @@ parser = ArgParser(description='Test command-line applications through .should f
%(prog)s demo/hello.should''',
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('--version', action='version',
version='%(prog)s {version}'.format(version=__version__))
options = ArgParser(fromfile_prefix_chars='@') # Can be used in !OPTIONS: directive
for p in (parser, options):
......@@ -526,7 +534,13 @@ class TestCase(TestCaseAbstract):
self.expression = self.expression.replace(' ', ' ')
self.expression = self.expression.replace(' ', '\s+')
self.mods.regex = True
self.regex = re.compile(self.expression) if self.mods.regex else None
self.regex = None
if self.mods.regex:
if self.mods.ignore_case:
self.regex = re.compile(self.expression, re.IGNORECASE)
else:
self.regex = re.compile(self.expression)
def test(self, lines, variables=None, verbose=0):
if self.mods.multi_lines:
......@@ -534,6 +548,9 @@ class TestCase(TestCaseAbstract):
expression_var = replace_variables(self.expression, variables)
if not self.regex and self.mods.ignore_case:
expression_var = expression_var.upper()
self.count = 0
for l in lines:
if self.regex:
......@@ -541,8 +558,11 @@ class TestCase(TestCaseAbstract):
self.count += len(self.regex.findall(l))
elif self.regex.search(l):
self.count += 1
elif expression_var in l:
self.count += l.count(expression_var) if self.mods.count_all else 1
else:
if self.mods.ignore_case:
l = l.upper()
if expression_var in l:
self.count += l.count(expression_var) if self.mods.count_all else 1
if self.expected_count is None:
self.status = (self.count > 0)
......@@ -566,7 +586,9 @@ class TestCase(TestCaseAbstract):
s = ''
if self.status in WARN_STATUS or verbose:
s += ' (%s/%s)' % (self.count, self.expected_count if self.expected_count is not None else '+')
s += ' (%s/%s%s)' % (self.count,
MOD_LESS_THAN if self.mods.less_than else MOD_MORE_THAN if self.mods.more_than else '',
self.expected_count if self.expected_count is not None else '+')
return s
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment