Commit 8cd9a0a6 authored by Mikaël Salson's avatar Mikaël Salson
Browse files

Merge branch 'feature-g/4655-4656-updownstreams' into 'dev'

Tuning germline up/downstreams

Closes #4647 and #4656

See merge request !891
parents 418358bf 03b0b9d5
Pipeline #231253 passed with stages
in 8 minutes and 50 seconds
......@@ -31,7 +31,7 @@ diff-from-saved:
diff -r -u -x "*[.][^f][^a]" -x "germline*" -x "get*" -x "Makefile" -x "saved-*" saved-germline/ .
tests:
python split-from-imgt.py --test
python split-germlines.py --test
make -C tests
......
......@@ -62,6 +62,9 @@ def parse_as_Fasta(fasta):
for (header, sequence) in parse(fasta):
yield Fasta(header, sequence)
def verbose_open(name):
print (" <== %s" % name)
return open(name)
class Fasta():
......
......@@ -5,13 +5,14 @@ wget -N http://www.imgt.org/download/GENE-DB/IMGTGENEDB-GeneList
wget -N http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP
wget -N http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithoutGaps-F+ORF+allP
errors=$(tempfile)
python split-from-imgt.py IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP IMGTGENEDB-ReferenceSequences.fasta-nt-WithoutGaps-F+ORF+allP IMGTGENEDB-GeneList 2> $errors
LOG=split-germlines.log
ERR=split-germlines.err
(python split-germlines.py IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP IMGTGENEDB-ReferenceSequences.fasta-nt-WithoutGaps-F+ORF+allP IMGTGENEDB-GeneList | tee $LOG) 2> $ERR
wget -O IMGT_RELEASE http://www.imgt.org/download/GENE-DB/RELEASE
wget -N -P homo-sapiens http://vidjil.org/germline/IGK-INTRON.fa
wget -N -P homo-sapiens http://vidjil.org/germline/IGK-KDE.fa
cat $errors
rm -f $errors
cat $ERR
#!env python3
import glob
import fasta
import re
from collections import defaultdict
GERMLINES = glob.glob('homo-sapiens/IG*+*.fa') + glob.glob('homo-sapiens/TR*+*.fa')
POS = re.compile('[(]?(\d+)[.][.][(]?\d+[)]?[.][.](\d+)[)]?')
positions = defaultdict(lambda: defaultdict(list))
def tag(gene, ref, first, last):
'''
Remembers which positions on the genome are covered by a germline gene
'''
for i in range(min(first, last), max(first, last) + 1):
if gene not in positions[ref][i]:
positions[ref][i] += [gene]
def locus_map(genes):
for g in genes:
for header, seq in fasta.parse(fasta.verbose_open(g)):
try:
name = header.split('|')[1]
ref, pos = (header.split('#')[1]).split('|')
# print(ref, pos, name)
m = POS.match(pos)
assert(m)
first = int(m.group(1))
last = int(m.group(2))
gene_without_allele = name.split('*')[0] if '*' in name else name
tag(gene_without_allele, ref, first, last)
except:
print('!', header)
def show_overlaps():
print('## Check for overlaps')
for ref, pos in positions.items():
print('==', ref)
overlaps = defaultdict(list)
for (i, genes) in pos.items():
if len(genes) > 1:
overlaps[' '.join(genes)] += [i]
for (genes, ii) in overlaps.items():
print('!! Overlap %s %d..%d between %s' % (ref, min(ii), max(ii), genes))
locus_map(GERMLINES)
show_overlaps()
\ No newline at end of file
......@@ -19,7 +19,7 @@ from xml.dom import minidom, Node
# The two following functions should be refactored as one (used in split-from-imgt and get-CD)
# The two following functions should be refactored as one (used in split-germlines and get-CD)
def get_gene_sequence(gene, other_gene_name, start, end, additional_length):
'''
......@@ -63,27 +63,34 @@ def get_updownstream_sequences(gene, start, end, additional_length):
recover. When additional_length > 0 we get the downstream
region, and conversely when additional_length < 0.
:return: A tuple whose first element is the upstream region (or empty)
and where the second element is the downstream region (or empty
and where the second element is the downstream region (or empty),
then a string carrying information
'''
info = '(%d..%d)' % (start, end)
if additional_length == 0:
return ('', '')
return (('', ''), info)
reversed = -1 if (end < start) else 1
if additional_length > 0:
start = end + 1 * reversed
end = end + additional_length * reversed
info = info + '..%d' % end
elif additional_length < 0:
end = start - 1 * reversed
start = max(1, start + additional_length * reversed)
info = '%d..' % start + info
updown_fasta = get_gene_sequence(gene, '', start, end, 0)
updown_raw = '\n'.join(updown_fasta.split('\n')[1:]).strip()
gene_info = "%s|%s" % (gene, info)
if additional_length > 0:
return ('', updown_raw)
return (('', updown_raw), gene_info)
else:
return (updown_raw, '')
return ((updown_raw, ''), gene_info)
# Parse output from API_GENE_ID_XML to get genomic positions of a gene
......
......@@ -96,18 +96,18 @@ def get_gene_coord(imgt_line):
'imgt_data': '|'.join(elements[1:5]),
'seq': ''}
def paste_updown_on_fasta(fasta, up, down):
def paste_updown_on_fasta(fasta, up, down, info=''):
'''
Put upstream and/or downstream data on an existing FASTA sequences
>>> paste_updown_on_fasta('>seq\\nAAAAAAAAAAAAAAAAAAA\\nTTTTTTTTTTT', 'CCCC', 'GGGG')
'>seq\\nCCCC\\nAAAAAAAAAAAAAAAAAAA\\nTTTTTTTTTTT\\nGGGG\\n'
>>> paste_updown_on_fasta('>seq\\nAAAAAAAAAAAAAAAAAAA\\nTTTTTTTTTTT', 'CCCC', 'GGGG', 'foo')
'>seqfoo\\nCCCC\\nAAAAAAAAAAAAAAAAAAA\\nTTTTTTTTTTT\\nGGGG\\n'
>>> paste_updown_on_fasta('>seq\\nAAAAAAAAAAAAAAAAAAA\\nTTTTTTTTTTT\\n', '', 'GGGG')
'>seq\\nAAAAAAAAAAAAAAAAAAA\\nTTTTTTTTTTT\\nGGGG\\n'
>>> paste_updown_on_fasta('>seq\\nAAAAAAAAAAAAAAAAAAA\\nTTTTTTTTTTT', 'CCCC', '')
'>seq\\nCCCC\\nAAAAAAAAAAAAAAAAAAA\\nTTTTTTTTTTT\\n'
'''
lines = fasta.split('\n')
return lines[0]+'\n' + (up+'\n' if up else '') + '\n'.join(filter(None, lines[1:])) + '\n'\
return lines[0]+info+'\n' + (up+'\n' if up else '') + '\n'.join(filter(None, lines[1:])) + '\n'\
+ (down+'\n' if down else '')
def check_imgt_ncbi_consistency(imgt_info, imgt_data, ncbi_target, ncbi_start, ncbi_end):
......@@ -188,7 +188,7 @@ def retrieve_genes(f_name, genes, tag, additional_length, gene_list):
except KeyError:
print('! No positions for %s (%s: %s)' % (gene_id, gene, str(coord)))
min_updownstream = compute_updownstream_length(genes, additional_length)
min_updownstream = compute_updownstream_length(genes, additional_length) if additional_length else 0
print(' %s, ' % f_name + 'genes: %d, ' % len(genes) + 'up/downstream: %dbp' % min_updownstream)
# gene: is the name of the sequence where the VDJ gene was identified according to IMGT. The gene is just a part of the sequence
......@@ -198,10 +198,17 @@ def retrieve_genes(f_name, genes, tag, additional_length, gene_list):
for info in genes:
(gene, coord) = info
if coord['imgt_name'] in CUSTOM_UPDOWNSTREAM:
ud = CUSTOM_UPDOWNSTREAM[coord['imgt_name']]
print('# Custom up/downstream for %s: %d instead of %d' % (coord['imgt_name'], ud, min_updownstream))
updownstream = ud
else:
updownstream = min_updownstream
gene_id = coord['gene_id'] if 'gene_id' in coord else None
if GENES_SEQ_FROM_NCBI:
gene_data = ncbi.get_gene_sequence(gene, coord['imgt_data'] + tag, coord['from'], coord['to'], min_updownstream)
gene_data = ncbi.get_gene_sequence(gene, coord['imgt_data'] + tag, coord['from'], coord['to'], updownstream)
else:
# IMGT
gene_data = coord['seq']
......@@ -211,11 +218,10 @@ def retrieve_genes(f_name, genes, tag, additional_length, gene_list):
if coord['imgt_name'].endswith('*01'):
check_imgt_ncbi_consistency(coord, gene_data, coord['target'], coord['target_start'],
coord['target_end'])
up_down = ncbi.get_updownstream_sequences(coord['target'], coord['target_start'],
coord['target_end'], min_updownstream)
up_down, up_down_info = ncbi.get_updownstream_sequences(coord['target'], coord['target_start'],
coord['target_end'], updownstream)
# We put the up and downstream data before and after the sequence we retrieved previously
gene_data = paste_updown_on_fasta(gene_data, up_down[0], up_down[1])
gene_data = paste_updown_on_fasta(gene_data, up_down[0], up_down[1], '#' + up_down_info)
# post-process gene_data
if coord['imgt_data'].split('|')[-1] == FEATURE_J_REGION:
......@@ -235,6 +241,15 @@ MAX_GAP_J = 36 # maximal position of Phe/Trp (36 for TRAJ52*01)
PHE_TRP_WARN_SIZE = 15 # small sequences are on a second line
PHE_TRP_WARN_MSG = 'No Phe/Trp-Gly-X-Gly pattern'
# Upstream: negative length
# Downstream: positive length
CUSTOM_UPDOWNSTREAM = {
# see #4647
'IGHJ1P*01': 10,
'IGHJ1P*02': 10,
'IGHD7-27*01': -50,
}
CUSTOM_118 = { '': 0 # custom position of 118 in sequences without the Trp-Gly-X-Gly pattern
# 118
# |..
......
......@@ -2,5 +2,9 @@
# The awk part prints the IGHD2-2*02 sequence
!LAUNCH: (cd $VIDJIL_DIR/germline ; awk '$0 ~ /IGHD2-2.02/ {print; getline; while ($0 !~ /^>/) {print; getline}}' homo-sapiens/IGHD+up.fa | tr -d '\n')
$ Correct upstream information
1:NC_000014.9
1:105917009\.\.\(105916856\.\.105916826\)
$ Correct sequence, with upstream
i1:AGGATTTTGTGGGGGCTCGTGTCACTGTGA
$ Checking consistency of the germlines and special cases
!OUTPUT_FILE: ../../split-germlines.log
!NO_LAUNCHER:
!LAUNCH: echo
$ No position
33: No position
$ No Phe/Trp-Gly-X-Gly pattern
32: No Phe.Trp.Gly.X.Gly pattern
$ 118 positions
12: Custom 118 position
$ Up/down-streams
3:# Custom up/downstream
2:# Custom up/downstream .* IGHJ1P
1:# Custom up/downstream .* IGHD7-27
!OUTPUT_FILE: ../../split-germlines.err
!NO_LAUNCHER:
!LAUNCH: echo
$ Consistency IMGT/NCBI
13: Sequences for .* differ between IMGT and NCBI
11: Length for .* differ between IMGT .* and NCBI
\ No newline at end of file
!NO_LAUNCHER:
!LAUNCH: (cd $VIDJIL_DIR/germline ; python3 map-germlines.py)
$ No overlap
0:Overlap .* between
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment