Commit 8537dc89 authored by Mathieu Giraud's avatar Mathieu Giraud

Merge branch 'feature-g/3009-gene-locations' into 'dev'

Half-time merge, closes #3009.
See !182.
parents 65a03851 b18c2caf
Pipeline #32566 passed with stages
in 49 seconds
......@@ -45,7 +45,7 @@ test_germlines:
stage: test_germlines
script:
- make -C germline get-all-data
- make -C germline/tests
- make -C germline tests
only:
- /^feature-.*g.*\/.*$/
......
......@@ -30,7 +30,12 @@ diff-from-saved:
echo
diff -r -u -x "*[.][^f][^a]" -x "germline*" -x "get*" -x "Makefile" -x "saved-*" saved-germline/ .
tests:
python split-from-imgt.py --test
make -C tests
distrib: get-all-data js
cd .. ; tar cvzf germline-`cat germline/germline_id`.tar.gz germline/germline_id germline/*/*.fa germline/IMGT_RELEASE browser/js/germline.js
.PHONY: all germline js get-all-data clean diff-from-saved
.PHONY: all germline js get-all-data clean diff-from-saved tests
......@@ -5,6 +5,8 @@
import urllib
from ncbi import *
HUGO_REQUEST = 'http://www.genenames.org/cgi-bin/download?'
HUGO_COLS = '&col=gd_hgnc_id&col=md_refseq_id&col=gd_other_ids_list&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_prev_sym&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_acc_ids&col=gd_pub_refseq_ids'
......@@ -14,7 +16,6 @@ HUGO_QUERY_HCDM = '&status=Approved&status=Entry+Withdrawn&status_opt=2&where=gd
HUGO_URL_HCDM = HUGO_REQUEST + HUGO_COLS + HUGO_QUERY_HCDM
NCBI_API = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&retmode=text'+'&id=%s'
# Common CD used to sort cells, see https://en.wikipedia.org/wiki/Cluster_of_differentiation
SORTING_CD = [ 'CD3g', 'CD3d', 'CD3e', 'CD4', 'CD8a', 'CD8b', 'CD11a', 'CD11b', 'CD14', 'CD15', 'CD16a', 'CD16b', 'CD19', 'CD20', 'CD22', 'CD24', 'CD25', 'CD30', 'CD31', 'CD34', 'CD38', 'CD45', 'CD56', 'CD61', 'CD91', 'CD114', 'CD117', 'CD182' ]
......@@ -30,16 +31,6 @@ print "==>", SORTING_OUT
sorting_out = open(SORTING_OUT, 'w')
def ncbi_and_write(ncbi, hugo, cd_id, outs):
print cd_id, hugo, ncbi
fasta = urllib.urlopen(NCBI_API % ncbi).read()
fasta_with_id = fasta.replace('>', '>%s|%s|' % (hugo, cd_id))
for out in outs:
out.write(fasta_with_id)
for l in urllib.urlopen(HUGO_URL_HCDM).readlines():
ll = l.split('\t')
......@@ -51,7 +42,9 @@ for l in urllib.urlopen(HUGO_URL_HCDM).readlines():
print "!", l
continue
ncbi_and_write(ncbi, hugo, cd_id, [out] + ([sorting_out] if cd_id in SORTING_CD else []))
ncbi_and_write(ncbi,
'%s|%s|' % (hugo, cd_id),
[out] + ([sorting_out] if cd_id in SORTING_CD else []))
#!/bin/sh
cd $(dirname $0)
wget -O - http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP | python split-from-imgt.py
wget http://www.imgt.org/download/GENE-DB/IMGTGENEDB-GeneList
wget http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP
python split-from-imgt.py IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP IMGTGENEDB-GeneList
wget -O IMGT_RELEASE http://www.imgt.org/download/GENE-DB/RELEASE
......
import urllib
import sys
import re
import os
import fasta
API_EUTILS = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
API_EUTILS += 'api_key='+os.environ['NCBI_KEY']+'&' if 'NCBI_KEY' in os.environ else ''
API_NUCCORE_ID = API_EUTILS + 'db=nuccore&rettype=fasta&retmode=text' + '&id=%s'
API_NUCCORE_ID_FROM_TO = API_EUTILS + 'db=nuccore&rettype=fasta&retmode=text' + '&id=%s' + '&from=%s&to=%s'
API_GENE_ID_XML = API_EUTILS + 'db=gene&retmode=xml&rettype=docsum' + '&id=%s'
from xml.dom import minidom, Node
# The two following functions should be refactored as one (used in split-from-imgt and get-CD)
def get_gene_sequence(gene, other_gene_name, start, end, additional_length):
'''
Return the gene sequences between positions start and end (included).
'''
if additional_length > 0:
end += additional_length
elif additional_length < 0:
start = max(1, start + additional_length)
fasta_string = urllib.urlopen(API_NUCCORE_ID_FROM_TO % (gene, start, end)).read()
return re.sub('(>\S*) ', r'\1|'+other_gene_name+'|', fasta_string)
def ncbi_and_write(ncbi, additional_header, outs):
print ncbi, additional_header
fasta = urllib.urlopen(API_NUCCORE_ID % ncbi).read()
fasta_with_id = fasta.replace('>', '>' + additional_header)
for out in outs:
out.write(fasta_with_id)
def get_updownstream_sequences(gene, other_gene_name, start, end, additional_length):
#Only returns upstream or downstream raw sequences
if additional_length == 0:
return ('', '')
reversed = -1 if (end < start) else 1
if additional_length > 0:
start = end + 1 * reversed
end = end + additional_length * reversed
elif additional_length < 0:
end = start - 1 * reversed
start = max(1, start + additional_length * reversed)
if start > end:
tmp = start
start = end
end = tmp
updown_fasta = urllib.urlopen(API_NUCCORE_ID_FROM_TO % (gene, start, end)).read()
updown_raw = '\n'.join(updown_fasta.split('\n')[1:]).strip()
if reversed == -1:
updown_raw = fasta.revcomp(updown_raw.upper())
if additional_length > 0:
return ('', updown_raw)
else:
return (updown_raw, '')
# Parse output from API_GENE_ID_XML to get genomic positions of a gene
def xml_bang_one(parent):
return {node.nodeName: node.firstChild.nodeValue for node in parent.childNodes if node.nodeType == Node.ELEMENT_NODE}
def get_last_LocationHistType(gene):
'''
>>> get_last_LocationHistType(6969)
{u'AssemblyAccVer': u'GCF_000001405.38', u'ChrAccVer': u'NC_000007.14', u'AnnotationRelease': u'109', u'ChrStop': u'38253379', u'ChrStart': u'38253428'}
'''
sys.stderr.write('%% eutils -> gene %s' % gene + '\n')
xml = minidom.parseString(urllib.urlopen(API_GENE_ID_XML % gene).read())
locations = xml.getElementsByTagName('LocationHistType')
if locations:
return xml_bang_one(locations[0])
else:
raise KeyError, gene
def get_gene_positions(gene):
'''
>>> get_gene_positions(6969)
(u'NC_000007.14', 38253428, 38253379)
>>> get_gene_positions('zoycooxz')
Traceback (most recent call last):
...
KeyError: 'zoycooxz'
'''
loc = get_last_LocationHistType(gene)
chr = loc['ChrAccVer']
start, stop = int(loc['ChrStart'])+1, int(loc['ChrStop'])+1
return chr, start, stop
......@@ -8,6 +8,8 @@ import urllib
from collections import defaultdict, OrderedDict
import re
import ncbi
IMGT_LICENSE = '''
# To use the IMGT germline databases (IMGT/GENE-DB), you have to agree to IMGT license:
# academic research only, provided that it is referred to IMGT®,
......@@ -17,9 +19,8 @@ IMGT_LICENSE = '''
# Nucl. Acids Res., 29, 207-209 (2001). PMID: 11125093
'''
print (IMGT_LICENSE)
NCBI_API = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&retmode=text'+'&id=%s&from=%s&to=%s'
def remove_allele(name):
return name.split('*')[0]
# Parse lines in IMGT/GENE-DB such as:
# >M12949|TRGV1*01|Homo sapiens|ORF|...
......@@ -74,7 +75,7 @@ def get_gene_coord(imgt_line):
>>> line = '>X15272|TRGV4*01|Homo sapiens|F|V-REGION|406..705|300 nt|1| | | | |300+0=300| |rev-compl|'
>>> get_gene_coord(line)[0] == 'X15272'
True
>>> get_gene_coord(line)[1] == {'from': 406, 'to': 705, 'imgt_data': 'TRGV4*01|Homo sapiens|F|V-REGION', 'imgt_name': 'TRGV4*01'}
>>> get_gene_coord(line)[1] == {'from': 406, 'to': 705, 'imgt_data': 'TRGV4*01|Homo sapiens|F|V-REGION', 'imgt_name': 'TRGV4*01', 'species': 'Homo sapiens'}
True
'''
elements = imgt_line.split('|')
......@@ -88,32 +89,59 @@ def get_gene_coord(imgt_line):
end = end.split(',')[0]
return elements[0][1:], {'from': int(start),
'to': int(end),
'species': elements[2],
'imgt_name': elements[1],
'imgt_data': '|'.join(elements[1:5])}
def get_gene_sequence(gene, other_gene_name, start, end):
def paste_updown_on_fasta(fasta, up, down):
'''
Return the gene sequences between positions start and end (included).
Put upstream and/or downstream data on an existing FASTA sequences
>>> paste_updown_on_fasta('>seq\\nAAAAAAAAAAAAAAAAAAA\\nTTTTTTTTTTT', 'CCCC', 'GGGG')
'>seq\\nCCCC\\nAAAAAAAAAAAAAAAAAAA\\nTTTTTTTTTTT\\nGGGG\\n'
>>> paste_updown_on_fasta('>seq\\nAAAAAAAAAAAAAAAAAAA\\nTTTTTTTTTTT\\n', '', 'GGGG')
'>seq\\nAAAAAAAAAAAAAAAAAAA\\nTTTTTTTTTTT\\nGGGG\\n'
>>> paste_updown_on_fasta('>seq\\nAAAAAAAAAAAAAAAAAAA\\nTTTTTTTTTTT', 'CCCC', '')
'>seq\\nCCCC\\nAAAAAAAAAAAAAAAAAAA\\nTTTTTTTTTTT\\n'
'''
fasta_string = urllib.urlopen(NCBI_API % (gene, start, end)).read()
return re.sub('(>\S*) ', r'\1|'+other_gene_name+'|', fasta_string)
lines = fasta.split('\n')
return lines[0]+'\n' + (up+'\n' if up else '') + '\n'.join(filter(None, lines[1:])) + '\n'\
+ (down+'\n' if down else '')
def store_data_if_updownstream(fasta_header, path, data, genes):
for gene in gene_matches(fasta_header, genes):
gene_name, gene_coord = get_gene_coord(fasta_header)
if gene_name:
data[path+'/'+gene][gene_name].append(gene_coord)
def retrieve_genes(f, genes, tag, additional_length):
def retrieve_genes(f, genes, tag, additional_length, gene_list):
for gene in genes:
for coord in genes[gene]:
start = coord['from']
end = coord['to']
if additional_length > 0:
end += additional_length
elif additional_length < 0:
start = max(1, start + additional_length)
gene_data = get_gene_sequence(gene, coord['imgt_data'] + tag, start, end)
# try to extract from genome
gene_id = gene_list.get_gene_id_from_imgt_name(coord['species'], coord['imgt_name'])
allele_additional_length = 0
if gene_id:
try:
(target, start, end) = ncbi.get_gene_positions(gene_id)
print(coord, gene_id, target, start, end)
except KeyError:
print('! No positions for %s (%s: %s)' % (gene_id, gene, str(genes[gene])))
allele_additional_length = additional_length
gene_id = None
# extract from gene
gene_data = ncbi.get_gene_sequence(gene, coord['imgt_data'] + tag, coord['from'], coord['to'], allele_additional_length)
if gene_id:
up_down = ncbi.get_updownstream_sequences(target, coord['imgt_data'] + tag, start, end, additional_length)
# We put the up and downstream data before and after the sequence we retrieved previously
gene_data = paste_updown_on_fasta(gene_data, up_down[0], up_down[1])
# post-process gene_data
if coord['imgt_data'].split('|')[-1] == FEATURE_J_REGION:
gene_lines = gene_data.split('\n')
gene_lines[1] = gap_j(gene_lines[1].lower())
......@@ -161,7 +189,7 @@ def gap_j(seq):
pos = CUSTOM_118[custom_seq]
if pos is None:
m = j118.search(seq)
m = j118.search(seq, re.IGNORECASE)
if not m:
if len(seq) > PHE_TRP_WARN_SIZE:
......@@ -220,75 +248,122 @@ class OrderedDefaultListDict(OrderedDict):
self[key] = value = []
return value
downstream_data = defaultdict(lambda: OrderedDefaultListDict())
upstream_data = defaultdict(lambda: OrderedDefaultListDict())
for l in sys.stdin:
# New sequence: compute 'current_files' and stores up/downstream_data[]
class IMGTGENEDBGeneList():
'''
Parse lines such as
'Homo sapiens;TRGJ2;F;Homo sapiens T cell receptor gamma joining 2;1;7;7p14;M12961;6969;'
>>> gl = IMGTGENEDBGeneList('IMGTGENEDB-GeneList')
>>> gl.get_gene_id_from_imgt_name('Homo sapiens', 'TRGJ2*01')
'6969'
'''
def __init__(self, f):
self.data = defaultdict(str)
for l in open(f):
ll = l.split(';')
species, name, gene_id = ll[0], ll[1], ll[-2]
self.data[species, name] = gene_id
if ">" in l:
current_files = []
current_special = None
def get_gene_id_from_imgt_name(self, species, name):
return self.data[species, remove_allele(name)]
species = l.split('|')[2].strip()
feature = l.split('|')[4].strip()
if species in SPECIES and feature in FEATURES:
seq = l.split('|')[1]
path = SPECIES[species]
if feature in FEATURES_VDJ:
system = seq[:4]
else:
system = seq[:seq.find("*")]
if not system in CLASSES:
print "! Unknown class: ", system
system = system.replace("IGH", "IGHC=")
def split_IMGTGENEDBReferenceSequences(f, gene_list):
keys = [path + system]
downstream_data = defaultdict(lambda: OrderedDefaultListDict())
upstream_data = defaultdict(lambda: OrderedDefaultListDict())
check_directory_exists(path)
for l in open(ReferenceSequences):
if system.startswith('IG') or system.startswith('TR'):
# New sequence: compute 'current_files' and stores up/downstream_data[]
if ">" in l:
current_files = []
current_special = None
species = l.split('|')[2].strip()
feature = l.split('|')[4].strip()
if species in SPECIES and feature in FEATURES:
seq = l.split('|')[1]
path = SPECIES[species]
if feature in FEATURES_VDJ:
store_data_if_updownstream(l, path, downstream_data, DOWNSTREAM_REGIONS)
store_data_if_updownstream(l, path, upstream_data, UPSTREAM_REGIONS)
system = seq[:4]
else:
system = seq[:seq.find("*")]
if not system in CLASSES:
print "! Unknown class: ", system
system = system.replace("IGH", "IGHC=")
keys = [path + system]
check_directory_exists(path)
if system.startswith('IG') or system.startswith('TR'):
if feature in FEATURES_VDJ:
store_data_if_updownstream(l, path, downstream_data, DOWNSTREAM_REGIONS)
store_data_if_updownstream(l, path, upstream_data, UPSTREAM_REGIONS)
systems = get_split_files(seq, SPLIT_SEQUENCES)
if systems:
keys = [path + s for s in systems]
for key in keys:
current_files.append(open_files[key])
systems = get_split_files(seq, SPLIT_SEQUENCES)
if systems:
keys = [path + s for s in systems]
for key in keys:
current_files.append(open_files[key])
if seq in SPECIAL_SEQUENCES:
name = '%s.fa' % seq.replace('*', '-')
current_special = verbose_open_w(name)
if seq in SPECIAL_SEQUENCES:
name = '%s.fa' % seq.replace('*', '-')
current_special = verbose_open_w(name)
# Possibly gap J_REGION
# Possibly gap J_REGION
if '>' not in l and current_files and feature == FEATURE_J_REGION:
l = gap_j(l)
if '>' not in l and current_files and feature == FEATURE_J_REGION:
l = gap_j(l)
# Dump 'l' to the concerned files
# Dump 'l' to the concerned files
for current_file in current_files:
current_file.write(l)
for current_file in current_files:
current_file.write(l)
if current_special:
current_special.write(l)
if current_special:
current_special.write(l)
# End, loop to next 'l'
# End, loop to next 'l'
# Dump up/downstream data
# Dump up/downstream data
for system in upstream_data:
f = verbose_open_w(system + TAG_UPSTREAM + '.fa')
retrieve_genes(f, upstream_data[system], TAG_UPSTREAM, -LENGTH_UPSTREAM, gene_list)
for system in upstream_data:
f = verbose_open_w(system + TAG_UPSTREAM + '.fa')
retrieve_genes(f, upstream_data[system], TAG_UPSTREAM, -LENGTH_UPSTREAM)
for system in downstream_data:
f = verbose_open_w(system + TAG_DOWNSTREAM + '.fa')
retrieve_genes(f, downstream_data[system], TAG_DOWNSTREAM, LENGTH_DOWNSTREAM, gene_list)
for system in downstream_data:
f = verbose_open_w(system + TAG_DOWNSTREAM + '.fa')
retrieve_genes(f, downstream_data[system], TAG_DOWNSTREAM, LENGTH_DOWNSTREAM)
if __name__ == '__main__':
if sys.argv[1] == '--test':
import doctest
doctest.testmod()
else:
print (IMGT_LICENSE)
ReferenceSequences = sys.argv[1]
GeneList = sys.argv[2]
gl = IMGTGENEDBGeneList(GeneList)
split_IMGTGENEDBReferenceSequences(ReferenceSequences, gl)
!NO_LAUNCHER:
!LAUNCH: (cd $VIDJIL_DIR/germline ; grep -A2 -F 'IGHD2-2*02' homo-sapiens/IGHD+up.fa | tr -d '\n')
$ Correct sequence, with upstream
1:AGGATTTTGTGGGGGCTCGTGTCACTGTGA
!NO_LAUNCHER:
!LAUNCH: (cd $VIDJIL_DIR/germline ; cat homo-sapiens/TRDD2+up.fa | tr '|' '#' )
!LAUNCH: (cd $VIDJIL_DIR/germline ; cat homo-sapiens/TRDD2+up.fa | tr '|' '#' | tr -d '\n')
$ Correct full header, with TRDD2*01 identifier between pipes
1: .*#TRDD2.01#.*Human T-cell receptor germline delta-chain D-region DNA
f1: .*#TRDD2.01#.*Human T-cell receptor germline delta-chain D-region DNA
$ Correct sequence, with upstream
1: AAGAGGGTTTTTATACTGATGTGTTTCATTGTGCCTTCCTAC
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment