Commit 8537dc89 authored by Mathieu Giraud

Merge branch 'feature-g/3009-gene-locations' into 'dev'

Half-time merge, closes #3009.
See !182.
parents 65a03851 b18c2caf
Pipeline #32566 passed with stages
in 49 seconds
......@@ -45,7 +45,7 @@ test_germlines:
stage: test_germlines
script:
- make -C germline get-all-data
- make -C germline/tests
- make -C germline tests
only:
- /^feature-.*g.*\/.*$/
......
......@@ -30,7 +30,12 @@ diff-from-saved:
echo
diff -r -u -x "*[.][^f][^a]" -x "germline*" -x "get*" -x "Makefile" -x "saved-*" saved-germline/ .
tests:
python split-from-imgt.py --test
make -C tests
distrib: get-all-data js
cd .. ; tar cvzf germline-`cat germline/germline_id`.tar.gz germline/germline_id germline/*/*.fa germline/IMGT_RELEASE browser/js/germline.js
.PHONY: all germline js get-all-data clean diff-from-saved
.PHONY: all germline js get-all-data clean diff-from-saved tests
......@@ -5,6 +5,8 @@
import urllib
from ncbi import *
HUGO_REQUEST = 'http://www.genenames.org/cgi-bin/download?'
HUGO_COLS = '&col=gd_hgnc_id&col=md_refseq_id&col=gd_other_ids_list&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_prev_sym&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_acc_ids&col=gd_pub_refseq_ids'
......@@ -14,7 +16,6 @@ HUGO_QUERY_HCDM = '&status=Approved&status=Entry+Withdrawn&status_opt=2&where=gd
HUGO_URL_HCDM = HUGO_REQUEST + HUGO_COLS + HUGO_QUERY_HCDM
NCBI_API = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&retmode=text'+'&id=%s'
# Common CD used to sort cells, see https://en.wikipedia.org/wiki/Cluster_of_differentiation
SORTING_CD = [ 'CD3g', 'CD3d', 'CD3e', 'CD4', 'CD8a', 'CD8b', 'CD11a', 'CD11b', 'CD14', 'CD15', 'CD16a', 'CD16b', 'CD19', 'CD20', 'CD22', 'CD24', 'CD25', 'CD30', 'CD31', 'CD34', 'CD38', 'CD45', 'CD56', 'CD61', 'CD91', 'CD114', 'CD117', 'CD182' ]
......@@ -30,16 +31,6 @@ print "==>", SORTING_OUT
sorting_out = open(SORTING_OUT, 'w')
def ncbi_and_write(ncbi, hugo, cd_id, outs):
    '''
    Download the FASTA text for `ncbi` through NCBI_API and write it to
    every stream of `outs`.

    Each '>' of the downloaded text is followed, in the written text,
    by 'hugo|cd_id|'.
    '''
    # Single pre-formatted string: prints the same line under Python 2 and 3
    print('%s %s %s' % (cd_id, hugo, ncbi))

    # Named `record` so that the `fasta` module is not shadowed locally
    record = urllib.urlopen(NCBI_API % ncbi).read()
    record_with_id = record.replace('>', '>%s|%s|' % (hugo, cd_id))

    for out in outs:
        out.write(record_with_id)
for l in urllib.urlopen(HUGO_URL_HCDM).readlines():
ll = l.split('\t')
......@@ -51,7 +42,9 @@ for l in urllib.urlopen(HUGO_URL_HCDM).readlines():
print "!", l
continue
ncbi_and_write(ncbi, hugo, cd_id, [out] + ([sorting_out] if cd_id in SORTING_CD else []))
ncbi_and_write(ncbi,
'%s|%s|' % (hugo, cd_id),
[out] + ([sorting_out] if cd_id in SORTING_CD else []))
#!/bin/sh
cd $(dirname $0)
wget -O - http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP | python split-from-imgt.py
wget http://www.imgt.org/download/GENE-DB/IMGTGENEDB-GeneList
wget http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP
python split-from-imgt.py IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP IMGTGENEDB-GeneList
wget -O IMGT_RELEASE http://www.imgt.org/download/GENE-DB/RELEASE
......
import urllib
import sys
import re
import os
import fasta
# Base URL of the NCBI efetch service; every query below is appended to it
API_EUTILS = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'

# When the NCBI_KEY environment variable is set, send it as the api_key parameter
if 'NCBI_KEY' in os.environ:
    API_EUTILS = API_EUTILS + 'api_key=' + os.environ['NCBI_KEY'] + '&'

# nuccore, FASTA text: whole record / record restricted to a from-to range
API_NUCCORE_ID = ''.join([API_EUTILS, 'db=nuccore&rettype=fasta&retmode=text', '&id=%s'])
API_NUCCORE_ID_FROM_TO = ''.join([API_EUTILS, 'db=nuccore&rettype=fasta&retmode=text', '&id=%s', '&from=%s&to=%s'])

# gene database, XML document summary
API_GENE_ID_XML = ''.join([API_EUTILS, 'db=gene&retmode=xml&rettype=docsum', '&id=%s'])
from xml.dom import minidom, Node
# The two following functions should be refactored as one (used in split-from-imgt and get-CD)
def get_gene_sequence(gene, other_gene_name, start, end, additional_length):
    '''
    Return the FASTA text of `gene` between positions start and end (included),
    as answered by the nuccore service, possibly widened on one side.

    additional_length > 0: `end` is increased by additional_length.
    additional_length < 0: `start` is decreased by |additional_length|,
                           but never goes below position 1.
    additional_length == 0: the range is requested as given.

    start and end are sent in the order given; they are not swapped.

    In the returned text, every '>' followed by non-blank characters and a
    space has that space replaced by '|other_gene_name|'.
    '''
    if additional_length > 0:
        end += additional_length
    elif additional_length < 0:
        start = max(1, start + additional_length)

    fasta_string = urllib.urlopen(API_NUCCORE_ID_FROM_TO % (gene, start, end)).read()

    # A callable replacement inserts other_gene_name literally: in a template
    # string, backslashes found in the name would be read as escapes.
    def tag_header(match):
        return match.group(1) + '|' + other_gene_name + '|'

    return re.sub(r'(>\S*) ', tag_header, fasta_string)
def ncbi_and_write(ncbi, additional_header, outs):
    '''
    Download the FASTA text for `ncbi` from nuccore and write it to every
    stream of `outs`.

    `additional_header` is inserted right after each '>' of the downloaded text.
    '''
    # Single pre-formatted string: prints the same line under Python 2 and 3
    print('%s %s' % (ncbi, additional_header))

    # Named `record` so that the imported `fasta` module is not shadowed locally
    record = urllib.urlopen(API_NUCCORE_ID % ncbi).read()
    record_with_id = record.replace('>', '>' + additional_header)

    for out in outs:
        out.write(record_with_id)
def get_updownstream_sequences(gene, other_gene_name, start, end, additional_length):
    '''
    Return the raw sequence flanking a gene, as a pair (upstream, downstream).

    Only one side is fetched per call, the other element is '':
      additional_length < 0: |additional_length| positions before `start`, returned as upstream
      additional_length > 0: additional_length positions after `end`, returned as downstream
      additional_length == 0: ('', '') is returned without any request

    When end < start, "before" and "after" are taken in the decreasing
    direction and the fetched text is upper-cased and passed to fasta.revcomp.

    The requested window never starts below position 1. When the whole window
    lies before position 1 (for example, upstream of a gene starting at
    position 1), nothing is requested and ('', '') is returned.

    The FASTA header line of the answer is dropped. `other_gene_name` is
    accepted but not used.
    '''
    if additional_length == 0:
        return ('', '')

    # +1 when positions increase from start to end, -1 otherwise
    strand = -1 if end < start else 1

    if additional_length > 0:
        # Window following `end`, in the direction of the gene
        first = end + strand
        last = end + additional_length * strand
    else:
        # Window preceding `start`, in the direction of the gene
        last = start - strand
        first = start + additional_length * strand

    # The service is queried with increasing positions, starting at 1 at least
    low, high = min(first, last), max(first, last)
    low = max(1, low)
    if high < low:
        # Nothing remains of the window once clipped to position 1
        return ('', '')

    updown_fasta = urllib.urlopen(API_NUCCORE_ID_FROM_TO % (gene, low, high)).read()

    # Drop the header line, keep the sequence lines
    updown_raw = '\n'.join(updown_fasta.split('\n')[1:]).strip()

    if strand == -1:
        updown_raw = fasta.revcomp(updown_raw.upper())

    if additional_length > 0:
        return ('', updown_raw)
    return (updown_raw, '')
# Parse output from API_GENE_ID_XML to get genomic positions of a gene
def xml_bang_one(parent):
    '''
    Flatten the element children of the DOM node `parent` into a dict
    {node name: value of the first child of that element}.

    Children that are not elements (such as the text between tags) are skipped.
    An element without any child is mapped to None.
    When several elements share a name, the last one wins.
    '''
    values = {}
    for node in parent.childNodes:
        if node.nodeType != Node.ELEMENT_NODE:
            continue
        first = node.firstChild
        # Empty elements have no firstChild: do not fail on them
        values[node.nodeName] = first.nodeValue if first is not None else None
    return values
def get_last_LocationHistType(gene):
    '''
    Query the NCBI gene database for `gene` and return, as a dict, the first
    'LocationHistType' element found in the XML answer.

    NOTE(review): the first element is presumably the most recent location, confirm.

    Raises KeyError(gene) when the answer has no 'LocationHistType' element.

    >>> get_last_LocationHistType(6969)
    {u'AssemblyAccVer': u'GCF_000001405.38', u'ChrAccVer': u'NC_000007.14', u'AnnotationRelease': u'109', u'ChrStop': u'38253379', u'ChrStart': u'38253428'}
    '''
    sys.stderr.write('%% eutils -> gene %s' % gene + '\n')

    answer = minidom.parseString(urllib.urlopen(API_GENE_ID_XML % gene).read())
    locations = answer.getElementsByTagName('LocationHistType')

    if not locations:
        # Call form of raise: valid under both Python 2 and Python 3
        raise KeyError(gene)

    return xml_bang_one(locations[0])
def get_gene_positions(gene):
    '''
    Return (accession, start, stop) for `gene`, taken from the dict given by
    get_last_LocationHistType(): 'ChrAccVer' as is, 'ChrStart' and 'ChrStop'
    converted to int and increased by one.

    NOTE(review): the +1 presumably turns 0-based positions into 1-based ones, confirm.

    >>> get_gene_positions(6969)
    (u'NC_000007.14', 38253428, 38253379)
    >>> get_gene_positions('zoycooxz')
    Traceback (most recent call last):
    ...
    KeyError: 'zoycooxz'
    '''
    location = get_last_LocationHistType(gene)

    accession = location['ChrAccVer']
    first = int(location['ChrStart']) + 1
    last = int(location['ChrStop']) + 1

    return accession, first, last
This diff is collapsed.
!NO_LAUNCHER:
!LAUNCH: (cd $VIDJIL_DIR/germline ; grep -A2 -F 'IGHD2-2*02' homo-sapiens/IGHD+up.fa | tr -d '\n')
$ Correct sequence, with upstream
1:AGGATTTTGTGGGGGCTCGTGTCACTGTGA
!NO_LAUNCHER:
!LAUNCH: (cd $VIDJIL_DIR/germline ; cat homo-sapiens/TRDD2+up.fa | tr '|' '#' )
!LAUNCH: (cd $VIDJIL_DIR/germline ; cat homo-sapiens/TRDD2+up.fa | tr '|' '#' | tr -d '\n')
$ Correct full header, with TRDD2*01 identifier between pipes
1: .*#TRDD2.01#.*Human T-cell receptor germline delta-chain D-region DNA
f1: .*#TRDD2.01#.*Human T-cell receptor germline delta-chain D-region DNA
$ Correct sequence, with upstream
1: AAGAGGGTTTTTATACTGATGTGTTTCATTGTGCCTTCCTAC
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment