Commit ab308abe authored by Mathieu Giraud's avatar Mathieu Giraud

ncbi.py: extract and refactor from split-from-imgt.py and get-CD.py

parent 2771e711
......@@ -5,6 +5,8 @@
import urllib
from ncbi import *
# Base URL of the genenames.org download script; query parameters are appended to it
HUGO_REQUEST = 'http://www.genenames.org/cgi-bin/download?'
# Columns requested from the download script, given as repeated '&col=' parameters
HUGO_COLS = '&col=gd_hgnc_id&col=md_refseq_id&col=gd_other_ids_list&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_prev_sym&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_acc_ids&col=gd_pub_refseq_ids'
......@@ -14,7 +16,6 @@ HUGO_QUERY_HCDM = '&status=Approved&status=Entry+Withdrawn&status_opt=2&where=gd
# Full download URL: request base, then the requested columns, then the HCDM query
HUGO_URL_HCDM = HUGO_REQUEST + HUGO_COLS + HUGO_QUERY_HCDM
# efetch URL template (nuccore database, FASTA returned as text); '%s' takes the record id
NCBI_API = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&retmode=text'+'&id=%s'
# Common CD used to sort cells, see https://en.wikipedia.org/wiki/Cluster_of_differentiation
SORTING_CD = [ 'CD3g', 'CD3d', 'CD3e', 'CD4', 'CD8a', 'CD8b', 'CD11a', 'CD11b', 'CD14', 'CD15', 'CD16a', 'CD16b', 'CD19', 'CD20', 'CD22', 'CD24', 'CD25', 'CD30', 'CD31', 'CD34', 'CD38', 'CD45', 'CD56', 'CD61', 'CD91', 'CD114', 'CD117', 'CD182' ]
......@@ -30,16 +31,6 @@ print "==>", SORTING_OUT
sorting_out = open(SORTING_OUT, 'w')
def ncbi_and_write(ncbi, hugo, cd_id, outs):
    '''
    Fetch the FASTA text of a nuccore record and write it to several outputs.

    ncbi: identifier substituted into NCBI_API.
    hugo, cd_id: inserted as 'hugo|cd_id|' right after the '>' of each
                 FASTA header line.
    outs: iterable of objects with a write() method; each one receives
          the same text.
    '''
    # Single formatted string: prints the same line under Python 2 and 3
    print("%s %s %s" % (cd_id, hugo, ncbi))

    response = urllib.urlopen(NCBI_API % ncbi)
    try:
        fasta = response.read()
    finally:
        # Release the connection even when the read fails
        response.close()

    # Only a '>' starting a line marks a FASTA header: a '>' found inside
    # a description must be left untouched.
    prefix = '>%s|%s|' % (hugo, cd_id)
    fasta_with_id = ''.join(
        prefix + line[1:] if line.startswith('>') else line
        for line in fasta.splitlines(True))

    for out in outs:
        out.write(fasta_with_id)
for l in urllib.urlopen(HUGO_URL_HCDM).readlines():
ll = l.split('\t')
......@@ -51,7 +42,9 @@ for l in urllib.urlopen(HUGO_URL_HCDM).readlines():
print "!", l
continue
ncbi_and_write(ncbi, hugo, cd_id, [out] + ([sorting_out] if cd_id in SORTING_CD else []))
ncbi_and_write(ncbi,
'%s|%s|' % (hugo, cd_id),
[out] + ([sorting_out] if cd_id in SORTING_CD else []))
import urllib
import sys
import re
# Base URL of the NCBI E-utilities 'efetch' script
API_EUTILS = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'

# Query template for a nuccore record returned as FASTA text; '%s' is the record id
API_NUCCORE_ID = API_EUTILS + 'db=nuccore&rettype=fasta&retmode=text&id=%s'

# Same query with two extra placeholders for the 'from' and 'to' parameters
API_NUCCORE_ID_FROM_TO = API_NUCCORE_ID + '&from=%s&to=%s'
def get_gene_sequence(gene, other_gene_name, start, end):
    '''
    Return the gene sequences between positions start and end (included).

    The FASTA text is fetched with API_NUCCORE_ID_FROM_TO filled with
    (gene, start, end). On each header line, the space following the
    first word is replaced by '|other_gene_name|'.
    '''
    response = urllib.urlopen(API_NUCCORE_ID_FROM_TO % (gene, start, end))
    try:
        fasta_string = response.read()
    finally:
        # Release the connection even when the read fails
        response.close()

    def add_other_name(match):
        # Built by concatenation so that a backslash in other_gene_name is
        # kept literally instead of being read as a re.sub template escape.
        return match.group(1) + '|' + other_gene_name + '|'

    # (?m)^ restricts the rewrite to header lines (a '>' at a line start)
    return re.sub(r'(?m)^(>\S*) ', add_other_name, fasta_string)
def ncbi_and_write(ncbi, additional_header, outs):
    '''
    Fetch the FASTA text of a nuccore record and write it to several outputs.

    ncbi: identifier substituted into API_NUCCORE_ID.
    additional_header: text inserted right after the '>' of each FASTA
                       header line.
    outs: iterable of objects with a write() method; each one receives
          the same text.
    '''
    # Single formatted string: prints the same line under Python 2 and 3
    print("%s %s" % (ncbi, additional_header))

    response = urllib.urlopen(API_NUCCORE_ID % ncbi)
    try:
        fasta = response.read()
    finally:
        # Release the connection even when the read fails
        response.close()

    # Only a '>' starting a line marks a FASTA header: a '>' found inside
    # a description must be left untouched.
    fasta_with_id = ''.join(
        '>' + additional_header + line[1:] if line.startswith('>') else line
        for line in fasta.splitlines(True))

    for out in outs:
        out.write(fasta_with_id)
......@@ -8,6 +8,8 @@ import urllib
from collections import defaultdict, OrderedDict
import re
from ncbi import *
IMGT_LICENSE = '''
# To use the IMGT germline databases (IMGT/GENE-DB), you have to agree to IMGT license:
# academic research only, provided that it is referred to IMGT®,
......@@ -19,8 +21,6 @@ IMGT_LICENSE = '''
print (IMGT_LICENSE)
# efetch URL template (nuccore database, FASTA returned as text), filled with (id, from, to)
NCBI_API = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&retmode=text'+'&id=%s&from=%s&to=%s'
# Parse lines in IMGT/GENE-DB such as:
# >M12949|TRGV1*01|Homo sapiens|ORF|...
......@@ -91,12 +91,6 @@ def get_gene_coord(imgt_line):
'imgt_name': elements[1],
'imgt_data': '|'.join(elements[1:5])}
def get_gene_sequence(gene, other_gene_name, start, end):
    '''
    Return the gene sequences between positions start and end (included).

    The FASTA text is fetched with NCBI_API filled with (gene, start, end).
    On each header line, the space following the first word is replaced
    by '|other_gene_name|'.
    '''
    response = urllib.urlopen(NCBI_API % (gene, start, end))
    try:
        fasta_string = response.read()
    finally:
        # Release the connection even when the read fails
        response.close()

    def add_other_name(match):
        # Built by concatenation so that a backslash in other_gene_name is
        # kept literally instead of being read as a re.sub template escape.
        return match.group(1) + '|' + other_gene_name + '|'

    # (?m)^ restricts the rewrite to header lines (a '>' at a line start)
    return re.sub(r'(?m)^(>\S*) ', add_other_name, fasta_string)
def store_data_if_updownstream(fasta_header, path, data, genes):
for gene in gene_matches(fasta_header, genes):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment