ncbi.py 4.21 KB
Newer Older
1 2 3 4

import urllib
import sys
import re
5
import os
6
import fasta
7 8 9

API_EUTILS = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'

10
API_EUTILS += 'api_key='+os.environ['NCBI_KEY']+'&' if 'NCBI_KEY' in os.environ else ''
11 12 13 14

API_NUCCORE_ID =         API_EUTILS + 'db=nuccore&rettype=fasta&retmode=text' + '&id=%s'
API_NUCCORE_ID_FROM_TO = API_EUTILS + 'db=nuccore&rettype=fasta&retmode=text' + '&id=%s' + '&from=%s&to=%s'

15 16 17 18
API_GENE_ID_XML = API_EUTILS + 'db=gene&retmode=xml&rettype=docsum' + '&id=%s'


from xml.dom import minidom, Node
19 20 21



Mathieu Giraud's avatar
Mathieu Giraud committed
22 23
# The two following functions should be refactored as one (used in split-from-imgt and get-CD)

24
def get_gene_sequence(gene, other_gene_name, start, end, additional_length):
25 26 27
    '''
    Return the gene sequences between positions start and end (included).
    '''
28 29 30 31 32
    if additional_length > 0:
        end += additional_length
    elif additional_length < 0:
        start = max(1, start + additional_length)

33 34 35 36 37 38 39 40 41 42 43
    fasta_string = urllib.urlopen(API_NUCCORE_ID_FROM_TO % (gene, start, end)).read()
    return re.sub('(>\S*) ', r'\1|'+other_gene_name+'|', fasta_string)

def ncbi_and_write(ncbi, additional_header, outs):
    print ncbi, additional_header
    fasta = urllib.urlopen(API_NUCCORE_ID % ncbi).read()
    fasta_with_id = fasta.replace('>', '>' + additional_header)
    
    for out in outs:
        out.write(fasta_with_id)

44
def get_updownstream_sequences(gene, start, end, additional_length):
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
    '''
     Only returns upstream or downstream raw sequences

    :param gene: accession number where the sequence of interest is
    :param start, end: start and end positions in the sequence of
                       interest of our gene of interest. These are not
                       the positions we want to recover. We want to recover
                       positions that are either upstream or downstream.
                       Note that when the gene of interest is on the reverse
                       strand, we have start > end.
    :param additional_length: length of the upstream of downstream region to
                       recover. When additional_length > 0 we get the downstream
                       region, and conversely when additional_length < 0.
    :return: A tuple whose first element is the upstream region (or empty)
             and where the second element is the downstream region (or empty
    '''

62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
    if additional_length == 0:
        return ('', '')
    reversed = -1 if (end < start) else 1
    if additional_length > 0:
        start = end + 1 * reversed
        end = end + additional_length * reversed
    elif additional_length < 0:
        end = start - 1 * reversed
        start = max(1, start + additional_length * reversed)

    if start > end:
        tmp = start
        start = end
        end = tmp

    updown_fasta = urllib.urlopen(API_NUCCORE_ID_FROM_TO % (gene, start, end)).read()

    updown_raw = '\n'.join(updown_fasta.split('\n')[1:]).strip()
    if reversed == -1:
        updown_raw = fasta.revcomp(updown_raw.upper())

    if additional_length > 0:
        return ('', updown_raw)
    else:
        return (updown_raw, '')
87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103


# Parse output from API_GENE_ID_XML to get genomic positions of a gene

def xml_bang_one(parent):
    return {node.nodeName: node.firstChild.nodeValue for node in parent.childNodes if node.nodeType == Node.ELEMENT_NODE}

def get_last_LocationHistType(gene):
    '''
    >>> get_last_LocationHistType(6969)
    {u'AssemblyAccVer': u'GCF_000001405.38', u'ChrAccVer': u'NC_000007.14', u'AnnotationRelease': u'109', u'ChrStop': u'38253379', u'ChrStart': u'38253428'}
    '''

    sys.stderr.write('%% eutils -> gene %s' % gene + '\n')
    xml = minidom.parseString(urllib.urlopen(API_GENE_ID_XML % gene).read())

    locations = xml.getElementsByTagName('LocationHistType')
104 105 106 107 108

    if locations:
        return xml_bang_one(locations[0])
    else:
        raise KeyError, gene
109 110 111 112 113

def get_gene_positions(gene):
    '''
    >>> get_gene_positions(6969)
    (u'NC_000007.14', 38253428, 38253379)
114 115 116 117 118

    >>> get_gene_positions('zoycooxz')
    Traceback (most recent call last):
      ...
    KeyError: 'zoycooxz'
119 120 121 122 123
    '''

    loc = get_last_LocationHistType(gene)

    chr = loc['ChrAccVer']
124
    start, stop = int(loc['ChrStart'])+1, int(loc['ChrStop'])+1
125 126

    return chr, start, stop