split-from-imgt.py 6.03 KB
Newer Older
1 2
#!/usr/bin/env python
# -*- coding: utf-8 -*-
Mikaël Salson's avatar
Mikaël Salson committed
3 4 5


import sys
6
import os
7 8 9
import urllib
from collections import defaultdict
import re
10 11 12 13 14 15 16 17 18 19

IMGT_LICENSE = '''
   # To use the IMGT germline databases (IMGT/GENE-DB), you have to agree to IMGT license: 
   # academic research only, provided that it is referred to IMGT®,
   # and cited as "IMGT®, the international ImMunoGeneTics information system® 
   # http://www.imgt.org (founder and director: Marie-Paule Lefranc, Montpellier, France). 
   # Lefranc, M.-P., IMGT®, the international ImMunoGeneTics database,
   # Nucl. Acids Res., 29, 207-209 (2001). PMID: 11125093
'''

20
print (IMGT_LICENSE)
21

22
NCBI_API = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&retmode=text'+'&id=%s&from=%s&to=%s'
23

Mathieu Giraud's avatar
Mathieu Giraud committed
24
# Parse lines in IMGT/GENE-DB such as:
Mikaël Salson's avatar
Mikaël Salson committed
25 26 27 28 29
# >M12949|TRGV1*01|Homo sapiens|ORF|...

open_files = {}
current_file = None

30
def verbose_open_w(name):
31
    print (" ==> %s" % name)
32 33
    return open(name, 'w')

34 35 36 37 38 39 40 41 42 43
def get_split_files(seq, split_seq):
    for s_seq in split_seq.keys():
        if seq.find(s_seq) > -1:
            return split_seq[s_seq]
    return []

def check_directory_exists(path):
    if not(os.path.isdir(path)):
        os.mkdir(path)

44 45 46
def gene_matches(string, list_regex):
    '''
    >>> gene_matches('>M994641|IGHD1-18*01|Toto', ['TRGV', 'IGHD'])
47
    ['IGHD']
48
    >>> gene_matches('>M994641|IGHD1-18*01|Toto', ['TRGV', 'TRGD'])
49
    []
50
    >>> gene_matches('>M994641|IGHJ4*01|Toto', ['[A-Z]{3}J'])
51 52 53
    ['IGHJ']
    >>> gene_matches('>M22153|TRDD2*01|Homo sapiens|F|', ['TRDD', 'IGH', 'TRDD2'])
    ['TRDD', 'TRDD2']
54
    '''
55
    results = []
56
    for regex in list_regex:
57 58
        match = re.search(regex, string)
        if match <> None:
59 60
            results.append(match.group(0))
    return results
61 62 63 64

def get_gene_coord(imgt_line):
    '''
    >>> line = '>X15272|TRGV4*01|Homo sapiens|F|V-REGION|406..705|300 nt|1| | | | |300+0=300| |rev-compl|'
65 66 67
    >>> get_gene_coord(line)[0] == 'X15272'
    True
    >>> get_gene_coord(line)[1] == {'from': 406, 'to': 705, 'imgt_name': 'TRGV4*01'}
68 69 70 71
    True
    '''
    elements = imgt_line.split('|')
    assert len(elements) >= 6
72 73
    if elements[5].find('..') == -1:
        return None, None
74
    start, end = elements[5].split('..')
75 76 77
    return elements[0][1:], {'from': int(start),
                             'to': int(end),
                             'imgt_name': elements[1]}
78 79 80 81 82 83 84 85

def get_gene_sequence(gene, other_gene_name, start, end):
    '''
    Return the gene sequences between positions start and end (included).
    '''
    fasta_string = urllib.urlopen(NCBI_API % (gene, start, end)).read()
    return re.sub('(>g.\|)', r'\1'+other_gene_name+'|', fasta_string)

86
def store_data_if_updownstream(fasta_header, path, data, genes):
87
    for gene in gene_matches(fasta_header, genes):
88
        gene_name, gene_coord = get_gene_coord(fasta_header)
89 90
        if gene_name:
            data[path+'/'+gene][gene_name].append(gene_coord)
91
    
92 93 94
def retrieve_genes(filename, genes, additional_length):
    file = verbose_open_w(filename)
    for gene in genes:
95 96 97 98 99 100 101 102
        for coord in genes[gene]:
            start = coord['from']
            end = coord['to']
            if additional_length > 0:
                end += additional_length
            elif additional_length < 0:
                start = max(1, start + additional_length)
            file.write(get_gene_sequence(gene, coord['imgt_name'], start, end))
103 104 105 106


LENGTH_UPSTREAM=40
LENGTH_DOWNSTREAM=40
107 108 109 110
# Create isolated files for some sequences
SPECIAL_SEQUENCES = [
]

111 112 113 114 115
FEATURES_VDJ = [ "V-REGION", "D-REGION", "J-REGION" ]
FEATURES_CLASSES = [
    "CH1", "CH2", "CH3", "CH3-CHS", "CH4-CHS",
    "H", "H-CH2", "H1", "H2", "H3", "H4",
    "M", "M1", "M2",
116
]
117 118 119 120 121
FEATURES = FEATURES_VDJ + FEATURES_CLASSES

# Heavy-chain human IGH exons, ordered
CLASSES = [ "IGHM", "IGHD", "IGHG3", "IGHG1", "IGHA1", "IGHG2", "IGHG4", "IGHE", "IGHA2",
            "IGHGP" ]
122

123 124 125
# Split sequences in several files
SPLIT_SEQUENCES = {'/DV': ['TRAV', 'TRDV']}

126
DOWNSTREAM_REGIONS=['[A-Z]{3}J', 'TRDD3']
Mikaël Salson's avatar
Mikaël Salson committed
127
UPSTREAM_REGIONS=['IGHD', 'TRDD', 'TRBD', 'TRDD2']
128

129 130 131
SPECIES = {
    "Homo sapiens": './', 
    "Mus musculus": 'mus-musculus/',
132 133
    "Rattus norvegicus": 'rattus-norvegicus/',
    "Rattus norvegicus_BN/SsNHsdMCW": 'rattus-norvegicus/',
134 135
}

136 137
downstream_data = defaultdict(lambda: defaultdict(list))
upstream_data = defaultdict(lambda: defaultdict(list))
138

Mikaël Salson's avatar
Mikaël Salson committed
139 140 141
for l in sys.stdin:

    if ">" in l:
142
        current_files = []
143 144
        current_special = None

145
        species = l.split('|')[2].strip()
146
        feature = l.split('|')[4].strip()
147

148
        if species in SPECIES and feature in FEATURES:
149
            seq = l.split('|')[1]
150
            path = SPECIES[species]
151 152 153 154 155 156 157 158

            if feature in FEATURES_VDJ:
                system = seq[:4]
            else:
                system = seq[:seq.find("*")]
                if not system in CLASSES:
                    print "! Unknown class: ", system

159 160 161
            keys = [path + system]

            check_directory_exists(path)
162

Mikaël Salson's avatar
Mikaël Salson committed
163 164
            if system.startswith('IG') or system.startswith('TR'):

165 166
                store_data_if_updownstream(l, path, downstream_data, DOWNSTREAM_REGIONS)
                store_data_if_updownstream(l, path, upstream_data, UPSTREAM_REGIONS)
167

168 169 170 171 172 173 174 175
                systems = get_split_files(seq, SPLIT_SEQUENCES)
                if systems:
                    keys = [path + s for s in systems]
                for key in keys:
                    if not (key in open_files):
                        name = '%s.fa' % (key)
                        open_files[key] = verbose_open_w(name)
                    current_files.append(open_files[key])
Mikaël Salson's avatar
Mikaël Salson committed
176

177 178
            if seq in SPECIAL_SEQUENCES:
                name = '%s.fa' % seq.replace('*', '-')
179
                current_special = verbose_open_w(name)
180

Mikaël Salson's avatar
Mikaël Salson committed
181

182
    for current_file in current_files:
Mikaël Salson's avatar
Mikaël Salson committed
183 184
            current_file.write(l)

185 186
    if current_special:
            current_special.write(l)
Mikaël Salson's avatar
Mikaël Salson committed
187

188 189 190 191
for system in upstream_data:
    retrieve_genes(system+"_upstream.fa", upstream_data[system], -LENGTH_UPSTREAM)
for system in downstream_data:
    retrieve_genes(system+"_downstream.fa", downstream_data[system], LENGTH_DOWNSTREAM)