split-from-imgt.py 8.49 KB
Newer Older
1
2
#!/usr/bin/env python
# -*- coding: utf-8 -*-
Mikaël Salson's avatar
Mikaël Salson committed
3
4
5


import sys
6
import os
7
8
9
import urllib
from collections import defaultdict
import re
10
11
12
13
14
15
16
17
18
19

IMGT_LICENSE = '''
   # To use the IMGT germline databases (IMGT/GENE-DB), you have to agree to IMGT license: 
   # academic research only, provided that it is referred to IMGT®,
   # and cited as "IMGT®, the international ImMunoGeneTics information system® 
   # http://www.imgt.org (founder and director: Marie-Paule Lefranc, Montpellier, France). 
   # Lefranc, M.-P., IMGT®, the international ImMunoGeneTics database,
   # Nucl. Acids Res., 29, 207-209 (2001). PMID: 11125093
'''

20
print (IMGT_LICENSE)
21

22
NCBI_API = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&retmode=text'+'&id=%s&from=%s&to=%s'
23

Mathieu Giraud's avatar
Mathieu Giraud committed
24
# Parse lines in IMGT/GENE-DB such as:
Mikaël Salson's avatar
Mikaël Salson committed
25
26
27
28
29
# >M12949|TRGV1*01|Homo sapiens|ORF|...

open_files = {}
current_file = None

30
def verbose_open_w(name):
31
    print (" ==> %s" % name)
32
33
    return open(name, 'w')

34
35
36
37
38
39
40
41
42
43
def get_split_files(seq, split_seq):
    for s_seq in split_seq.keys():
        if seq.find(s_seq) > -1:
            return split_seq[s_seq]
    return []

def check_directory_exists(path):
    if not(os.path.isdir(path)):
        os.mkdir(path)

44
45
46
def gene_matches(string, list_regex):
    '''
    >>> gene_matches('>M994641|IGHD1-18*01|Toto', ['TRGV', 'IGHD'])
47
    ['IGHD']
48
    >>> gene_matches('>M994641|IGHD1-18*01|Toto', ['TRGV', 'TRGD'])
49
    []
50
    >>> gene_matches('>M994641|IGHJ4*01|Toto', ['[A-Z]{3}J'])
51
52
53
    ['IGHJ']
    >>> gene_matches('>M22153|TRDD2*01|Homo sapiens|F|', ['TRDD', 'IGH', 'TRDD2'])
    ['TRDD', 'TRDD2']
54
    '''
55
    results = []
56
    for regex in list_regex:
57
58
        match = re.search(regex, string)
        if match <> None:
59
60
            results.append(match.group(0))
    return results
61
62
63
64

def get_gene_coord(imgt_line):
    '''
    >>> line = '>X15272|TRGV4*01|Homo sapiens|F|V-REGION|406..705|300 nt|1| | | | |300+0=300| |rev-compl|'
65
66
67
    >>> get_gene_coord(line)[0] == 'X15272'
    True
    >>> get_gene_coord(line)[1] == {'from': 406, 'to': 705, 'imgt_name': 'TRGV4*01'}
68
69
70
71
    True
    '''
    elements = imgt_line.split('|')
    assert len(elements) >= 6
72
73
    if elements[5].find('..') == -1:
        return None, None
74
    start, end = elements[5].split('..')
75
76
    if start.find(',') > -1:
        start = start[2:]
77
78
    if end.find(',') > -1:
        end = end.split(',')[0]
79
80
81
    return elements[0][1:], {'from': int(start),
                             'to': int(end),
                             'imgt_name': elements[1]}
82
83
84
85
86
87

def get_gene_sequence(gene, other_gene_name, start, end):
    '''
    Return the gene sequences between positions start and end (included).
    '''
    fasta_string = urllib.urlopen(NCBI_API % (gene, start, end)).read()
88
    return re.sub('(>\S*) ', r'\1|'+other_gene_name+'|', fasta_string)
89

90
def store_data_if_updownstream(fasta_header, path, data, genes):
91
    for gene in gene_matches(fasta_header, genes):
92
        gene_name, gene_coord = get_gene_coord(fasta_header)
93
94
        if gene_name:
            data[path+'/'+gene][gene_name].append(gene_coord)
95
    
96
def retrieve_genes(filename, genes, additional_length):
97
    f = verbose_open_w(filename)
98
    for gene in genes:
99
100
101
102
103
104
105
        for coord in genes[gene]:
            start = coord['from']
            end = coord['to']
            if additional_length > 0:
                end += additional_length
            elif additional_length < 0:
                start = max(1, start + additional_length)
106
            f.write(get_gene_sequence(gene, coord['imgt_name'], start, end))
107
108


109
110
111
112
113
114
115
116
117
#                  Phe
#                  TrpGly   Gly
j118 = re.compile('t..gg....gg.')


MAX_GAP_J = 36          # maximal position of Phe/Trp (36 for TRAJ52*01)
PHE_TRP_WARN_SIZE = 15  # small sequences are on a second line
PHE_TRP_WARN_MSG = 'No Phe/Trp-Gly-X-Gly pattern'

118
119
120
121
122
123
124
125
126
CUSTOM_118 = { '': 0    # custom position of 118 in sequences without the Trp-Gly-X-Gly pattern
    #                               118
    #                               |..
    ,                       'gcacatgtttggcagcaagacccagcccactgtctta':         8 # IGLJ-C/OR18*01
    ,    'ggttttcagatggccagaagctgctctttgcaaggggaaccatgttaaaggtggatctta':    27 # TRAJ16*01
    , 'agatgcgtgacagctatgagaagctgatatttggaaaggagacatgactaactgtgaagc':       30 # TRAJ51*01
    ,    'ggtaccgggttaataggaaactgacatttggagccaacactagaggaatcatgaaactca':    27 # TRAJ61*01
    ,       'ataccactggttggttcaagatatttgctgaagggactaagctcatagtaacttcacctg': 24 # TRGJP1*01
    ,       'atagtagtgattggatcaagacgtttgcaaaagggactaggctcatagtaacttcgcctg': 24 # TRGJP2*01
127
128
  #                                 t..gg....gg.                               # Regexp
  #  .....ggaaggaaggaaacaggaaatttacatttggaatggggacgcaagtgagagtga               # TRAJ59*01 (ok)
129
130
  #  ,         'ctgagaggcgctgctgggcgtctgggcggaggactcctggttctgg':               # TRBJ2-2P*01 ?
  #  ,        'ctcctacgagcagtacgtcgggccgggcaccaggctcacggtcacag':               # TRBJ2-7*02 ?
131
132
}

133
134
135
def gap_j(seq):
    '''Gap J sequences in order to align the Phe118/Trp118 codon'''

136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
    seqs = seq.strip()

    if seqs in CUSTOM_118:
        print "# Custom 118 position in %s" % seq
        pos = CUSTOM_118[seqs]

    else:
        m = j118.search(seq)

        if not m:
            if len(seq) > PHE_TRP_WARN_SIZE:
                print "# %s in %s" % (PHE_TRP_WARN_MSG, seq)
                seq = "# %s\n%s" % (PHE_TRP_WARN_MSG, seq)
            return seq

        pos = m.start() + 1 # positions start at 1
152
153
154
155

    return (MAX_GAP_J - pos) * '.' + seq


156
157
LENGTH_UPSTREAM=40
LENGTH_DOWNSTREAM=40
158
159
160
161
# Create isolated files for some sequences
SPECIAL_SEQUENCES = [
]

162
163
164
165
166
FEATURES_VDJ = [ "V-REGION", "D-REGION", "J-REGION" ]
FEATURES_CLASSES = [
    "CH1", "CH2", "CH3", "CH3-CHS", "CH4-CHS",
    "H", "H-CH2", "H1", "H2", "H3", "H4",
    "M", "M1", "M2",
Mathieu Giraud's avatar
Mathieu Giraud committed
167
]
168
169
170
FEATURES = FEATURES_VDJ + FEATURES_CLASSES

# Heavy-chain human IGH exons, ordered
171
CLASSES = [ "IGHA", "IGHM", "IGHD", "IGH2B", "IGHG3", "IGHG1", "IGHA1", "IGHG2", "IGHG4", "IGHE", "IGHA2",
172
            "IGHGP" ]
Mathieu Giraud's avatar
Mathieu Giraud committed
173

174
175
176
# Split sequences in several files
SPLIT_SEQUENCES = {'/DV': ['TRAV', 'TRDV']}

177
DOWNSTREAM_REGIONS=['[A-Z]{3}J', 'TRDD3']
Mikaël Salson's avatar
Mikaël Salson committed
178
UPSTREAM_REGIONS=['IGHD', 'TRDD', 'TRBD', 'TRDD2']
179
# Be careful, 'IGHD' regex for UPSTREAM_REGIONS also matches IGHD*0? constant regions.
180

181
SPECIES = {
182
    "Homo sapiens": 'homo-sapiens/',
183
    "Mus musculus": 'mus-musculus/',
184
185
    "Mus musculus_BALB/c": 'mus-musculus/',
    "Mus musculus_C57BL/6": 'mus-musculus/',
186
187
    "Rattus norvegicus": 'rattus-norvegicus/',
    "Rattus norvegicus_BN/SsNHsdMCW": 'rattus-norvegicus/',
188
    "Rattus norvegicus_BN; Sprague-Dawley": 'rattus-norvegicus/'
189
190
}

191
192
downstream_data = defaultdict(lambda: defaultdict(list))
upstream_data = defaultdict(lambda: defaultdict(list))
193

Mikaël Salson's avatar
Mikaël Salson committed
194
195
196
for l in sys.stdin:

    if ">" in l:
197
        current_files = []
198
199
        current_special = None

200
        species = l.split('|')[2].strip()
Mathieu Giraud's avatar
Mathieu Giraud committed
201
        feature = l.split('|')[4].strip()
202

Mathieu Giraud's avatar
Mathieu Giraud committed
203
        if species in SPECIES and feature in FEATURES:
204
            seq = l.split('|')[1]
205
            path = SPECIES[species]
206
207
208
209
210
211
212

            if feature in FEATURES_VDJ:
                system = seq[:4]
            else:
                system = seq[:seq.find("*")]
                if not system in CLASSES:
                    print "! Unknown class: ", system
213
                system = system.replace("IGH", "IGHC=")
214

215
216
217
            keys = [path + system]

            check_directory_exists(path)
218

Mikaël Salson's avatar
Mikaël Salson committed
219
220
            if system.startswith('IG') or system.startswith('TR'):

221
222
223
                if feature in FEATURES_VDJ:
                    store_data_if_updownstream(l, path, downstream_data, DOWNSTREAM_REGIONS)
                    store_data_if_updownstream(l, path, upstream_data, UPSTREAM_REGIONS)
224

225
226
227
228
229
230
231
232
                systems = get_split_files(seq, SPLIT_SEQUENCES)
                if systems:
                    keys = [path + s for s in systems]
                for key in keys:
                    if not (key in open_files):
                        name = '%s.fa' % (key)
                        open_files[key] = verbose_open_w(name)
                    current_files.append(open_files[key])
Mikaël Salson's avatar
Mikaël Salson committed
233

234
235
            if seq in SPECIAL_SEQUENCES:
                name = '%s.fa' % seq.replace('*', '-')
236
                current_special = verbose_open_w(name)
237

Mikaël Salson's avatar
Mikaël Salson committed
238

239
240
241
    if '>' not in l and current_files and feature == 'J-REGION':
        l = gap_j(l)

242
    for current_file in current_files:
Mikaël Salson's avatar
Mikaël Salson committed
243
244
            current_file.write(l)

245
246
    if current_special:
            current_special.write(l)
Mikaël Salson's avatar
Mikaël Salson committed
247

248
249
250
251
for system in upstream_data:
    retrieve_genes(system+"_upstream.fa", upstream_data[system], -LENGTH_UPSTREAM)
for system in downstream_data:
    retrieve_genes(system+"_downstream.fa", downstream_data[system], LENGTH_DOWNSTREAM)