split-from-imgt.py 9.14 KB
Newer Older
1 2
#!/usr/bin/env python
# -*- coding: utf-8 -*-
Mikaël Salson's avatar
Mikaël Salson committed
3 4 5


import sys
6
import os
7 8 9
import urllib
from collections import defaultdict
import re
10 11 12 13 14 15 16 17 18 19

IMGT_LICENSE = '''
   # To use the IMGT germline databases (IMGT/GENE-DB), you have to agree to IMGT license: 
   # academic research only, provided that it is referred to IMGT®,
   # and cited as "IMGT®, the international ImMunoGeneTics information system® 
   # http://www.imgt.org (founder and director: Marie-Paule Lefranc, Montpellier, France). 
   # Lefranc, M.-P., IMGT®, the international ImMunoGeneTics database,
   # Nucl. Acids Res., 29, 207-209 (2001). PMID: 11125093
'''

20
print (IMGT_LICENSE)
21

22
NCBI_API = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&retmode=text'+'&id=%s&from=%s&to=%s'
23

Mathieu Giraud's avatar
Mathieu Giraud committed
24
# Parse lines in IMGT/GENE-DB such as:
Mikaël Salson's avatar
Mikaël Salson committed
25 26 27 28 29
# >M12949|TRGV1*01|Homo sapiens|ORF|...

open_files = {}
current_file = None

30
def verbose_open_w(name):
31
    print (" ==> %s" % name)
32 33
    return open(name, 'w')

34 35 36 37 38 39 40 41 42 43
def get_split_files(seq, split_seq):
    for s_seq in split_seq.keys():
        if seq.find(s_seq) > -1:
            return split_seq[s_seq]
    return []

def check_directory_exists(path):
    if not(os.path.isdir(path)):
        os.mkdir(path)

44 45 46
def gene_matches(string, list_regex):
    '''
    >>> gene_matches('>M994641|IGHD1-18*01|Toto', ['TRGV', 'IGHD'])
47
    ['IGHD']
48
    >>> gene_matches('>M994641|IGHD1-18*01|Toto', ['TRGV', 'TRGD'])
49
    []
50
    >>> gene_matches('>M994641|IGHJ4*01|Toto', ['[A-Z]{3}J'])
51 52 53
    ['IGHJ']
    >>> gene_matches('>M22153|TRDD2*01|Homo sapiens|F|', ['TRDD', 'IGH', 'TRDD2'])
    ['TRDD', 'TRDD2']
54
    '''
55
    results = []
56
    for regex in list_regex:
57 58
        match = re.search(regex, string)
        if match <> None:
59 60
            results.append(match.group(0))
    return results
61 62 63 64

def get_gene_coord(imgt_line):
    '''
    >>> line = '>X15272|TRGV4*01|Homo sapiens|F|V-REGION|406..705|300 nt|1| | | | |300+0=300| |rev-compl|'
65 66
    >>> get_gene_coord(line)[0] == 'X15272'
    True
67
    >>> get_gene_coord(line)[1] == {'from': 406, 'to': 705, 'imgt_data': 'TRGV4*01|Homo sapiens|F|V-REGION', 'imgt_name': 'TRGV4*01'}
68 69 70 71
    True
    '''
    elements = imgt_line.split('|')
    assert len(elements) >= 6
72 73
    if elements[5].find('..') == -1:
        return None, None
74
    start, end = elements[5].split('..')
75 76
    if start.find(',') > -1:
        start = start[2:]
77 78
    if end.find(',') > -1:
        end = end.split(',')[0]
79 80
    return elements[0][1:], {'from': int(start),
                             'to': int(end),
81 82
                             'imgt_name': elements[1],
                             'imgt_data': '|'.join(elements[1:5])}
83 84 85 86 87 88

def get_gene_sequence(gene, other_gene_name, start, end):
    '''
    Return the gene sequences between positions start and end (included).
    '''
    fasta_string = urllib.urlopen(NCBI_API % (gene, start, end)).read()
89
    return re.sub('(>\S*) ', r'\1|'+other_gene_name+'|', fasta_string)
90

91
def store_data_if_updownstream(fasta_header, path, data, genes):
92
    for gene in gene_matches(fasta_header, genes):
93
        gene_name, gene_coord = get_gene_coord(fasta_header)
94 95
        if gene_name:
            data[path+'/'+gene][gene_name].append(gene_coord)
96
    
97
def retrieve_genes(filename, genes, tag, additional_length):
98
    f = verbose_open_w(filename)
99
    for gene in genes:
100 101 102 103 104 105 106
        for coord in genes[gene]:
            start = coord['from']
            end = coord['to']
            if additional_length > 0:
                end += additional_length
            elif additional_length < 0:
                start = max(1, start + additional_length)
107
            gene_data = get_gene_sequence(gene, coord['imgt_data'] + tag, start, end)
108 109 110 111 112
            if coord['imgt_data'].split('|')[-1] == FEATURE_J_REGION:
                gene_lines = gene_data.split('\n')
                gene_lines[1] = gap_j(gene_lines[1].lower())
                gene_data = '\n'.join(gene_lines)

113
            f.write(gene_data)
114 115


116 117 118 119 120 121 122 123 124
#                  Phe
#                  TrpGly   Gly
j118 = re.compile('t..gg....gg.')


MAX_GAP_J = 36          # maximal position of Phe/Trp (36 for TRAJ52*01)
PHE_TRP_WARN_SIZE = 15  # small sequences are on a second line
PHE_TRP_WARN_MSG = 'No Phe/Trp-Gly-X-Gly pattern'

125 126 127 128 129 130 131 132 133
CUSTOM_118 = { '': 0    # custom position of 118 in sequences without the Trp-Gly-X-Gly pattern
    #                               118
    #                               |..
    ,                       'gcacatgtttggcagcaagacccagcccactgtctta':         8 # IGLJ-C/OR18*01
    ,    'ggttttcagatggccagaagctgctctttgcaaggggaaccatgttaaaggtggatctta':    27 # TRAJ16*01
    , 'agatgcgtgacagctatgagaagctgatatttggaaaggagacatgactaactgtgaagc':       30 # TRAJ51*01
    ,    'ggtaccgggttaataggaaactgacatttggagccaacactagaggaatcatgaaactca':    27 # TRAJ61*01
    ,       'ataccactggttggttcaagatatttgctgaagggactaagctcatagtaacttcacctg': 24 # TRGJP1*01
    ,       'atagtagtgattggatcaagacgtttgcaaaagggactaggctcatagtaacttcgcctg': 24 # TRGJP2*01
134 135
  #                                 t..gg....gg.                               # Regexp
  #  .....ggaaggaaggaaacaggaaatttacatttggaatggggacgcaagtgagagtga               # TRAJ59*01 (ok)
136 137
  #  ,         'ctgagaggcgctgctgggcgtctgggcggaggactcctggttctgg':               # TRBJ2-2P*01 ?
  #  ,        'ctcctacgagcagtacgtcgggccgggcaccaggctcacggtcacag':               # TRBJ2-7*02 ?
138 139
}

140 141 142
def gap_j(seq):
    '''Gap J sequences in order to align the Phe118/Trp118 codon'''

143 144
    seqs = seq.strip()

145
    pos = None
146

147 148 149 150 151 152 153 154
    for custom_seq in CUSTOM_118:
        if not custom_seq:
            continue
        if seqs.startswith(custom_seq):
            print "# Custom 118 position in %s" % seqs
            pos = CUSTOM_118[custom_seq]

    if pos is None:
155 156 157 158 159 160 161 162 163
        m = j118.search(seq)

        if not m:
            if len(seq) > PHE_TRP_WARN_SIZE:
                print "# %s in %s" % (PHE_TRP_WARN_MSG, seq)
                seq = "# %s\n%s" % (PHE_TRP_WARN_MSG, seq)
            return seq

        pos = m.start() + 1 # positions start at 1
164 165 166 167

    return (MAX_GAP_J - pos) * '.' + seq


168 169
LENGTH_UPSTREAM=40
LENGTH_DOWNSTREAM=40
170 171 172 173
# Create isolated files for some sequences
SPECIAL_SEQUENCES = [
]

174 175 176
FEATURE_J_REGION = 'J-REGION'

FEATURES_VDJ = [ "V-REGION", "D-REGION", FEATURE_J_REGION ]
177 178 179 180
FEATURES_CLASSES = [
    "CH1", "CH2", "CH3", "CH3-CHS", "CH4-CHS",
    "H", "H-CH2", "H1", "H2", "H3", "H4",
    "M", "M1", "M2",
181
]
182 183 184
FEATURES = FEATURES_VDJ + FEATURES_CLASSES

# Heavy-chain human IGH exons, ordered
185
CLASSES = [ "IGHA", "IGHM", "IGHD", "IGH2B", "IGHG3", "IGHG1", "IGHA1", "IGHG2", "IGHG4", "IGHE", "IGHA2",
186
            "IGHGP" ]
187

188 189 190
# Split sequences in several files
SPLIT_SEQUENCES = {'/DV': ['TRAV', 'TRDV']}

191
DOWNSTREAM_REGIONS=['[A-Z]{3}J', 'TRDD3']
Mikaël Salson's avatar
Mikaël Salson committed
192
UPSTREAM_REGIONS=['IGHD', 'TRDD', 'TRBD', 'TRDD2']
193
# Be careful, 'IGHD' regex for UPSTREAM_REGIONS also matches IGHD*0? constant regions.
194

195 196 197
TAG_DOWNSTREAM='+down'
TAG_UPSTREAM='+up'

198
SPECIES = {
199
    "Homo sapiens": 'homo-sapiens/',
200
    "Mus musculus": 'mus-musculus/',
201 202
    "Mus musculus_BALB/c": 'mus-musculus/',
    "Mus musculus_C57BL/6": 'mus-musculus/',
203 204
    "Rattus norvegicus": 'rattus-norvegicus/',
    "Rattus norvegicus_BN/SsNHsdMCW": 'rattus-norvegicus/',
205
    "Rattus norvegicus_BN; Sprague-Dawley": 'rattus-norvegicus/'
206 207
}

208 209
downstream_data = defaultdict(lambda: defaultdict(list))
upstream_data = defaultdict(lambda: defaultdict(list))
210

Mikaël Salson's avatar
Mikaël Salson committed
211 212 213
for l in sys.stdin:

    if ">" in l:
214
        current_files = []
215 216
        current_special = None

217
        species = l.split('|')[2].strip()
218
        feature = l.split('|')[4].strip()
219

220
        if species in SPECIES and feature in FEATURES:
221
            seq = l.split('|')[1]
222
            path = SPECIES[species]
223 224 225 226 227 228 229

            if feature in FEATURES_VDJ:
                system = seq[:4]
            else:
                system = seq[:seq.find("*")]
                if not system in CLASSES:
                    print "! Unknown class: ", system
230
                system = system.replace("IGH", "IGHC=")
231

232 233 234
            keys = [path + system]

            check_directory_exists(path)
235

Mikaël Salson's avatar
Mikaël Salson committed
236 237
            if system.startswith('IG') or system.startswith('TR'):

238 239 240
                if feature in FEATURES_VDJ:
                    store_data_if_updownstream(l, path, downstream_data, DOWNSTREAM_REGIONS)
                    store_data_if_updownstream(l, path, upstream_data, UPSTREAM_REGIONS)
241

242 243 244 245 246 247 248 249
                systems = get_split_files(seq, SPLIT_SEQUENCES)
                if systems:
                    keys = [path + s for s in systems]
                for key in keys:
                    if not (key in open_files):
                        name = '%s.fa' % (key)
                        open_files[key] = verbose_open_w(name)
                    current_files.append(open_files[key])
Mikaël Salson's avatar
Mikaël Salson committed
250

251 252
            if seq in SPECIAL_SEQUENCES:
                name = '%s.fa' % seq.replace('*', '-')
253
                current_special = verbose_open_w(name)
254

Mikaël Salson's avatar
Mikaël Salson committed
255

256
    if '>' not in l and current_files and feature == FEATURE_J_REGION:
257 258
        l = gap_j(l)

259
    for current_file in current_files:
Mikaël Salson's avatar
Mikaël Salson committed
260 261
            current_file.write(l)

262 263
    if current_special:
            current_special.write(l)
Mikaël Salson's avatar
Mikaël Salson committed
264

265
for system in upstream_data:
266
    retrieve_genes(system + TAG_UPSTREAM + '.fa', upstream_data[system], TAG_UPSTREAM, -LENGTH_UPSTREAM)
267
for system in downstream_data:
268
    retrieve_genes(system + TAG_DOWNSTREAM + '.fa', downstream_data[system], TAG_DOWNSTREAM, LENGTH_DOWNSTREAM)