split-from-imgt.py 14 KB
Newer Older
1 2
#!/usr/bin/env python
# -*- coding: utf-8 -*-
Mikaël Salson's avatar
Mikaël Salson committed
3 4 5


import sys
6
import os
7
import urllib
8
from collections import defaultdict, OrderedDict
9
import re
10

11
import ncbi
12

13 14
GENES_SEQ_FROM_NCBI = False

15 16 17 18 19 20 21 22 23
IMGT_LICENSE = '''
   # To use the IMGT germline databases (IMGT/GENE-DB), you have to agree to IMGT license: 
   # academic research only, provided that it is referred to IMGT®,
   # and cited as "IMGT®, the international ImMunoGeneTics information system® 
   # http://www.imgt.org (founder and director: Marie-Paule Lefranc, Montpellier, France). 
   # Lefranc, M.-P., IMGT®, the international ImMunoGeneTics database,
   # Nucl. Acids Res., 29, 207-209 (2001). PMID: 11125093
'''

24 25 26
def remove_allele(name):
    return name.split('*')[0]

Mathieu Giraud's avatar
Mathieu Giraud committed
27
# Parse lines in IMGT/GENE-DB such as:
Mikaël Salson's avatar
Mikaël Salson committed
28 29
# >M12949|TRGV1*01|Homo sapiens|ORF|...

30

Mikaël Salson's avatar
Mikaël Salson committed
31 32
current_file = None

33
def verbose_open_w(name):
34
    print (" ==> %s" % name)
35 36
    return open(name, 'w')

37 38 39 40 41 42 43 44 45 46
class KeyDefaultDict(defaultdict):
    def __missing__(self, key):
        if self.default_factory is None:
            raise KeyError((key,))
        self[key] = value = self.default_factory(key)
        return value

open_files = KeyDefaultDict(lambda key: verbose_open_w('%s.fa' % key))


47 48 49 50 51 52 53 54 55 56
def get_split_files(seq, split_seq):
    for s_seq in split_seq.keys():
        if seq.find(s_seq) > -1:
            return split_seq[s_seq]
    return []

def check_directory_exists(path):
    if not(os.path.isdir(path)):
        os.mkdir(path)

57 58 59
def gene_matches(string, list_regex):
    '''
    >>> gene_matches('>M994641|IGHD1-18*01|Toto', ['TRGV', 'IGHD'])
60
    ['IGHD']
61
    >>> gene_matches('>M994641|IGHD1-18*01|Toto', ['TRGV', 'TRGD'])
62
    []
63
    >>> gene_matches('>M994641|IGHJ4*01|Toto', ['[A-Z]{3}J'])
64 65 66
    ['IGHJ']
    >>> gene_matches('>M22153|TRDD2*01|Homo sapiens|F|', ['TRDD', 'IGH', 'TRDD2'])
    ['TRDD', 'TRDD2']
67
    '''
68
    results = []
69
    for regex in list_regex:
70 71
        match = re.search(regex, string)
        if match <> None:
72 73
            results.append(match.group(0))
    return results
74 75 76 77

def get_gene_coord(imgt_line):
    '''
    >>> line = '>X15272|TRGV4*01|Homo sapiens|F|V-REGION|406..705|300 nt|1| | | | |300+0=300| |rev-compl|'
78 79
    >>> get_gene_coord(line)[0] == 'X15272'
    True
80
    >>> get_gene_coord(line)[1] == {'from': 406, 'to': 705, 'imgt_data': 'TRGV4*01|Homo sapiens|F|V-REGION', 'imgt_name': 'TRGV4*01', 'species': 'Homo sapiens', 'seq': ''}
81 82 83 84
    True
    '''
    elements = imgt_line.split('|')
    assert len(elements) >= 6
85 86
    if elements[5].find('..') == -1:
        return None, None
87
    start, end = elements[5].split('..')
88 89
    if start.find(',') > -1:
        start = start[2:]
90 91
    if end.find(',') > -1:
        end = end.split(',')[0]
92 93
    return elements[0][1:], {'from': int(start),
                             'to': int(end),
94
                             'species': elements[2],
95
                             'imgt_name': elements[1],
96 97
                             'imgt_data': '|'.join(elements[1:5]),
                             'seq': ''}
98

99 100 101 102 103 104 105 106 107 108 109 110 111
def paste_updown_on_fasta(fasta, up, down):
    '''
    Put upstream and/or downstream data on an existing FASTA sequences
    >>> paste_updown_on_fasta('>seq\\nAAAAAAAAAAAAAAAAAAA\\nTTTTTTTTTTT', 'CCCC', 'GGGG')
    '>seq\\nCCCC\\nAAAAAAAAAAAAAAAAAAA\\nTTTTTTTTTTT\\nGGGG\\n'
    >>> paste_updown_on_fasta('>seq\\nAAAAAAAAAAAAAAAAAAA\\nTTTTTTTTTTT\\n', '', 'GGGG')
    '>seq\\nAAAAAAAAAAAAAAAAAAA\\nTTTTTTTTTTT\\nGGGG\\n'
    >>> paste_updown_on_fasta('>seq\\nAAAAAAAAAAAAAAAAAAA\\nTTTTTTTTTTT', 'CCCC', '')
    '>seq\\nCCCC\\nAAAAAAAAAAAAAAAAAAA\\nTTTTTTTTTTT\\n'
    '''
    lines = fasta.split('\n')
    return lines[0]+'\n' + (up+'\n' if up else '') + '\n'.join(filter(None, lines[1:])) + '\n'\
        + (down+'\n' if down else '')
112

113 114
def check_imgt_ncbi_consistency(imgt_info, imgt_data, ncbi_target, ncbi_start, ncbi_end):
    if abs(imgt_info['from'] - imgt_info['to']) != abs(ncbi_start - ncbi_end):
115
        print >>sys.stderr,"WARNING: Length for %s differ between IMGT (%d) and NCBI (%d)" % (imgt_info['imgt_name'], abs(imgt_info['from'] - imgt_info['to'])+1, abs(ncbi_start - ncbi_end)+1)
116 117 118 119
    else:
        # Check that sequences are identical
        ncbi_seq = ncbi.get_gene_sequence(ncbi_target, '', ncbi_start, ncbi_end, 0).split('\n')[1:]
        gene_lines = imgt_data.split('\n')[1:]
120 121 122 123 124
        if gene_lines[0].startswith('#'):
            gene_lines = gene_lines[1:]
        imgt_seq = ''.join(gene_lines).upper().replace('.', '')
        ncbi_seq = ''.join(ncbi_seq).upper()
        if imgt_seq != ncbi_seq:
125 126 127 128 129 130 131
            print >>sys.stderr, "WARNING: Sequences for %s differ between IMGT and NCBI\n%s" % (imgt_info['imgt_name'], imgt_seq)
            for i, letter in enumerate(ncbi_seq):
                if letter == imgt_seq[i]:
                    sys.stderr.write('.')
                else:
                    sys.stderr.write(letter)
            sys.stderr.write('\n')
132

133
def store_data_if_updownstream(fasta_header, path, data, genes):
134
    paths = []                  # A given sequence can be stored in several files
135
    for gene in gene_matches(fasta_header, genes):
136
        gene_name, gene_coord = get_gene_coord(fasta_header)
137

138
        if gene_name:
139
            data[path+'/'+gene].append((gene_name, gene_coord))
140 141
            paths.append(path+'/'+gene)
    return paths
142
    
143
def retrieve_genes(f, genes, tag, additional_length, gene_list):
144 145 146 147
    for info in genes:
        (gene, coord) = info
        # try to extract from genome
        gene_id = gene_list.get_gene_id_from_imgt_name(coord['species'], coord['imgt_name'])
148

149 150 151 152 153 154 155 156 157
        allele_additional_length = 0
        
        if gene_id:
            try:
                (target, start, end) = ncbi.get_gene_positions(gene_id)
            except KeyError:
                print('! No positions for %s (%s: %s)' % (gene_id, gene, str(coord)))
                allele_additional_length = additional_length
                gene_id = None
158

159 160 161 162
        # gene: is the name of the sequence where the VDJ gene was identified according to IMGT. The gene is just a part of the sequence
        # gene_id: is the NCBI ID of the VDJ gene
        # target: is the NCBI ID of the chromosome

163 164 165 166 167
        if GENES_SEQ_FROM_NCBI:
            gene_data = ncbi.get_gene_sequence(gene, coord['imgt_data'] + tag, coord['from'], coord['to'], allele_additional_length)
        else:
            # IMGT
            gene_data = coord['seq']
168

169
        if gene_id:
170 171 172 173
            # Check consistency for *01 allele
            if coord['imgt_name'].endswith('*01'):
                check_imgt_ncbi_consistency(coord, gene_data, target, start, end)

174
            up_down = ncbi.get_updownstream_sequences(target, start, end, additional_length)
175 176
            # We put the up and downstream data before and after the sequence we retrieved previously
            gene_data = paste_updown_on_fasta(gene_data, up_down[0], up_down[1])
177

178

179 180 181
        # post-process gene_data
        if coord['imgt_data'].split('|')[-1] == FEATURE_J_REGION:
            gene_lines = gene_data.split('\n')
182
            gene_lines[1] = gap_j(gene_lines[1].lower())
183
            gene_data = '\n'.join(gene_lines)
184

185
        f.write(gene_data)
186 187


188 189 190 191 192 193 194 195 196
#                  Phe
#                  TrpGly   Gly
j118 = re.compile('t..gg....gg.')


MAX_GAP_J = 36          # maximal position of Phe/Trp (36 for TRAJ52*01)
PHE_TRP_WARN_SIZE = 15  # small sequences are on a second line
PHE_TRP_WARN_MSG = 'No Phe/Trp-Gly-X-Gly pattern'

197 198 199 200 201 202 203 204 205
CUSTOM_118 = { '': 0    # custom position of 118 in sequences without the Trp-Gly-X-Gly pattern
    #                               118
    #                               |..
    ,                       'gcacatgtttggcagcaagacccagcccactgtctta':         8 # IGLJ-C/OR18*01
    ,    'ggttttcagatggccagaagctgctctttgcaaggggaaccatgttaaaggtggatctta':    27 # TRAJ16*01
    , 'agatgcgtgacagctatgagaagctgatatttggaaaggagacatgactaactgtgaagc':       30 # TRAJ51*01
    ,    'ggtaccgggttaataggaaactgacatttggagccaacactagaggaatcatgaaactca':    27 # TRAJ61*01
    ,       'ataccactggttggttcaagatatttgctgaagggactaagctcatagtaacttcacctg': 24 # TRGJP1*01
    ,       'atagtagtgattggatcaagacgtttgcaaaagggactaggctcatagtaacttcgcctg': 24 # TRGJP2*01
206 207
  #                                 t..gg....gg.                               # Regexp
  #  .....ggaaggaaggaaacaggaaatttacatttggaatggggacgcaagtgagagtga               # TRAJ59*01 (ok)
208 209
  #  ,         'ctgagaggcgctgctgggcgtctgggcggaggactcctggttctgg':               # TRBJ2-2P*01 ?
  #  ,        'ctcctacgagcagtacgtcgggccgggcaccaggctcacggtcacag':               # TRBJ2-7*02 ?
210 211
}

212 213 214
def gap_j(seq):
    '''Gap J sequences in order to align the Phe118/Trp118 codon'''

215 216
    seqs = seq.strip()

217
    pos = None
218

219 220 221 222 223 224 225 226
    for custom_seq in CUSTOM_118:
        if not custom_seq:
            continue
        if seqs.startswith(custom_seq):
            print "# Custom 118 position in %s" % seqs
            pos = CUSTOM_118[custom_seq]

    if pos is None:
227
        m = j118.search(seq, re.IGNORECASE)
228 229 230 231 232 233 234 235

        if not m:
            if len(seq) > PHE_TRP_WARN_SIZE:
                print "# %s in %s" % (PHE_TRP_WARN_MSG, seq)
                seq = "# %s\n%s" % (PHE_TRP_WARN_MSG, seq)
            return seq

        pos = m.start() + 1 # positions start at 1
236 237 238 239

    return (MAX_GAP_J - pos) * '.' + seq


240 241
LENGTH_UPSTREAM=40
LENGTH_DOWNSTREAM=40
242 243 244 245
# Create isolated files for some sequences
SPECIAL_SEQUENCES = [
]

246 247 248
FEATURE_J_REGION = 'J-REGION'

FEATURES_VDJ = [ "V-REGION", "D-REGION", FEATURE_J_REGION ]
249 250 251 252
FEATURES_CLASSES = [
    "CH1", "CH2", "CH3", "CH3-CHS", "CH4-CHS",
    "H", "H-CH2", "H1", "H2", "H3", "H4",
    "M", "M1", "M2",
253
]
254 255 256
FEATURES = FEATURES_VDJ + FEATURES_CLASSES

# Heavy-chain human IGH exons, ordered
257
CLASSES = [ "IGHA", "IGHM", "IGHD", "IGH2B", "IGHG3", "IGHG1", "IGHA1", "IGHG2", "IGHG4", "IGHE", "IGHA2",
258
            "IGHGP" ]
259

260 261 262
# Split sequences in several files
SPLIT_SEQUENCES = {'/DV': ['TRAV', 'TRDV']}

263
DOWNSTREAM_REGIONS=['[A-Z]{3}J', 'TRDD3']
Mikaël Salson's avatar
Mikaël Salson committed
264
UPSTREAM_REGIONS=['IGHD', 'TRDD', 'TRBD', 'TRDD2']
265
# Be careful, 'IGHD' regex for UPSTREAM_REGIONS also matches IGHD*0? constant regions.
266

267 268 269
TAG_DOWNSTREAM='+down'
TAG_UPSTREAM='+up'

270
SPECIES = {
271
    "Homo sapiens": 'homo-sapiens/',
272
    "Mus musculus": 'mus-musculus/',
273 274
    "Mus musculus_BALB/c": 'mus-musculus/',
    "Mus musculus_C57BL/6": 'mus-musculus/',
275 276
    "Rattus norvegicus": 'rattus-norvegicus/',
    "Rattus norvegicus_BN/SsNHsdMCW": 'rattus-norvegicus/',
277
    "Rattus norvegicus_BN; Sprague-Dawley": 'rattus-norvegicus/'
278 279
}

280 281 282 283 284 285

class OrderedDefaultListDict(OrderedDict):
    def __missing__(self, key):
        self[key] = value = []
        return value

286

287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310

class IMGTGENEDBGeneList():
    '''
    Parse lines such as
    'Homo sapiens;TRGJ2;F;Homo sapiens T cell receptor gamma joining 2;1;7;7p14;M12961;6969;'

    >>> gl = IMGTGENEDBGeneList('IMGTGENEDB-GeneList')
    >>> gl.get_gene_id_from_imgt_name('Homo sapiens', 'TRGJ2*01')
    '6969'
    '''

    def __init__(self, f):

        self.data = defaultdict(str)

        for l in open(f):
            ll = l.split(';')
            species, name, gene_id = ll[0], ll[1], ll[-2]
            self.data[species, name] = gene_id

    def get_gene_id_from_imgt_name(self, species, name):
        return self.data[species, remove_allele(name)]


311

312
def split_IMGTGENEDBReferenceSequences(f, gene_list):
313

314 315
    downstream_data = OrderedDefaultListDict()
    upstream_data = OrderedDefaultListDict()
316

317
    for l in open(ReferenceSequences):
Mikaël Salson's avatar
Mikaël Salson committed
318

319
        # New sequence: compute 'current_files' and stores up/downstream_data[]
320

321 322 323
        if ">" in l:
            current_files = []
            current_special = None
324
            key_upstream, key_downstream = ([], [])
325

326 327
            species = l.split('|')[2].strip()
            feature = l.split('|')[4].strip()
328

329 330 331
            if species in SPECIES and feature in FEATURES:
                seq = l.split('|')[1]
                path = SPECIES[species]
332

333 334 335 336 337 338 339
                if feature in FEATURES_VDJ:
                    system = seq[:4]
                else:
                    system = seq[:seq.find("*")]
                    if not system in CLASSES:
                        print "! Unknown class: ", system
                    system = system.replace("IGH", "IGHC=")
340

341
                keys = [path + system]
342

343
                check_directory_exists(path)
344

345
                if system.startswith('IG') or system.startswith('TR'):
Mikaël Salson's avatar
Mikaël Salson committed
346

347
                    if feature in FEATURES_VDJ:
348 349
                        key_downstream = store_data_if_updownstream(l, path, downstream_data, DOWNSTREAM_REGIONS)
                        key_upstream = store_data_if_updownstream(l, path, upstream_data, UPSTREAM_REGIONS)
350

351 352 353 354 355
                    systems = get_split_files(seq, SPLIT_SEQUENCES)
                    if systems:
                        keys = [path + s for s in systems]
                    for key in keys:
                        current_files.append(open_files[key])
Mikaël Salson's avatar
Mikaël Salson committed
356

357 358 359
                if seq in SPECIAL_SEQUENCES:
                    name = '%s.fa' % seq.replace('*', '-')
                    current_special = verbose_open_w(name)
360

Mikaël Salson's avatar
Mikaël Salson committed
361

362 363 364 365 366 367
        for key in key_downstream:
            downstream_data[key][-1][1]['seq'] += l
        for key in key_upstream:
            upstream_data[key][-1][1]['seq'] += l


368 369 370 371 372
        # Possibly gap J_REGION

        if '>' not in l and current_files and feature == FEATURE_J_REGION:
            l = gap_j(l)

373
        # Dump 'l' to the concerned files
374

375 376
        for current_file in current_files:
                current_file.write(l)
Mikaël Salson's avatar
Mikaël Salson committed
377

378 379
        if current_special:
                current_special.write(l)
Mikaël Salson's avatar
Mikaël Salson committed
380

381
        # End, loop to next 'l'
382 383


384
    # Dump up/downstream data
385

386 387 388
    for system in upstream_data:
        f = verbose_open_w(system + TAG_UPSTREAM + '.fa')
        retrieve_genes(f, upstream_data[system], TAG_UPSTREAM, -LENGTH_UPSTREAM, gene_list)
389

390 391 392
    for system in downstream_data:
        f = verbose_open_w(system + TAG_DOWNSTREAM + '.fa')
        retrieve_genes(f, downstream_data[system], TAG_DOWNSTREAM, LENGTH_DOWNSTREAM, gene_list)
393

394

395 396 397 398 399



if __name__ == '__main__':

400 401 402 403 404
    if sys.argv[1] == '--test':
        import doctest
        doctest.testmod()
    else:
        print (IMGT_LICENSE)
405

406 407
        ReferenceSequences = sys.argv[1]
        GeneList = sys.argv[2]
408

409 410
        gl = IMGTGENEDBGeneList(GeneList)
        split_IMGTGENEDBReferenceSequences(ReferenceSequences, gl)
411