#!/usr/bin/env python
# -*- coding: utf-8 -*-


import sys
6
import os
7 8 9 10 11 12 13 14 15 16

# License notice of the IMGT germline databases. It is printed on stdout
# every time the script runs, before any input is processed.
IMGT_LICENSE = '''
   # To use the IMGT germline databases (IMGT/GENE-DB), you have to agree to IMGT license: 
   # academic research only, provided that it is referred to IMGT®,
   # and cited as "IMGT®, the international ImMunoGeneTics information system® 
   # http://www.imgt.org (founder and director: Marie-Paule Lefranc, Montpellier, France). 
   # Lefranc, M.-P., IMGT®, the international ImMunoGeneTics database,
   # Nucl. Acids Res., 29, 207-209 (2001). PMID: 11125093
'''

print(IMGT_LICENSE)
18 19


# Parse lines in IMGT/GENE-DB such as:
# >M12949|TRGV1*01|Homo sapiens|ORF|...

# Output files opened so far, keyed by '<directory><system>' (the output
# path without its '.fa' suffix), so that each file is opened only once.
open_files = {}
# Module-level placeholder; the name is rebound to each destination file
# while a line is being written out.
current_file = None

26
def verbose_open_w(name):
    """Announce *name* on stdout, then open it for writing.

    name: path of the file to create; an existing file is truncated.

    Returns the file object opened in text mode 'w'. The caller is
    responsible for closing it.
    """
    print(" ==> %s" % name)
    return open(name, 'w')

30 31 32 33 34 35 36 37 38 39
def get_split_files(seq, split_seq):
    """Return the target systems of the first split rule matching *seq*.

    seq: sequence name to inspect.
    split_seq: mapping from a marker substring to the list of systems a
        name containing that marker must be written to.

    The rules are tried in the mapping's iteration order. When no marker
    occurs in *seq*, an empty list is returned.
    """
    matching_targets = (
        targets for marker, targets in split_seq.items() if marker in seq
    )
    return next(matching_targets, [])

def check_directory_exists(path):
    """Make sure the directory *path* exists, creating it when missing.

    Missing parent directories are created as well, so nested output
    locations work. An empty path denotes the current directory, which
    needs no creation.
    """
    if path and not os.path.isdir(path):
        os.makedirs(path)

40 41 42 43
# Create isolated files for some sequences: when the name field (second
# '|'-separated field) of a selected header is listed here, the record is
# also written to its own '<name>.fa' file, with '*' replaced by '-' in
# the file name. The list is currently empty, so no sequence is isolated.
SPECIAL_SEQUENCES = [
]

44 45 46
# Split sequences in several files: a sequence whose name contains the
# key substring is written to the file of every system listed for it,
# instead of the single file derived from the first 4 letters of its name.
SPLIT_SEQUENCES = {'/DV': ['TRAV', 'TRDV']}

47 48 49
# Output directory (relative, with a trailing slash) for each species name
# found in the third '|'-separated field of a header. Headers of species
# not listed here are ignored. Both rat entries share one directory, so
# their sequences end up in the same files.
SPECIES = {
    "Homo sapiens": './',
    "Mus musculus": 'mus-musculus/',
    "Rattus norvegicus": 'rattus-norvegicus/',
    "Rattus norvegicus_BN/SsNHsdMCW": 'rattus-norvegicus/',
}

Mikaël Salson's avatar
Mikaël Salson committed
54 55 56
# Route every line of the FASTA stream read on stdin to its output file(s).
#
# A header line (one containing '>') selects the destinations for itself
# and for the sequence lines that follow it, until the next header:
#   - one '<directory><system>.fa' file per system, for a known species
#     whose header mentions a V-, D- or J-REGION and whose sequence name
#     starts with 'IG' or 'TR';
#   - one dedicated file when the sequence name is in SPECIAL_SEQUENCES.
# Records matching none of these are dropped.

# Destinations of the record being read. They are defined before the loop
# so that lines preceding the first header are skipped instead of raising
# a NameError.
current_files = []
current_special = None

try:
    for line in sys.stdin:

        if ">" in line:
            # The previous record's dedicated file is complete: release it.
            if current_special:
                current_special.close()
            current_files = []
            current_special = None

            fields = line.split('|')
            # A header with fewer than three fields carries no species:
            # handle it like an unknown species rather than crashing.
            species = fields[2].strip() if len(fields) > 2 else None

            if species in SPECIES and ("V-REGION" in line or "D-REGION" in line or "J-REGION" in line):
                seq = fields[1]
                path = SPECIES[species]
                # The system (e.g. 'TRGV') is the first 4 letters of the name.
                system = seq[:4]
                keys = [path + system]

                check_directory_exists(path)

                if system.startswith(('IG', 'TR')):
                    # Names matching a split rule go to several systems.
                    systems = get_split_files(seq, SPLIT_SEQUENCES)
                    if systems:
                        keys = [path + s for s in systems]
                    for key in keys:
                        # Open each output file once and reuse it afterwards.
                        if key not in open_files:
                            name = '%s.fa' % (key)
                            open_files[key] = verbose_open_w(name)
                        current_files.append(open_files[key])

                if seq in SPECIAL_SEQUENCES:
                    name = '%s.fa' % seq.replace('*', '-')
                    current_special = verbose_open_w(name)

        for current_file in current_files:
            current_file.write(line)

        if current_special:
            current_special.write(line)
finally:
    # Flush and release every output file, even if processing failed midway.
    if current_special:
        current_special.close()
    for opened_file in open_files.values():
        opened_file.close()
Mikaël Salson's avatar
Mikaël Salson committed
91