Commit b8ddb15d authored by Mikaël Salson

Germline: Allow splitting some sequences into several files

IMGT considers some sequences as belonging to several germlines. This is the
case, for example, for TRAV../DV[45678] sequences, which are shared by the TRAV
and TRDV germlines. Therefore they should be written to both germline files.
parent ef1f9a40
......@@ -15,7 +15,7 @@ $ Check md5 in germline/, sequences split from IMGT
1:96ee3f1167c74c94a2a2304505235efc TRBV.fa
1:1763bd22672a4954d500d434fd36b933 TRDD.fa
1:ac5b38a330c29aa73fd735f91169480c TRDJ.fa
1:d00151951fc48700ffdbe4a600b9ee67 TRDV.fa
1:32e76213a6a263ef616fe60344e4e9b1 TRDV.fa
1:767a3cbbd8d9299c10cbcf3863c6f4ef TRGJ.fa
1:d143aabd90f98e239f1ca5f1d2b82a5d TRGV.fa
......
......@@ -3,7 +3,7 @@
import sys
import os
IMGT_LICENSE = '''
# To use the IMGT germline databases (IMGT/GENE-DB), you have to agree to IMGT license:
......@@ -27,10 +27,23 @@ def verbose_open_w(name):
print " ==> %s" % name
return open(name, 'w')
def get_split_files(seq, split_seq):
    """Return the file prefixes a sequence must be split into.

    seq       -- sequence name, e.g. 'TRAV14/DV4*01'
    split_seq -- dict mapping a marker substring to the list of file
                 prefixes that sequences containing this marker belong to

    Returns a new list holding the prefixes of the first marker found in
    `seq`, or an empty list when no marker matches.
    """
    for marker, systems in split_seq.items():
        if marker in seq:
            # Return a copy so that callers cannot alter the mapping's lists
            return list(systems)
    return []
def check_directory_exists(path):
    """Make sure the directory `path` exists, creating it when needed.

    Missing parent directories are created as well. Nothing is done when
    the directory is already there.

    Raises OSError when the directory cannot be created.
    """
    if os.path.isdir(path):
        return
    try:
        os.makedirs(path)
    except OSError:
        # The directory may have been created in the meantime: only
        # propagate the error when it is still missing.
        if not os.path.isdir(path):
            raise
# Create isolated files for some sequences:
# a sequence whose name is listed here also gets its own '<name>.fa' file
# (with '*' replaced by '-' in the file name).
SPECIAL_SEQUENCES = [
]
# Split sequences in several files:
# each key is a substring searched in the sequence name, each value is the
# list of file prefixes (appended to the species path) the matching
# sequences are written to, instead of the single file named after the
# first four characters of the sequence name.
SPLIT_SEQUENCES = {'/DV': ['TRAV', 'TRDV']}
SPECIES = {
"Homo sapiens": './',
"Mus musculus": 'mus-musculus/',
......@@ -39,7 +52,7 @@ SPECIES = {
for l in sys.stdin:
if ">" in l:
current_file = None
current_files = []
current_special = None
species = l.split('|')[2].strip()
......@@ -48,23 +61,27 @@ for l in sys.stdin:
seq = l.split('|')[1]
path = SPECIES[species]
system = seq[:4]
key = path + system
keys = [path + system]
check_directory_exists(path)
if system.startswith('IG') or system.startswith('TR'):
if key in open_files:
current_file = open_files[key]
else:
name = '%s%s.fa' % (path, system)
current_file = verbose_open_w(name)
open_files[key] = current_file
systems = get_split_files(seq, SPLIT_SEQUENCES)
if systems:
keys = [path + s for s in systems]
for key in keys:
if not (key in open_files):
name = '%s.fa' % (key)
open_files[key] = verbose_open_w(name)
current_files.append(open_files[key])
if seq in SPECIAL_SEQUENCES:
name = '%s.fa' % seq.replace('*', '-')
current_special = verbose_open_w(name)
if current_file:
for current_file in current_files:
current_file.write(l)
if current_special:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment