From c2b0dfc6e154dce18bbcacd8ad5de2af1b3a3a86 Mon Sep 17 00:00:00 2001 From: Mathieu Giraud Date: Sat, 27 Feb 2016 16:18:41 +0100 Subject: [PATCH] germline/split-from-imgt.py: gap J sequences according to Phe/Trp-Gly-X-Gly pattern There are currently no gapped J sequences distributed by IMGT. We create these sequences aligned on the Phe118/Trp118. --- germline/split-from-imgt.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/germline/split-from-imgt.py b/germline/split-from-imgt.py index 4d4c64d88..3efb96637 100644 --- a/germline/split-from-imgt.py +++ b/germline/split-from-imgt.py @@ -106,6 +106,29 @@ def retrieve_genes(filename, genes, additional_length): file.write(get_gene_sequence(gene, coord['imgt_name'], start, end)) +# Phe +# TrpGly Gly +j118 = re.compile('t..gg....gg.') + + +MAX_GAP_J = 36 # maximal position of Phe/Trp (36 for TRAJ52*01) +PHE_TRP_WARN_SIZE = 15 # small sequences are on a second line +PHE_TRP_WARN_MSG = 'No Phe/Trp-Gly-X-Gly pattern' + +def gap_j(seq): + '''Gap J sequences in order to align the Phe118/Trp118 codon''' + m = j118.search(seq) + + if not m: + if len(seq) > PHE_TRP_WARN_SIZE: + print "# %s in %s" % (PHE_TRP_WARN_MSG, seq) + seq = "# %s\n%s" % (PHE_TRP_WARN_MSG, seq) + return seq + + pos = m.start() + 1 # positions start at 1 + return (MAX_GAP_J - pos) * '.' + seq + + LENGTH_UPSTREAM=40 LENGTH_DOWNSTREAM=40 # Create isolated files for some sequences @@ -186,6 +209,9 @@ for l in sys.stdin: current_special = verbose_open_w(name) + if '>' not in l and current_files and feature == 'J-REGION': + l = gap_j(l) + for current_file in current_files: current_file.write(l) -- GitLab