diff --git a/germline/split-from-imgt.py b/germline/split-from-imgt.py index 4d4c64d88fe4ffe684cda476855496cf4759a103..3efb96637ea5b3e06007268bbe0ae38c702279ad 100644 --- a/germline/split-from-imgt.py +++ b/germline/split-from-imgt.py @@ -106,6 +106,29 @@ def retrieve_genes(filename, genes, additional_length): file.write(get_gene_sequence(gene, coord['imgt_name'], start, end)) +# Phe +# TrpGly Gly +j118 = re.compile('t..gg....gg.') + + +MAX_GAP_J = 36 # maximal position of Phe/Trp (36 for TRAJ52*01) +PHE_TRP_WARN_SIZE = 15 # small sequences are on a second line +PHE_TRP_WARN_MSG = 'No Phe/Trp-Gly-X-Gly pattern' + +def gap_j(seq): + '''Gap J sequences in order to align the Phe118/Trp118 codon''' + m = j118.search(seq) + + if not m: + if len(seq) > PHE_TRP_WARN_SIZE: + print "# %s in %s" % (PHE_TRP_WARN_MSG, seq) + seq = "# %s\n%s" % (PHE_TRP_WARN_MSG, seq) + return seq + + pos = m.start() + 1 # positions start at 1 + return (MAX_GAP_J - pos) * '.' + seq + + LENGTH_UPSTREAM=40 LENGTH_DOWNSTREAM=40 # Create isolated files for some sequences @@ -186,6 +209,9 @@ for l in sys.stdin: current_special = verbose_open_w(name) + if '>' not in l and current_files and feature == 'J-REGION': + l = gap_j(l) + for current_file in current_files: current_file.write(l)