'''Rough conversion from .gb to .should-vdj.fa'''

# Usage examples:
# python gb-to-should.py -t "[TRB+]" *Db*.gb
# python gb-to-should.py -t "[TRA+D]" *29*.gb

import sys
import argparse

# Command-line interface: one or more .gb files, plus an optional tag that is
# appended to the header line written for each file.
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('--tag', '-t', default='', help='tag to add at the end of the header')
parser.add_argument('file', nargs='+', help='''.gb files''')

args = parser.parse_args()

def parse_gb(stream):
    '''Collect 'TR' labels and sequence chunks from the lines of a .gb file.

    stream: any iterable of text lines (an open file, a list of strings, ...).

    Returns a (labels, seqs) tuple of lists:
      - labels: the text after the first '=' of every '/label' line found
        after the FEATURES line, kept only when it contains 'TR';
      - seqs: one string per non-empty line found between the ORIGIN line
        and the '//' line, with the first field and all whitespace removed.
    '''

    # 0: before FEATURES, 1: after FEATURES, 2: after ORIGIN, 3: after '//'
    phase = 0
    labels = []
    seqs = []

    for l in stream:

        l = l.strip()

        if l.startswith("FEATURES"):
            phase = 1
            continue

        if l == "ORIGIN":
            phase = 2
            continue

        if l == "//":
            phase = 3
            continue

        if phase == 1 and l.startswith('/label'):
            # partition() keeps any further '=' inside the label and yields ''
            # (then filtered out below) when the line has no '=' at all.
            label = l.partition('=')[2]
            if 'TR' in label:
                labels.append(label)
            continue

        if phase == 2:
            # Drop the first field (the position counter) and glue the rest.
            seq = ''.join(l.split()[1:])
            # Blank or counter-only lines would otherwise emit empty lines
            # in the middle of the output sequence.
            if seq:
                seqs.append(seq)

    return labels, seqs


def output_should_vdj(f, labels, seqs):
    '''Write one record to f: a '>' header line, the sequence, a blank line.

    f: object with a write() method (e.g. sys.stdout).
    labels: strings joined with single spaces to form the header.
    seqs: sequence chunks, written one per line.
    '''
    f.write('>%s\n' % ' '.join(labels))
    f.write('%s\n\n' % '\n'.join(seqs))


# Convert every input file in turn, writing the result to standard output.
for f in args.file:
    # Close each input as soon as it has been parsed instead of leaking it.
    with open(f) as stream:
        labels, seqs = parse_gb(stream)

    # The optional tag becomes the last item of the header line.
    if args.tag:
        labels += [args.tag]

    # '#<filename>' line identifying the source of the record that follows.
    sys.stdout.write('#%s\n' % f)
    output_should_vdj(sys.stdout, labels, seqs)