imgt-to-vdj.py 2.56 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
'''
Takes a .fa file, a _Summary.txt file as produced by IMGT/V-QUEST,
and creates a .vdj file to be checked by should-vdj-to-tap.py

python imgt-to-vdj.py data-curated/curated_IG.fa data-curated/curated_ig_Summary.txt > data-curated/curated_IG.vdj
python imgt-to-vdj.py data-curated/curated_TR.fa data-curated/curated_tr_Summary.txt > data-curated/curated_TR.vdj
'''

import sys


def parse_gene_and_allele_to_vdj(s):
    '''
    Parse lines such as:
    Homsap IGHV3-30*03 F, or Homsap IGHV3-30*18 F or Homsap IGHV3-30-5*01 F'
    and produce a .vdj line
    '''

    genes = []
    for term in s.replace(',', '').split():
21
        if term in ['Homsap', '[F]', '(F)', 'F', 'P', 'or', 'and', '(see', 'comment)', 'ORF', '[ORF]']:
22 23 24 25 26 27 28 29 30 31 32 33
            continue
        genes += [term]

    if not genes:
        return ''

    if len(genes) == 1:
        return genes[0]

    return '(%s)' % ','.join(genes)


34 35
class Result():
    '''Stores a tabulated result'''
36 37 38 39

    def __init__(self, l):

        self.d = {}
40
        self.result = self.parse(l)
41 42

        for i, data in enumerate(l.split('\t')):
43
            self.d[self.labels[i]] = data
44 45 46 47

    def __getitem__(self, key):
        return self.d[key]

48 49 50 51 52 53 54 55 56 57 58 59
    def __str__(self):
        return str(self.d)



class IMGT_VQUEST_Result(Result):
    '''Stores a IMGT/V-QUEST result'''

    def parse(self, l):
        self.labels = vquest_labels
        return ('No result' not in l)

60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107
    def to_vdj(self):

        if not self.result:
            return 'no result'

        s = ''
        s += parse_gene_and_allele_to_vdj(self['V-GENE and allele'])
        s += ' '
        s += parse_gene_and_allele_to_vdj(self['D-GENE and allele'])
        s += ' '
        s += parse_gene_and_allele_to_vdj(self['J-GENE and allele'])
        return s



def header_vquest_results(ff_fasta, ff_vquest):
    f_fasta = open(ff_fasta).__iter__()
    f_vquest = open(ff_vquest).__iter__()

    vquest_first_line = f_vquest.next()
    globals()['vquest_labels'] = vquest_first_line.split('\t')

    # print vquest_labels

    while True:

        fasta = ''
        # Advance until header line
        while not '>' in fasta:
            fasta = f_fasta.next().strip()

        vquest = ''
        # Advance until non-empty line
        while not vquest:
            vquest = f_vquest.next().strip()

        yield (fasta, vquest)


if __name__ == '__main__':

    for (header, result) in header_vquest_results(sys.argv[1], sys.argv[2]):
        # print "=========="
        print header.replace('>', '#')
        r = IMGT_VQUEST_Result(result)
        # print r
        print ">%s" % r.to_vdj()
        print