repseq_vdj.py 4.08 KB
Newer Older
1
'''
2
Parses output of various RepSeq programs.
3 4 5
Takes either:
  - a .fa file, a _Summary.txt file as produced by IMGT/V-QUEST
  - or a results file produced by MiXCR
6 7
and creates a .vdj file to be checked by should-vdj-to-tap.py

8 9 10
python repseq_vdj.py data-curated/curated_IG.fa data-curated/curated_ig_Summary.txt > data-curated/imgt-IG.vdj
python repsep_vdj.py data-curated/curated_TR.fa data-curated/curated_tr_Summary.txt > data-curated/imgt-TR.vdj
python repseq_vdj.py data-curated/mixcr.results > data-curated/mixcr.vdj
11 12 13 14
'''

import sys

15
def genes_to_vdj(genes):
16 17 18 19 20 21 22 23 24
    if not genes:
        return ''

    if len(genes) == 1:
        return genes[0]

    return '(%s)' % ','.join(genes)


25 26 27
def N_to_vdj(s):
    return '/%s/' % s

28 29
def CDR3_to_vdj(s):
    return '{%s}' % s if s else ''
30

31 32
class Result():
    '''Stores a tabulated result'''
33 34 35 36

    def __init__(self, l):

        self.d = {}
37
        self.result = self.parse(l)
38 39

        for i, data in enumerate(l.split('\t')):
40
            self.d[self.labels[i]] = data
41 42 43 44

    def __getitem__(self, key):
        return self.d[key]

45 46 47 48
    def __str__(self):
        return str(self.d)


49 50 51 52 53 54 55 56 57 58 59 60
class MiXCR_Result(Result):

    def parse(self, l):
        self.labels = mixcr_labels
        return ('\t' in l.strip())

    def to_vdj(self):

        if not self.result:
            return 'no result'

        s = ''
61
        s += genes_to_vdj([self['Best V hit']])
62 63 64 65 66
        s += ' '

        if self['Best D hit']:
            s += N_to_vdj(self['N. Seq. VDJunction'])
            s += ' '
67
            s += genes_to_vdj([self['Best D hit']])
68 69 70 71 72 73
            s += ' '
            s += N_to_vdj(self['N. Seq. DJJunction'])
        else:
            s += N_to_vdj(self['N. Seq. VJJunction'])

        s += ' '
74
        s += genes_to_vdj([self['Best J hit']])
75

76 77 78
        s += ' '
        s += CDR3_to_vdj(self['AA. Seq. CDR3'])

79 80
        return s

81 82 83 84 85 86 87 88

class IMGT_VQUEST_Result(Result):
    '''Stores a IMGT/V-QUEST result'''

    def parse(self, l):
        self.labels = vquest_labels
        return ('No result' not in l)

89 90 91 92 93 94 95 96 97 98 99 100 101
    def parse_gene_and_allele(self, s):
        '''
        Parse IMGT/V-QUEST fields such as:
        Homsap IGHV3-30*03 F, or Homsap IGHV3-30*18 F or Homsap IGHV3-30-5*01 F'
        '''

        genes = []
        for term in s.replace(',', '').split():
            if term in ['Homsap', '[F]', '(F)', 'F', 'P', 'or', 'and', '(see', 'comment)', 'ORF', '[ORF]']:
                continue
            genes += [term]
        return genes

102 103 104 105 106 107
    def to_vdj(self):

        if not self.result:
            return 'no result'

        s = ''
108
        s += genes_to_vdj(self.parse_gene_and_allele(self['V-GENE and allele']))
109
        s += ' '
110
        s += genes_to_vdj(self.parse_gene_and_allele(self['D-GENE and allele']))
111
        s += ' '
112
        s += genes_to_vdj(self.parse_gene_and_allele(self['J-GENE and allele']))
113 114 115 116

        s += ' '
        s += CDR3_to_vdj(self['AA JUNCTION'])

117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141
        return s



def header_vquest_results(ff_fasta, ff_vquest):
    f_fasta = open(ff_fasta).__iter__()
    f_vquest = open(ff_vquest).__iter__()

    vquest_first_line = f_vquest.next()
    globals()['vquest_labels'] = vquest_first_line.split('\t')

    # print vquest_labels

    while True:

        fasta = ''
        # Advance until header line
        while not '>' in fasta:
            fasta = f_fasta.next().strip()

        vquest = ''
        # Advance until non-empty line
        while not vquest:
            vquest = f_vquest.next().strip()

Mathieu Giraud's avatar
Mathieu Giraud committed
142
        r = IMGT_VQUEST_Result(vquest)
143
        yield (fasta.replace('>', ''), r.to_vdj())
144 145


146 147 148 149 150 151 152 153 154 155 156 157 158
def header_mixcr_results(ff_mixcr):

    f = open(ff_mixcr).__iter__()

    mixcr_first_line = f.next()
    globals()['mixcr_labels'] = mixcr_first_line.split('\t')

    while True:
        l = f.next()
        result = MiXCR_Result(l)
        yield result['Description R1'], result.to_vdj()


159 160
if __name__ == '__main__':

161
    if 'mixcr' in sys.argv[1]:
162 163 164 165 166 167 168 169
        gen = header_mixcr_results(sys.argv[1])
    else:
        gen = header_vquest_results(sys.argv[1], sys.argv[2])

    # output .vdj data
    for (header, result) in gen:
        print "#%s" % header
        print ">%s" % result
170
        print