repseq_vdj.py 4.71 KB
Newer Older
1
'''
2
Parses output of various RepSeq programs.
3 4 5
Takes either:
  - a .fa file, a _Summary.txt file as produced by IMGT/V-QUEST
  - or a results file produced by MiXCR
6 7
and creates a .vdj file to be checked by should-vdj-to-tap.py

8 9 10
python repseq_vdj.py data-curated/curated_IG.fa data-curated/curated_ig_Summary.txt > data-curated/imgt-IG.vdj
python repsep_vdj.py data-curated/curated_TR.fa data-curated/curated_tr_Summary.txt > data-curated/imgt-TR.vdj
python repseq_vdj.py data-curated/mixcr.results > data-curated/mixcr.vdj
11 12 13 14 15
'''

import sys


16 17 18
V = 'V'
D = 'D'
J = 'J'
19

20 21 22
N1 = 'N1'
N2 = 'N2'
N = 'N'
23

24
JUNCTION = 'JUNCTION'
25

26

27 28
class VDJ_Formatter():
    '''Stores fields and outputs a .vdj line'''
29

30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
    def genes_to_vdj(self, genes):
        if not genes:
            return ''

        if len(genes) == 1:
            return genes[0]

        return '(%s)' % ','.join(genes)


    def N_to_vdj(self, s):
        return '/%s/' % s

    def CDR3_to_vdj(self, s):
        return '{%s}' % s if s else ''

    def to_vdj(self):
        if not self.result:
            return 'no result'

        s = ''
        s += self.genes_to_vdj(self.vdj[V])
        s += ' '

        if D in self.vdj:
            if N1 in self.vdj:
                s += self.N_to_vdj(self.vdj[N1])
                s += ' '
            s += self.genes_to_vdj(self.vdj[D])
            if N2 in self.vdj:
                s += ' '
                s += self.N_to_vdj(self.vdj[N2])
        else:
            if N in self.vdj:
                s += self.N_to_vdj(self.vdj[N])

        s += ' '
        s += self.genes_to_vdj(self.vdj[J])

        if JUNCTION in self.vdj:
            s += ' '
            s += self.CDR3_to_vdj(self.vdj[JUNCTION])

        return s


class Result(VDJ_Formatter):
77
    '''Stores a tabulated result'''
78 79 80 81

    def __init__(self, l):

        self.d = {}
82
        self.vdj = {}
83
        self.result = self.parse(l)
84 85

        for i, data in enumerate(l.split('\t')):
86
            self.d[self.labels[i]] = data
87

88 89 90
        if self.result:
            self.populate()

91 92 93
    def __getitem__(self, key):
        return self.d[key]

94 95 96 97
    def __str__(self):
        return str(self.d)


98 99 100 101 102 103
class MiXCR_Result(Result):

    def parse(self, l):
        self.labels = mixcr_labels
        return ('\t' in l.strip())

104 105
    def populate(self):
        self.vdj[V] = [self['Best V hit']]
106
        if self['Best D hit']:
107 108
            self.vdj[D] = [self['Best D hit']]
        self.vdj[J] = [self['Best J hit']]
109

110 111 112
        self.vdj[N1] = self['N. Seq. VDJunction']
        self.vdj[N2] = self['N. Seq. DJJunction']
        self.vdj[N] = self['N. Seq. VJJunction']
113

114
        self.vdj[JUNCTION] = self['AA. Seq. CDR3']
115

116

117 118 119 120 121 122 123 124

class IMGT_VQUEST_Result(Result):
    '''Stores a IMGT/V-QUEST result'''

    def parse(self, l):
        self.labels = vquest_labels
        return ('No result' not in l)

125 126 127 128 129 130 131 132 133 134 135 136 137
    def parse_gene_and_allele(self, s):
        '''
        Parse IMGT/V-QUEST fields such as:
        Homsap IGHV3-30*03 F, or Homsap IGHV3-30*18 F or Homsap IGHV3-30-5*01 F'
        '''

        genes = []
        for term in s.replace(',', '').split():
            if term in ['Homsap', '[F]', '(F)', 'F', 'P', 'or', 'and', '(see', 'comment)', 'ORF', '[ORF]']:
                continue
            genes += [term]
        return genes

138 139 140 141
    def populate(self):
        self.vdj[V] = self.parse_gene_and_allele(self['V-GENE and allele'])
        self.vdj[D] = self.parse_gene_and_allele(self['D-GENE and allele'])
        self.vdj[J] = self.parse_gene_and_allele(self['J-GENE and allele'])
142

143
        self.vdj[JUNCTION] = self['AA JUNCTION']
144

145

146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169



def header_vquest_results(ff_fasta, ff_vquest):
    f_fasta = open(ff_fasta).__iter__()
    f_vquest = open(ff_vquest).__iter__()

    vquest_first_line = f_vquest.next()
    globals()['vquest_labels'] = vquest_first_line.split('\t')

    # print vquest_labels

    while True:

        fasta = ''
        # Advance until header line
        while not '>' in fasta:
            fasta = f_fasta.next().strip()

        vquest = ''
        # Advance until non-empty line
        while not vquest:
            vquest = f_vquest.next().strip()

Mathieu Giraud's avatar
Mathieu Giraud committed
170
        r = IMGT_VQUEST_Result(vquest)
171
        yield (fasta.replace('>', ''), r.to_vdj())
172 173


174 175 176 177 178 179 180 181 182 183 184 185 186
def header_mixcr_results(ff_mixcr):

    f = open(ff_mixcr).__iter__()

    mixcr_first_line = f.next()
    globals()['mixcr_labels'] = mixcr_first_line.split('\t')

    while True:
        l = f.next()
        result = MiXCR_Result(l)
        yield result['Description R1'], result.to_vdj()


187 188
if __name__ == '__main__':

189
    if 'mixcr' in sys.argv[1]:
190 191 192 193 194 195 196 197
        gen = header_mixcr_results(sys.argv[1])
    else:
        gen = header_vquest_results(sys.argv[1], sys.argv[2])

    # output .vdj data
    for (header, result) in gen:
        print "#%s" % header
        print ">%s" % result
198
        print