repseq_vdj.py 4.73 KB
Newer Older
1
'''
2
Parses output of various RepSeq programs.
3 4 5
Takes either:
  - a .fa file, a _Summary.txt file as produced by IMGT/V-QUEST
  - or a results file produced by MiXCR
6 7
and creates a .vdj file to be checked by should-vdj-to-tap.py

8 9 10
python repseq_vdj.py data-curated/curated_IG.fa data-curated/curated_ig_Summary.txt > data-curated/imgt-IG.vdj
python repsep_vdj.py data-curated/curated_TR.fa data-curated/curated_tr_Summary.txt > data-curated/imgt-TR.vdj
python repseq_vdj.py data-curated/mixcr.results > data-curated/mixcr.vdj
11 12 13 14 15
'''

import sys


16 17 18
V = 'V'
D = 'D'
J = 'J'
19

20 21 22
N1 = 'N1'
N2 = 'N2'
N = 'N'
23

24
JUNCTION = 'JUNCTION'
25

26

27 28
class VDJ_Formatter():
    '''Stores fields and outputs a .vdj line'''
29

30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
    def genes_to_vdj(self, genes):
        if not genes:
            return ''

        if len(genes) == 1:
            return genes[0]

        return '(%s)' % ','.join(genes)


    def N_to_vdj(self, s):
        return '/%s/' % s

    def CDR3_to_vdj(self, s):
        return '{%s}' % s if s else ''

    def to_vdj(self):
        if not self.result:
            return 'no result'

        s = ''
        s += self.genes_to_vdj(self.vdj[V])
        s += ' '

        if D in self.vdj:
            if N1 in self.vdj:
                s += self.N_to_vdj(self.vdj[N1])
                s += ' '
            s += self.genes_to_vdj(self.vdj[D])
            if N2 in self.vdj:
                s += ' '
                s += self.N_to_vdj(self.vdj[N2])
        else:
            if N in self.vdj:
                s += self.N_to_vdj(self.vdj[N])

        s += ' '
        s += self.genes_to_vdj(self.vdj[J])

        if JUNCTION in self.vdj:
            s += ' '
            s += self.CDR3_to_vdj(self.vdj[JUNCTION])

        return s


class Result(VDJ_Formatter):
77
    '''Stores a tabulated result'''
78 79 80 81

    def __init__(self, l):

        self.d = {}
82
        self.vdj = {}
83
        self.result = self.parse(l)
84 85

        for i, data in enumerate(l.split('\t')):
86
            self.d[self.labels[i]] = data
87

88 89 90
        if self.result:
            self.populate()

91 92 93
    def __getitem__(self, key):
        return self.d[key]

94 95 96 97
    def __str__(self):
        return str(self.d)


98 99 100

### MiXCR

101 102 103 104 105 106
class MiXCR_Result(Result):

    def parse(self, l):
        self.labels = mixcr_labels
        return ('\t' in l.strip())

107 108
    def populate(self):
        self.vdj[V] = [self['Best V hit']]
109
        if self['Best D hit']:
110 111
            self.vdj[D] = [self['Best D hit']]
        self.vdj[J] = [self['Best J hit']]
112

113 114 115
        self.vdj[N1] = self['N. Seq. VDJunction']
        self.vdj[N2] = self['N. Seq. DJJunction']
        self.vdj[N] = self['N. Seq. VJJunction']
116

117
        self.vdj[JUNCTION] = self['AA. Seq. CDR3']
118

119

120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
def header_mixcr_results(ff_mixcr):

    f = open(ff_mixcr).__iter__()

    mixcr_first_line = f.next()
    globals()['mixcr_labels'] = mixcr_first_line.split('\t')

    while True:
        l = f.next()
        result = MiXCR_Result(l)
        yield result['Description R1'], result.to_vdj()



### IMGT/V-QUEST
135 136 137 138 139 140 141 142

class IMGT_VQUEST_Result(Result):
    '''Stores a IMGT/V-QUEST result'''

    def parse(self, l):
        self.labels = vquest_labels
        return ('No result' not in l)

143 144 145 146 147 148 149 150 151 152 153 154 155
    def parse_gene_and_allele(self, s):
        '''
        Parse IMGT/V-QUEST fields such as:
        Homsap IGHV3-30*03 F, or Homsap IGHV3-30*18 F or Homsap IGHV3-30-5*01 F'
        '''

        genes = []
        for term in s.replace(',', '').split():
            if term in ['Homsap', '[F]', '(F)', 'F', 'P', 'or', 'and', '(see', 'comment)', 'ORF', '[ORF]']:
                continue
            genes += [term]
        return genes

156 157 158 159
    def populate(self):
        self.vdj[V] = self.parse_gene_and_allele(self['V-GENE and allele'])
        self.vdj[D] = self.parse_gene_and_allele(self['D-GENE and allele'])
        self.vdj[J] = self.parse_gene_and_allele(self['J-GENE and allele'])
160

161
        self.vdj[JUNCTION] = self['AA JUNCTION']
162

163

164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184
def header_vquest_results(ff_fasta, ff_vquest):
    f_fasta = open(ff_fasta).__iter__()
    f_vquest = open(ff_vquest).__iter__()

    vquest_first_line = f_vquest.next()
    globals()['vquest_labels'] = vquest_first_line.split('\t')

    # print vquest_labels

    while True:

        fasta = ''
        # Advance until header line
        while not '>' in fasta:
            fasta = f_fasta.next().strip()

        vquest = ''
        # Advance until non-empty line
        while not vquest:
            vquest = f_vquest.next().strip()

Mathieu Giraud's avatar
Mathieu Giraud committed
185
        r = IMGT_VQUEST_Result(vquest)
186
        yield (fasta.replace('>', ''), r.to_vdj())
187 188 189

if __name__ == '__main__':

190
    if 'mixcr' in sys.argv[1]:
191 192 193 194 195 196 197 198
        gen = header_mixcr_results(sys.argv[1])
    else:
        gen = header_vquest_results(sys.argv[1], sys.argv[2])

    # output .vdj data
    for (header, result) in gen:
        print "#%s" % header
        print ">%s" % result
199
        print