repseq_vdj.py 7.14 KB
Newer Older
1
'''
2
Parses output of various RepSeq programs.
3 4 5
Takes either:
  - a .fa file, a _Summary.txt file as produced by IMGT/V-QUEST
  - or a results file produced by MiXCR
6 7
and creates a .vdj file to be checked by should-vdj-to-tap.py

8 9 10
python repseq_vdj.py data-curated/curated_IG.fa data-curated/curated_ig_Summary.txt > data-curated/imgt-IG.vdj
python repsep_vdj.py data-curated/curated_TR.fa data-curated/curated_tr_Summary.txt > data-curated/imgt-TR.vdj
python repseq_vdj.py data-curated/mixcr.results > data-curated/mixcr.vdj
11 12 13
'''


14 15
import sys
import os
16

17 18 19
V = 'V'
D = 'D'
J = 'J'
20

21 22 23
N1 = 'N1'
N2 = 'N2'
N = 'N'
24

25
JUNCTION = 'JUNCTION'
26

27

28 29
class VDJ_Formatter():
    '''Stores fields and outputs a .vdj line'''
30

31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
    def genes_to_vdj(self, genes):
        if not genes:
            return ''

        if len(genes) == 1:
            return genes[0]

        return '(%s)' % ','.join(genes)


    def N_to_vdj(self, s):
        return '/%s/' % s

    def CDR3_to_vdj(self, s):
        return '{%s}' % s if s else ''

    def to_vdj(self):
        if not self.result:
            return 'no result'

        s = ''
        s += self.genes_to_vdj(self.vdj[V])
        s += ' '

        if D in self.vdj:
            if N1 in self.vdj:
                s += self.N_to_vdj(self.vdj[N1])
                s += ' '
            s += self.genes_to_vdj(self.vdj[D])
            if N2 in self.vdj:
                s += ' '
                s += self.N_to_vdj(self.vdj[N2])
        else:
            if N in self.vdj:
                s += self.N_to_vdj(self.vdj[N])

        s += ' '
        s += self.genes_to_vdj(self.vdj[J])

        if JUNCTION in self.vdj:
            s += ' '
            s += self.CDR3_to_vdj(self.vdj[JUNCTION])

        return s


class Result(VDJ_Formatter):
78
    '''Stores a tabulated result'''
79 80 81 82

    def __init__(self, l):

        self.d = {}
83
        self.vdj = {}
84
        self.result = self.parse(l)
85

86
        if self.result:
87 88 89
            for i, data in enumerate(self.result.split('\t')):
                self.d[self.labels[i]] = data

90 91
            self.populate()

92 93 94
    def __getitem__(self, key):
        return self.d[key]

95 96 97 98
    def __str__(self):
        return str(self.d)


99 100 101

### MiXCR

102 103 104 105
class MiXCR_Result(Result):

    def parse(self, l):
        self.labels = mixcr_labels
106 107 108 109
        if ('\t' in l.strip()):
            return l
        else:
            return None
110

111 112
    def populate(self):
        self.vdj[V] = [self['Best V hit']]
113
        if self['Best D hit']:
114 115
            self.vdj[D] = [self['Best D hit']]
        self.vdj[J] = [self['Best J hit']]
116

117 118 119
        self.vdj[N1] = self['N. Seq. VDJunction']
        self.vdj[N2] = self['N. Seq. DJJunction']
        self.vdj[N] = self['N. Seq. VJJunction']
120

121
        self.vdj[JUNCTION] = self['AA. Seq. CDR3']
122

123

124 125 126 127 128 129 130 131 132 133 134 135 136 137 138
def header_mixcr_results(ff_mixcr):

    f = open(ff_mixcr).__iter__()

    mixcr_first_line = f.next()
    globals()['mixcr_labels'] = mixcr_first_line.split('\t')

    while True:
        l = f.next()
        result = MiXCR_Result(l)
        yield result['Description R1'], result.to_vdj()



### IMGT/V-QUEST
139 140 141 142 143 144

class IMGT_VQUEST_Result(Result):
    '''Stores a IMGT/V-QUEST result'''

    def parse(self, l):
        self.labels = vquest_labels
145 146 147 148
        if ('No result' not in l):
            return l
        else:
            return None
149

150 151 152 153 154 155 156 157 158 159 160 161 162
    def parse_gene_and_allele(self, s):
        '''
        Parse IMGT/V-QUEST fields such as:
        Homsap IGHV3-30*03 F, or Homsap IGHV3-30*18 F or Homsap IGHV3-30-5*01 F'
        '''

        genes = []
        for term in s.replace(',', '').split():
            if term in ['Homsap', '[F]', '(F)', 'F', 'P', 'or', 'and', '(see', 'comment)', 'ORF', '[ORF]']:
                continue
            genes += [term]
        return genes

163 164 165 166
    def populate(self):
        self.vdj[V] = self.parse_gene_and_allele(self['V-GENE and allele'])
        self.vdj[D] = self.parse_gene_and_allele(self['D-GENE and allele'])
        self.vdj[J] = self.parse_gene_and_allele(self['J-GENE and allele'])
167

168
        self.vdj[JUNCTION] = self['AA JUNCTION']
169

170

171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191
def header_vquest_results(ff_fasta, ff_vquest):
    f_fasta = open(ff_fasta).__iter__()
    f_vquest = open(ff_vquest).__iter__()

    vquest_first_line = f_vquest.next()
    globals()['vquest_labels'] = vquest_first_line.split('\t')

    # print vquest_labels

    while True:

        fasta = ''
        # Advance until header line
        while not '>' in fasta:
            fasta = f_fasta.next().strip()

        vquest = ''
        # Advance until non-empty line
        while not vquest:
            vquest = f_vquest.next().strip()

Mathieu Giraud's avatar
Mathieu Giraud committed
192
        r = IMGT_VQUEST_Result(vquest)
193
        yield (fasta.replace('>', ''), r.to_vdj())
194

195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244


### Vidjil

VIDJIL_FINE = '{directory}/vidjil -X 100 -# "#" -c segment -i -3 -d -g {directory}/germline %s >> %s'
VIDJIL_KMER = '{directory}/vidjil -w 20 -# "#" -b out -c windows -uuuU -2 -i -g {directory}/germline %s > /dev/null ; cat out/out.segmented.vdj.fa out/out.unsegmented.vdj.fa >> %s'

def should_results_from_vidjil_output(f_log):
    '''
    Yield (should, result) couples from a Vidjil output including lines in the form of:
    >TRDD2*01_1/AGG/1_TRDD3*01__TRD+ + VJ 	0 84 88 187	TRDD2*01 1/AGG/1 TRDD3*01  TRD+
    or
    >TRDV3*01_0//0_TRDJ4*01 ! + VJ	0 49 50 97       TRD UNSEG noisy
    '''

    for l in open(f_log):
        if not l:
            continue
        if l[0] == '>':
            l = l.strip()
            pos = l.find(' + ') if ' + ' in l else l.find(' - ')
            should = l[1:pos].replace('_', ' ')

            pos = l.find('\t')
            result = l[pos+1:] + ' '

            yield (should, result)

def should_results_from_vidjil(program, f_should, f_log):
    '''
    Launch the program on f_should
    Yields (#, >) couples of V(D)J designations, such as in:
    #TRDD2*01 1/AGG/1 TRDD3*01  TRD+
    >TRDD2*01  TRDD3*01
    '''

    cmd = program % (f_should, f_log)

    with open(f_log, 'w') as f:
        f.write('# %s\n\n' % cmd)

    exit_code = os.system(cmd)
    if exit_code:
        print "Error. The program halted with exit code %s." % exit_code
        sys.exit(3)

    return should_results_from_vidjil_output(f_log)



245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288

### VDJ File

class VDJ_File():
    '''
    Handle .vdj files
    These files contain (#, >) couples of V(D)J designations, such as in:
    #TRDD2*01 1/AGG/1 TRDD3*01  TRD+
    >TRDD2*01  TRDD3*01
    '''

    def __init__(self):
        self.hr = []

    def __iter__(self):
        return self.hr.__iter__()

    def __len__(self):
        return len(self.hr)

    def write(self, f):
        for (header, result) in self.hr:
            f.write("#%s\n" % header)
            f.write(">%s\n" % result)
            f.write("\n")

    def parse_from_gen(self, gen):
        self.hr = list(gen)

    def parse_from_file(self, f):
        should = ''
        for l in f:

            l = l.strip()
            if not l:
                continue

            if l[0] == '#':
                should = l[1:]

            elif l[0] == '>':
                self.hr += [ (should, l[1:]) ]


289 290
### Main

291 292
if __name__ == '__main__':

293 294
    vdj = VDJ_File()

295
    if 'mixcr' in sys.argv[1]:
296
        vdj.parse_from_gen(header_mixcr_results(sys.argv[1]))
297
    else:
298
        vdj.parse_from_gen(header_vquest_results(sys.argv[1], sys.argv[2]))
299

300
    vdj.write(sys.stdout)