repseq_vdj.py 10.4 KB
Newer Older
1
'''
2
Parses output of various RepSeq programs.
3 4
Takes either:
  - a .fa file, a _Summary.txt file as produced by IMGT/V-QUEST
5
  - or a results file produced by MiXCR or IgReC
6 7
and creates a .vdj file to be checked by should-vdj-to-tap.py

8 9 10
python repseq_vdj.py data-curated/curated_IG.fa data-curated/curated_ig_Summary.txt > data-curated/imgt-IG.vdj
python repsep_vdj.py data-curated/curated_TR.fa data-curated/curated_tr_Summary.txt > data-curated/imgt-TR.vdj
python repseq_vdj.py data-curated/mixcr.results > data-curated/mixcr.vdj
11
python repseq_vdj.py bla.igrec.results
12 13
python repseq_vdj.py data-curated/curated_IG.fa data-curated/igblast/IG/*.aln > data-curated/igblast-IG.vdj > data-curated/igblast-IG.vdj
python repseq_vdj.py data-curated/curated_TR.fa data-curated/igblast/TR/*.aln > data-curated/igblast-TR.vdj > data-curated/igblast-TR.vdj
14 15 16
'''


17 18
import sys
import os
19

20 21 22
V = 'V'
D = 'D'
J = 'J'
23

24 25 26
N1 = 'N1'
N2 = 'N2'
N = 'N'
27

28
JUNCTION = 'JUNCTION'
29

30

31 32
class VDJ_Formatter():
    '''Stores fields and outputs a .vdj line'''
33

34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
    def genes_to_vdj(self, genes):
        if not genes:
            return ''

        if len(genes) == 1:
            return genes[0]

        return '(%s)' % ','.join(genes)


    def N_to_vdj(self, s):
        return '/%s/' % s

    def CDR3_to_vdj(self, s):
        return '{%s}' % s if s else ''

    def to_vdj(self):
        if not self.result:
            return 'no result'

        s = ''
        s += self.genes_to_vdj(self.vdj[V])
        s += ' '

        if D in self.vdj:
            if N1 in self.vdj:
                s += self.N_to_vdj(self.vdj[N1])
                s += ' '
            s += self.genes_to_vdj(self.vdj[D])
            if N2 in self.vdj:
                s += ' '
                s += self.N_to_vdj(self.vdj[N2])
        else:
            if N in self.vdj:
                s += self.N_to_vdj(self.vdj[N])

        s += ' '
        s += self.genes_to_vdj(self.vdj[J])

        if JUNCTION in self.vdj:
            s += ' '
            s += self.CDR3_to_vdj(self.vdj[JUNCTION])

        return s


class Result(VDJ_Formatter):
81
    '''Stores a tabulated result'''
82 83 84 85

    def __init__(self, l):

        self.d = {}
86
        self.vdj = {}
87
        self.result = self.parse(l)
88

89
        if self.result:
90 91 92
            for i, data in enumerate(self.result.split('\t')):
                self.d[self.labels[i]] = data

93 94
            self.populate()

95 96 97
    def __contains__ (self, key):
        return key in self.d
    
98 99 100
    def __getitem__(self, key):
        return self.d[key]

101 102 103 104
    def __str__(self):
        return str(self.d)


105 106 107
### IgReC

IGREC_LABELS = [
108
  'Read id', 'locus',
109 110 111 112 113 114 115
  'V id', 'V start', 'V end', 'V score',
  'J id', 'J start', 'J end', 'J score',
]

class IgReC_Result(Result):

    r'''
116
    >>> lig = '\t'.join(['blabli4577', 'TRB', 'TRBV13*02', '1', '164', '0.58156', 'TRBJ1-5*01', '319', '367', '0.94'])
117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
    >>> r = IgReC_Result(lig)

    >>> r['Read id']
    'blabli4577'
    >>> r.vdj[V]
    ['TRBV13*02']
    >>> r.vdj[J]
    ['TRBJ1-5*01']
    '''

    def parse(self, l):
        self.labels = IGREC_LABELS
        if ('\t' in l.strip()):
            return l
        else:
            return None

    def populate(self):
        self.vdj[V] = [self['V id']]
        self.vdj[J] = [self['J id']]


def header_igrec_results(ff_igrec):

    f = open(ff_igrec).__iter__()

    while True:
        l = f.next()
        result = IgReC_Result(l)
146
        yield result['Read id'].replace('_', ' '), result.to_vdj()
147

148 149 150

### MiXCR

151 152 153 154
class MiXCR_Result(Result):

    def parse(self, l):
        self.labels = mixcr_labels
155 156 157 158
        if ('\t' in l.strip()):
            return l
        else:
            return None
159

160
    def populate(self):
161 162 163 164
        self.vdj[V] = [self['bestVHit']]
        if self['bestDHit']:
            self.vdj[D] = [self['bestDHit']]
        self.vdj[J] = [self['bestJHit']]
165

166 167 168 169 170 171
        if 'nSeqVDJunction' in self:
            self.vdj[N1] = self['nSeqVDJunction']
        if 'nSeqDJJunction' in self:
            self.vdj[N2] = self['nSeqDJJunction']
        if 'nSeqVJJunction' in self:
            self.vdj[N] = self['nSeqVJJunction']
172

173 174
        if 'aaSeqCDR3' in self:
            self.vdj[JUNCTION] = self['aaSeqCDR3']
175

176

177 178 179 180 181
def header_mixcr_results(ff_mixcr):

    f = open(ff_mixcr).__iter__()

    mixcr_first_line = f.next()
182
    globals()['mixcr_labels'] = mixcr_first_line.rstrip().split('\t')
183 184

    while True:
185
        l = f.next().rstrip()
186
        result = MiXCR_Result(l)
187
        yield result['descrsR1'], result.to_vdj()
188 189 190 191



### IMGT/V-QUEST
192 193 194 195 196 197

class IMGT_VQUEST_Result(Result):
    '''Stores a IMGT/V-QUEST result'''

    def parse(self, l):
        self.labels = vquest_labels
198 199 200 201
        if ('No result' not in l):
            return l
        else:
            return None
202

203 204 205 206 207 208 209 210 211 212 213 214 215
    def parse_gene_and_allele(self, s):
        '''
        Parse IMGT/V-QUEST fields such as:
        Homsap IGHV3-30*03 F, or Homsap IGHV3-30*18 F or Homsap IGHV3-30-5*01 F'
        '''

        genes = []
        for term in s.replace(',', '').split():
            if term in ['Homsap', '[F]', '(F)', 'F', 'P', 'or', 'and', '(see', 'comment)', 'ORF', '[ORF]']:
                continue
            genes += [term]
        return genes

216 217 218 219
    def populate(self):
        self.vdj[V] = self.parse_gene_and_allele(self['V-GENE and allele'])
        self.vdj[D] = self.parse_gene_and_allele(self['D-GENE and allele'])
        self.vdj[J] = self.parse_gene_and_allele(self['J-GENE and allele'])
220

221
        self.vdj[JUNCTION] = self['AA JUNCTION']
222

223

224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244
def header_vquest_results(ff_fasta, ff_vquest):
    f_fasta = open(ff_fasta).__iter__()
    f_vquest = open(ff_vquest).__iter__()

    vquest_first_line = f_vquest.next()
    globals()['vquest_labels'] = vquest_first_line.split('\t')

    # print vquest_labels

    while True:

        fasta = ''
        # Advance until header line
        while not '>' in fasta:
            fasta = f_fasta.next().strip()

        vquest = ''
        # Advance until non-empty line
        while not vquest:
            vquest = f_vquest.next().strip()

Mathieu Giraud's avatar
Mathieu Giraud committed
245
        r = IMGT_VQUEST_Result(vquest)
246
        yield (fasta.replace('>', ''), r.to_vdj())
247

248

249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299
### igBlast

igblast_labels = [V, D, J, "Chain", None, None, None, None, None, None]
igblast_VJ_labels = [V, J, "Chain", None, None, None, None, None, None]

class igBlast_Result(Result):
    '''Stores a igBlast result (.aln)'''

    def parse(self, ll):
        self.labels = igblast_labels

        go = False
        for l in ll:
            if "V-(D)-J rearrangement summary" in l:
                if "Top V gene match, Top J gene match" in l:
                    self.labels = igblast_VJ_labels
                go = True
                continue
            if go:
                return l

        return None

    def parse_gene_and_allele(self, s):
        if s == 'N/A':
            return []
        return s.split(',')

    def populate(self):
        self.vdj[V] = self.parse_gene_and_allele(self[V])
        if D in self.d:
            self.vdj[D] = self.parse_gene_and_allele(self[D])
        self.vdj[J] = self.parse_gene_and_allele(self[J])



def header_igblast_results(ff_fasta, ff_igblast):

    f_fasta = open(ff_fasta).__iter__()

    for f in ff_igblast:
        fasta = ''
        # Advance until header line
        while not '>' in fasta:
            fasta = f_fasta.next().strip()

        igblast = open(f).readlines()

        r = igBlast_Result(igblast)
        yield (fasta.replace('>', ''), r.to_vdj())

300 301 302

### Vidjil

303

304
VIDJIL_FINE = '{directory}/vidjil-algo --header-sep "#" -c designations -2 -3 -d -g {directory}/germline/homo-sapiens.g %s >> %s'
305
VIDJIL_KMER = '{directory}/vidjil-algo -w 20 --header-sep "#" -b out -c windows -uuuU -2 -g {directory}/germline/homo-sapiens.g %s > /dev/null ; cat out/out.segmented.vdj.fa out/out.unsegmented.vdj.fa >> %s'
306 307 308

def should_results_from_vidjil_output(f_log):
    '''
309
    Yield (should, result) couples from a Vidjil-algo output including lines in the form of:
310 311 312 313 314 315 316 317 318 319 320
    >TRDD2*01_1/AGG/1_TRDD3*01__TRD+ + VJ 	0 84 88 187	TRDD2*01 1/AGG/1 TRDD3*01  TRD+
    or
    >TRDV3*01_0//0_TRDJ4*01 ! + VJ	0 49 50 97       TRD UNSEG noisy
    '''

    for l in open(f_log):
        if not l:
            continue
        if l[0] == '>':
            l = l.strip()
            pos = l.find(' + ') if ' + ' in l else l.find(' - ')
321 322 323 324
            if pos == -1:
                pos = l.find(' ! ')
            if pos == -1:
                raise ValueError("No [+-!] in the line: {}".format(l))
325 326 327
            should = l[1:pos].replace('_', ' ')

            pos = l.find('\t')
328 329
            if pos == -1:
                raise ValueError("I expected a tabulation to separate the sequence name from the remainder")
330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355
            result = l[pos+1:] + ' '

            yield (should, result)

def should_results_from_vidjil(program, f_should, f_log):
    '''
    Launch the program on f_should
    Yields (#, >) couples of V(D)J designations, such as in:
    #TRDD2*01 1/AGG/1 TRDD3*01  TRD+
    >TRDD2*01  TRDD3*01
    '''

    cmd = program % (f_should, f_log)

    with open(f_log, 'w') as f:
        f.write('# %s\n\n' % cmd)

    exit_code = os.system(cmd)
    if exit_code:
        print "Error. The program halted with exit code %s." % exit_code
        sys.exit(3)

    return should_results_from_vidjil_output(f_log)



356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399

### VDJ File

class VDJ_File():
    '''
    Handle .vdj files
    These files contain (#, >) couples of V(D)J designations, such as in:
    #TRDD2*01 1/AGG/1 TRDD3*01  TRD+
    >TRDD2*01  TRDD3*01
    '''

    def __init__(self):
        self.hr = []

    def __iter__(self):
        return self.hr.__iter__()

    def __len__(self):
        return len(self.hr)

    def write(self, f):
        for (header, result) in self.hr:
            f.write("#%s\n" % header)
            f.write(">%s\n" % result)
            f.write("\n")

    def parse_from_gen(self, gen):
        self.hr = list(gen)

    def parse_from_file(self, f):
        should = ''
        for l in f:

            l = l.strip()
            if not l:
                continue

            if l[0] == '#':
                should = l[1:]

            elif l[0] == '>':
                self.hr += [ (should, l[1:]) ]


400 401
### Main

402 403
if __name__ == '__main__':

404 405
    vdj = VDJ_File()

406
    if 'mixcr' in sys.argv[1]:
407
        vdj.parse_from_gen(header_mixcr_results(sys.argv[1]))
408 409
    elif 'igrec' in sys.argv[1]:
        vdj.parse_from_gen(header_igrec_results(sys.argv[1]))
410 411
    elif 'igblast' in sys.argv[2]:
        vdj.parse_from_gen(header_igblast_results(sys.argv[1], sys.argv[2:]))
412
    else:
413
        vdj.parse_from_gen(header_vquest_results(sys.argv[1], sys.argv[2]))
414

415
    vdj.write(sys.stdout)