repseq_vdj.py 8.88 KB
Newer Older
1
'''
2
Parses output of various RepSeq programs.
3 4 5
Takes either:
  - a .fa file, a _Summary.txt file as produced by IMGT/V-QUEST
  - or a results file produced by MiXCR
6 7
and creates a .vdj file to be checked by should-vdj-to-tap.py

8 9 10
python repseq_vdj.py data-curated/curated_IG.fa data-curated/curated_ig_Summary.txt > data-curated/imgt-IG.vdj
python repsep_vdj.py data-curated/curated_TR.fa data-curated/curated_tr_Summary.txt > data-curated/imgt-TR.vdj
python repseq_vdj.py data-curated/mixcr.results > data-curated/mixcr.vdj
11 12
python repseq_vdj.py data-curated/curated_IG.fa data-curated/igblast/IG/*.aln > data-curated/igblast-IG.vdj > data-curated/igblast-IG.vdj
python repseq_vdj.py data-curated/curated_TR.fa data-curated/igblast/TR/*.aln > data-curated/igblast-TR.vdj > data-curated/igblast-TR.vdj
13 14 15
'''


16 17
import sys
import os
18

19 20 21
V = 'V'
D = 'D'
J = 'J'
22

23 24 25
N1 = 'N1'
N2 = 'N2'
N = 'N'
26

27
JUNCTION = 'JUNCTION'
28

29

30 31
class VDJ_Formatter():
    '''Stores fields and outputs a .vdj line'''
32

33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
    def genes_to_vdj(self, genes):
        if not genes:
            return ''

        if len(genes) == 1:
            return genes[0]

        return '(%s)' % ','.join(genes)


    def N_to_vdj(self, s):
        return '/%s/' % s

    def CDR3_to_vdj(self, s):
        return '{%s}' % s if s else ''

    def to_vdj(self):
        if not self.result:
            return 'no result'

        s = ''
        s += self.genes_to_vdj(self.vdj[V])
        s += ' '

        if D in self.vdj:
            if N1 in self.vdj:
                s += self.N_to_vdj(self.vdj[N1])
                s += ' '
            s += self.genes_to_vdj(self.vdj[D])
            if N2 in self.vdj:
                s += ' '
                s += self.N_to_vdj(self.vdj[N2])
        else:
            if N in self.vdj:
                s += self.N_to_vdj(self.vdj[N])

        s += ' '
        s += self.genes_to_vdj(self.vdj[J])

        if JUNCTION in self.vdj:
            s += ' '
            s += self.CDR3_to_vdj(self.vdj[JUNCTION])

        return s


class Result(VDJ_Formatter):
80
    '''Stores a tabulated result'''
81 82 83 84

    def __init__(self, l):

        self.d = {}
85
        self.vdj = {}
86
        self.result = self.parse(l)
87

88
        if self.result:
89 90 91
            for i, data in enumerate(self.result.split('\t')):
                self.d[self.labels[i]] = data

92 93
            self.populate()

94 95 96
    def __getitem__(self, key):
        return self.d[key]

97 98 99 100
    def __str__(self):
        return str(self.d)


101 102 103

### MiXCR

104 105 106 107
class MiXCR_Result(Result):

    def parse(self, l):
        self.labels = mixcr_labels
108 109 110 111
        if ('\t' in l.strip()):
            return l
        else:
            return None
112

113 114
    def populate(self):
        self.vdj[V] = [self['Best V hit']]
115
        if self['Best D hit']:
116 117
            self.vdj[D] = [self['Best D hit']]
        self.vdj[J] = [self['Best J hit']]
118

119 120 121
        self.vdj[N1] = self['N. Seq. VDJunction']
        self.vdj[N2] = self['N. Seq. DJJunction']
        self.vdj[N] = self['N. Seq. VJJunction']
122

123
        self.vdj[JUNCTION] = self['AA. Seq. CDR3']
124

125

126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
def header_mixcr_results(ff_mixcr):

    f = open(ff_mixcr).__iter__()

    mixcr_first_line = f.next()
    globals()['mixcr_labels'] = mixcr_first_line.split('\t')

    while True:
        l = f.next()
        result = MiXCR_Result(l)
        yield result['Description R1'], result.to_vdj()



### IMGT/V-QUEST
141 142 143 144 145 146

class IMGT_VQUEST_Result(Result):
    '''Stores a IMGT/V-QUEST result'''

    def parse(self, l):
        self.labels = vquest_labels
147 148 149 150
        if ('No result' not in l):
            return l
        else:
            return None
151

152 153 154 155 156 157 158 159 160 161 162 163 164
    def parse_gene_and_allele(self, s):
        '''
        Parse IMGT/V-QUEST fields such as:
        Homsap IGHV3-30*03 F, or Homsap IGHV3-30*18 F or Homsap IGHV3-30-5*01 F'
        '''

        genes = []
        for term in s.replace(',', '').split():
            if term in ['Homsap', '[F]', '(F)', 'F', 'P', 'or', 'and', '(see', 'comment)', 'ORF', '[ORF]']:
                continue
            genes += [term]
        return genes

165 166 167 168
    def populate(self):
        self.vdj[V] = self.parse_gene_and_allele(self['V-GENE and allele'])
        self.vdj[D] = self.parse_gene_and_allele(self['D-GENE and allele'])
        self.vdj[J] = self.parse_gene_and_allele(self['J-GENE and allele'])
169

170
        self.vdj[JUNCTION] = self['AA JUNCTION']
171

172

173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193
def header_vquest_results(ff_fasta, ff_vquest):
    f_fasta = open(ff_fasta).__iter__()
    f_vquest = open(ff_vquest).__iter__()

    vquest_first_line = f_vquest.next()
    globals()['vquest_labels'] = vquest_first_line.split('\t')

    # print vquest_labels

    while True:

        fasta = ''
        # Advance until header line
        while not '>' in fasta:
            fasta = f_fasta.next().strip()

        vquest = ''
        # Advance until non-empty line
        while not vquest:
            vquest = f_vquest.next().strip()

Mathieu Giraud's avatar
Mathieu Giraud committed
194
        r = IMGT_VQUEST_Result(vquest)
195
        yield (fasta.replace('>', ''), r.to_vdj())
196

197

198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248
### igBlast

igblast_labels = [V, D, J, "Chain", None, None, None, None, None, None]
igblast_VJ_labels = [V, J, "Chain", None, None, None, None, None, None]

class igBlast_Result(Result):
    '''Stores a igBlast result (.aln)'''

    def parse(self, ll):
        self.labels = igblast_labels

        go = False
        for l in ll:
            if "V-(D)-J rearrangement summary" in l:
                if "Top V gene match, Top J gene match" in l:
                    self.labels = igblast_VJ_labels
                go = True
                continue
            if go:
                return l

        return None

    def parse_gene_and_allele(self, s):
        if s == 'N/A':
            return []
        return s.split(',')

    def populate(self):
        self.vdj[V] = self.parse_gene_and_allele(self[V])
        if D in self.d:
            self.vdj[D] = self.parse_gene_and_allele(self[D])
        self.vdj[J] = self.parse_gene_and_allele(self[J])



def header_igblast_results(ff_fasta, ff_igblast):

    f_fasta = open(ff_fasta).__iter__()

    for f in ff_igblast:
        fasta = ''
        # Advance until header line
        while not '>' in fasta:
            fasta = f_fasta.next().strip()

        igblast = open(f).readlines()

        r = igBlast_Result(igblast)
        yield (fasta.replace('>', ''), r.to_vdj())

249 250 251

### Vidjil

252 253
VIDJIL_FINE = '{directory}/vidjil-algo -X 100 --header-sep "#" -c segment -3 -d -g {directory}/germline/homo-sapiens.g %s >> %s'
VIDJIL_KMER = '{directory}/vidjil-algo -w 20 --header-sep "#" -b out -c windows -uuuU -2 -g {directory}/germline/homo-sapiens.g %s > /dev/null ; cat out/out.segmented.vdj.fa out/out.unsegmented.vdj.fa >> %s'
254 255 256

def should_results_from_vidjil_output(f_log):
    '''
257
    Yield (should, result) couples from a Vidjil-algo output including lines in the form of:
258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297
    >TRDD2*01_1/AGG/1_TRDD3*01__TRD+ + VJ 	0 84 88 187	TRDD2*01 1/AGG/1 TRDD3*01  TRD+
    or
    >TRDV3*01_0//0_TRDJ4*01 ! + VJ	0 49 50 97       TRD UNSEG noisy
    '''

    for l in open(f_log):
        if not l:
            continue
        if l[0] == '>':
            l = l.strip()
            pos = l.find(' + ') if ' + ' in l else l.find(' - ')
            should = l[1:pos].replace('_', ' ')

            pos = l.find('\t')
            result = l[pos+1:] + ' '

            yield (should, result)

def should_results_from_vidjil(program, f_should, f_log):
    '''
    Launch the program on f_should
    Yields (#, >) couples of V(D)J designations, such as in:
    #TRDD2*01 1/AGG/1 TRDD3*01  TRD+
    >TRDD2*01  TRDD3*01
    '''

    cmd = program % (f_should, f_log)

    with open(f_log, 'w') as f:
        f.write('# %s\n\n' % cmd)

    exit_code = os.system(cmd)
    if exit_code:
        print "Error. The program halted with exit code %s." % exit_code
        sys.exit(3)

    return should_results_from_vidjil_output(f_log)



298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341

### VDJ File

class VDJ_File():
    '''
    Handle .vdj files
    These files contain (#, >) couples of V(D)J designations, such as in:
    #TRDD2*01 1/AGG/1 TRDD3*01  TRD+
    >TRDD2*01  TRDD3*01
    '''

    def __init__(self):
        self.hr = []

    def __iter__(self):
        return self.hr.__iter__()

    def __len__(self):
        return len(self.hr)

    def write(self, f):
        for (header, result) in self.hr:
            f.write("#%s\n" % header)
            f.write(">%s\n" % result)
            f.write("\n")

    def parse_from_gen(self, gen):
        self.hr = list(gen)

    def parse_from_file(self, f):
        should = ''
        for l in f:

            l = l.strip()
            if not l:
                continue

            if l[0] == '#':
                should = l[1:]

            elif l[0] == '>':
                self.hr += [ (should, l[1:]) ]


342 343
### Main

344 345
if __name__ == '__main__':

346 347
    vdj = VDJ_File()

348
    if 'mixcr' in sys.argv[1]:
349
        vdj.parse_from_gen(header_mixcr_results(sys.argv[1]))
350 351
    elif 'igblast' in sys.argv[2]:
        vdj.parse_from_gen(header_igblast_results(sys.argv[1], sys.argv[2:]))
352
    else:
353
        vdj.parse_from_gen(header_vquest_results(sys.argv[1], sys.argv[2]))
354

355
    vdj.write(sys.stdout)