fuse.py 24.8 KB
Newer Older
1
2
3
4
5
#!/usr/bin/env python
# -*- coding: utf-8 -*-

### fuse.py - Vidjil utility to parse and regroup list of clones of different timepoints or origins

6
7
8
9
10
11
12
13
#  This file is part of Vidjil <http://www.vidjil.org>,
#  High-throughput Analysis of V(D)J Immune Repertoire.
#  Copyright (C) 2011, 2012, 2013, 2014, 2015 by Bonsai bioinformatics 
#  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
#  Contributors: 
#      Marc Duez <marc.duez@vidjil.org>
#      Mathieu Giraud <mathieu.giraud@vidjil.org>
#      The Vidjil Team <contact@vidjil.org>
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#
#  "Vidjil" is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  "Vidjil" is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>

Mathieu Giraud's avatar
Mathieu Giraud committed
28
from __future__ import print_function
29

30
import check_python_version
31
import sys
32
33
import json
import argparse
34
import os.path
Mikaël Salson's avatar
Mikaël Salson committed
35
import os
Marc Duez's avatar
Marc Duez committed
36
import datetime
37
import subprocess
38
import tempfile
39
from operator import itemgetter
40
from utils import *
41
from defs import *
42
from collections import defaultdict
43

44
FUSE_VERSION = "vidjil fuse"
45

46
TOOL_SIMILARITY = "../algo/tools/similarity"
47
SIMILARITY_LIMIT = 100
48

49
50
GERMLINES_ORDER = ['TRA', 'TRB', 'TRG', 'TRD', 'DD', 'IGH', 'DHJH', 'IJK', 'IJL'] 

51
52
53
####

class Window:
54
    # Should be renamed "Clone"
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
    '''storage class for sequence informations
    with some function to group sequence informations
    
    >>> str(w1)
    '<window : [5] 3 aaa>'
    
    >>> str(w3)
    '<window : [8] 4 aaa>'
    
    >>> str(w1 + w3)
    '<window : [5, 8] 3 aaa>'
    
    
    check if other information are conserved
    
    >>> (w2 + w4).d["test"]
    [0, 'plop']
    
    '''
    
    ### init Window with the minimum data required
    def __init__(self, size):
        self.d={}
78
        self.d["id"] = ""
Mathieu Giraud's avatar
Mathieu Giraud committed
79
        self.d["top"] = sys.maxsize
80
        self.d["reads"] = []
81
        for i in range(size):
82
            self.d["reads"].append(0)
83
84
85
        
        
    ### 
86
87
    def __iadd__(self, other):
        ### Not used now
88
        """Add other.reads to self.reads in-place, without extending lists"""
89

90
91
        assert(len(self.d['reads']) == len(other.d['reads']))
        self.d['reads'] = [my + her for (my, her) in zip(self.d['reads'], other.d['reads'])]
92
93
94

        return self
        
95
    def __add__(self, other):
96
        """Concat two windows, extending lists such as 'reads'"""
97
        #data we don't need to duplicate
98
        myList = [ "seg", "top", "id", "sequence", "name", "id", "stats", "germline"]
99
100
        obj = Window(1)
        
101
102
103
104
        concatenate_with_padding(obj.d,
                                 self.d, len(self.d["reads"]),
                                 other.d, len(other.d["reads"]),
                                 myList)
105
106
                    
        #keep other data who don't need to be concat
107
        if other.d["top"] < self.d["top"] :
108
109
110
111
112
113
114
115
116
117
            for key in other.d :
                if key in myList :
                    obj.d[key] = other.d[key]
        else :
            for key in self.d :
                if key in myList :
                    obj.d[key] = self.d[key]
        
        return obj
        
118
119
    def get_nb_reads(self, cid, point=0):
        return self[cid]["reads"][point]
120

121
    def latex(self, point=0, base_germline=10000, base=10000, tag=''):
122
        reads = self.d["reads"][point]
123
        ratio_germline = float(reads)/base_germline
124
        ratio = float(reads)/base
125
        return r"   &   & %7d & %5.2f\%% & of %-4s & %5.2f\%% & %-50s \\ %% %4s %s" % (reads, ratio_germline * 100, self.d['germline'], ratio * 100,
126
127
                                                                  self.d["name"] if 'name' in self.d else self.d["id"],
                                                                  tag, self.d["id"])
128

129
130
    ### print essential info about Window
    def __str__(self):
Mathieu Giraud's avatar
Mathieu Giraud committed
131
        return "<window : %s %s %s>" % ( self.d["reads"], '*' if self.d["top"] == sys.maxsize else self.d["top"], self.d["id"])
132
        
Marc Duez's avatar
Marc Duez committed
133
class Samples: 
134

Marc Duez's avatar
Marc Duez committed
135
136
    def __init__(self):
        self.d={}
137
        self.d["number"] = 1
Marc Duez's avatar
Marc Duez committed
138
139
140
141
        self.d["original_names"] = [""]

    def __add__(self, other):
        obj=Samples()
142

143
144
145
146
147
        concatenate_with_padding(obj.d, 
                                 self.d, self.d['number'], 
                                 other.d, other.d['number'],
                                 ['number'])

148
        obj.d["number"] =  int(self.d["number"]) + int(other.d["number"])
Marc Duez's avatar
Marc Duez committed
149
150
        
        return obj
151

152
153
154
    def __str__(self):
        return "<Samples: %s>" % self.d

155
156
157
158
159
160
161
class Reads: 

    def __init__(self):
        self.d={}
        self.d["total"] = ["0"]
        self.d["segmented"] = ["0"]
        self.d["germline"] = {}
162
        self.d['distribution'] = {}
163
164
165

    def __add__(self, other):
        obj=Reads()
166
167
168
169
170

        concatenate_with_padding(obj.d['germline'], 
                                 self.d['germline'], len(self.d['total']),
                                 other.d['germline'], len(other.d['total']),
                                 ['total'])
171
172
173
174
        concatenate_with_padding(obj.d['distribution'],
                                 self.d['distribution'], len(self.d['total']),
                                 other.d['distribution'], len(other.d['total']),
                                 ['total'])
175
176
177
178
                
        obj.d["total"] = self.d["total"] + other.d["total"]
        obj.d["segmented"] = self.d["segmented"] + other.d["segmented"]
        return obj
179
180
181

    def __str__(self):
        return "<Reads: %s>" % self.d
182
        
183
184
185
186
class OtherWindows:
    
    """Aggregate counts of windows that are discarded (due to too small 'top') for each point into several 'others-' windows."""

187
188
    def __init__(self, length, ranges = None):
        self.ranges = ranges if ranges is not None else [1000, 100, 10, 1]
189
190
191
192
193
194
195
196
197
198
        self.length = length
        self.sizes = {}

        for r in self.ranges:
            self.sizes[r] = [0 for i in range(self.length)]


    def __iadd__(self, window):
        """Add to self a window. The different points may land into different 'others-' windows."""

199
        for i, s in enumerate(window.d["reads"]):
200
201
202
203
204
            for r in self.ranges:
                if s >= r:
                     break
            self.sizes[r][i] += s
            ## TODO: add seg_stat
205
206
            
        # print window, '-->', self.sizes
207
208
209
210
211
212
213
        return self

    def __iter__(self):
        """Yield the 'others-' windows"""

        for r in self.ranges:
            w = Window(self.length)
214
            w.d['reads'] = self.sizes[r]
215
216
217
            w.d['window'] = 'others-%d' % r
            w.d['top'] = 0

Mathieu Giraud's avatar
Mathieu Giraud committed
218
            print('  --[others]-->', w)
219
            yield w
220
        
221
class ListWindows(VidjilJson):
222
223
224
    '''storage class for sequences informations 
    
    >>> lw1.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
225
    <ListWindows: [25] 2>
226
227
228
229
    <window : [5] 3 aaa>
    <window : [12] 2 bbb>
    
    >>> lw2.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
230
    <ListWindows: [34] 2>
231
232
233
    <window : [8] 4 aaa>
    <window : [2] 8 ccc>
    
Mathieu Giraud's avatar
Mathieu Giraud committed
234
    >>> lw3 = lw1 + lw2
235
    >>> lw3.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
236
    <ListWindows: [25, 34] 3>
237
238
239
    <window : [12, 0] 2 bbb>
    <window : [5, 8] 3 aaa>
    <window : [0, 2] 8 ccc>
240
241
242
243
244
245
246
247
248
249
    >>> lw3.build_stat()
    >>> lw3.d['reads'].d['distribution']['0.1']
    [17, 8]
    >>> lw3.d['reads'].d['distribution']['0.01']
    [0, 2]
    >>> lw3.d['reads'].d['distribution']['0.001']
    [0, 0]
    >>> lw3.d['reads'].d['distribution']['0.0001']
    [0, 0]

250
251
252
253
254
    '''
    
    def __init__(self):
        '''init ListWindows with the minimum data required'''
        self.d={}
255
256
        self.d["samples"] = Samples()
        self.d["reads"] = Reads()
257
258
        self.d["clones"] = []
        self.d["clusters"] = []
259
        self.d["germlines"] = {}
260
        
261
262
263
264
        self.d["vidjil_json_version"] = VIDJIL_JSON_VERSION
        self.d["producer"] = FUSE_VERSION
        self.d["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
265
    def __str__(self):
266
267
268
269
270
271
272
273
274
275
276
277
278
279
        return "<ListWindows: %s %d>" % ( self.d["reads"].d["segmented"], len(self) )

    # Iterator and access functions
    def __iter__(self):
        return self.d["clones"].__iter__()

    def __getitem__(self, i):
        ### not efficient !
        for clone in self:
            if clone.d["id"] == i:
                return clone

    def __len__(self):
        return len(self.d["clones"])
280
281
282

    ### print info about each Windows stored 
    def info(self):
Mathieu Giraud's avatar
Mathieu Giraud committed
283
        print(self)
284
        for clone in self:
Mathieu Giraud's avatar
Mathieu Giraud committed
285
            print(clone)
286
        
287
288
    ### compute statistics about clones
    def build_stat(self):
289
        ranges = [.1, .01, .001, .0001, .00001, .000001, .0000001]
290
        result = [[0 for col in range(len(self.d['reads'].d["segmented"]))] for row in range(len(ranges))]
291

292
293
        for clone in self:
            for i, s in enumerate(clone.d["reads"]):
294
                for r in range(len(ranges)):
295
                    if s*1. / self.d['reads'].d['segmented'][i] >= ranges[r]:
296
297
298
299
                        break 
                result[r][i] += s
                
        for r in range(len(ranges)):
300
301
            ratio_in_string = '{0:.10f}'.format(ranges[r]).rstrip('0')
            self.d['reads'].d['distribution'][ratio_in_string] = result[r]
302
303
304
305
            
        #TODO V/D/J distrib and more 
        
        
306
307
308
    ### save / load to .json
    def save_json(self, output):
        '''save ListWindows in .json format''' 
Mathieu Giraud's avatar
Mathieu Giraud committed
309
        print("==>", output)
310
311
        with open(output, "w") as f:
            json.dump(self, f, indent=2, default=self.toJson)
312
313
314
315
316
317
318
319

    def load(self, file_path, *args, **kwargs):
        if not '.clntab' in file_path:
            self.load_vidjil(file_path, *args, **kwargs)
        else:
            self.load_clntab(file_path, *args, **kwargs)
        
    def load_vidjil(self, file_path, pipeline, verbose=True):
320
321
322
323
324
        '''init listWindows with data file
        Detects and selects the parser according to the file extension.'''

        # name = file_path.split("/")[-1]
        extension = file_path.split('.')[-1]
325
326

        if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
327
            print("<==", file_path, "\t", end=' ')
328
        
329
330
        with open(file_path, "r") as f:
                tmp = json.load(f, object_hook=self.toPython)
331
                self.d=tmp.d
332
333
334
                # Be robust against 'null' values for clones
                if not self.d["clones"]:
                    self.d["clones"] = []
335
336
                self.check_version(file_path)
        
337
338
        if 'distribution' not in self.d['reads'].d:
            self.d['reads'].d['distribution'] = {}
339
340
341
        if pipeline: 
            # renaming, private pipeline
            f = '/'.join(file_path.split('/')[2:-1])
342
            if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
343
                print("[%s]" % f)
344
345
346

        else:
            f = file_path
347
            if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
348
                print()
349
        
Marc Duez's avatar
Marc Duez committed
350
        time = os.path.getmtime(file_path)
351
        self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")]
352

353
354
355
356
357
358
359
360
361
362
363
        self.id_lengths = defaultdict(int)

        print("%%")
        for clone in self:
            self.id_lengths[len(clone.d['id'])] += 1
        print ("%% lengths .vidjil -> ", self.id_lengths)
        try:
            print("%% run_v ->", self.d["samples"].d["producer"], self.d["samples"].d["run_timestamp"])
        except KeyError:
            pass

Marc Duez's avatar
Marc Duez committed
364
365
366
    def getTop(self, top):
        result = []
        
367
368
369
        for clone in self:
            if clone.d["top"] <= top :
                result.append(clone.d["id"])
Marc Duez's avatar
Marc Duez committed
370
371
372
373
374
        return result
        
    def filter(self, f):
        r = []
        
Marc Duez's avatar
Marc Duez committed
375
        reverseList = {}
376
377
        for i,w in enumerate(self.d["clones"]) :
            reverseList[w.d["id"]] = i
Marc Duez's avatar
Marc Duez committed
378
379
380
        
        #filteredList = { k: reverseList[k] for k in f }
        filteredList = dict(filter(lambda t: t[0] in f, reverseList.items()))
Marc Duez's avatar
Marc Duez committed
381
        
Marc Duez's avatar
Marc Duez committed
382
        for i in filteredList:
383
            r.append(self.d["clones"][filteredList[i]])
Marc Duez's avatar
Marc Duez committed
384
        
385
        self.d["clones"] = r
Marc Duez's avatar
Marc Duez committed
386
        
387
388
389
390
    ### 
    def __add__(self, other): 
        '''Combine two ListWindows into a unique ListWindows'''
        obj = ListWindows()
391
392
        l1 = len(self.d["reads"].d['segmented'])
        l2 = len(other.d["reads"].d['segmented'])
393

394
395
396
        concatenate_with_padding(obj.d, 
                                 self.d, l1,
                                 other.d, l2,
397
398
                                 ["clones", "links", "germlines",
                                  "vidjil_json_version"])
399
        
400
        obj.d["clones"]=self.fuseWindows(self.d["clones"], other.d["clones"], l1, l2)
401
402
        obj.d["samples"] = self.d["samples"] + other.d["samples"]
        obj.d["reads"] = self.d["reads"] + other.d["reads"]
403
        
404
405
        return obj
        
406
407
408
409
410
411
    ###
    def __mul__(self, other):
        
        for i in range(len(self.d["reads_segmented"])):
            self.d["reads_segmented"][i] += other.d["reads_segmented"][i] 
        
412
        self.d["clones"] += other.d["clones"]
413
414
415
416
417
418
        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        
        self.d["system_segmented"].update(other.d["system_segmented"])
        
        return self
        
419
    ### 
420
    def fuseWindows(self, w1, w2, l1, l2) :
421
        #store data in dict with "id" as key
422
423
        dico1 = {}
        for i in range(len(w1)) :
424
            dico1[ w1[i].d["id"] ] = w1[i]
425
426
427

        dico2 = {}
        for i in range(len(w2)) :
428
            dico2[ w2[i].d["id"] ] = w2[i]
429

430
        #concat data with the same key ("id")
431
432
433
434
435
436
        #if some data are missing we concat with an empty Windows()
        dico3 = {}
        for key in dico1 :
            if key in dico2 :
                dico3[key] = dico1[key] + dico2[key]
            else :
437
                w=Window(l2)
438
439
440
441
                dico3[key] = dico1[key] + w

        for key in dico2 :
            if key not in dico1 :
442
                w=Window(l1)
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
                dico3[key] = w + dico2[key]
        
        
        #sort by top
        tab = []
        for key in dico3 :
            tab.append((dico3[key], dico3[key].d["top"]))
        tab = sorted(tab, key=itemgetter(1))
        
        #store back in a List
        result=[]
        for i in range(len(tab)) :
            result.append(tab[i][0])
        
        return result
        
    
460
461
    def cut(self, limit, nb_points):
        '''Remove information from sequence/windows who never enter in the most represented sequences. Put this information in 'other' windows.'''
462
463

        w=[]
464
465
466

        others = OtherWindows(nb_points)

467
468
469
        for clone in self:
            if (int(clone.d["top"]) < limit or limit == 0) :
                w.append(clone)
470
            #else:
471
                #others += clone
472

473
        self.d["clones"] = w #+ list(others) 
474

Mathieu Giraud's avatar
Mathieu Giraud committed
475
        print("### Cut merged file, keeping window in the top %d for at least one point" % limit)
476
477
        return self
        
478
    def load_clntab(self, file_path, *args, **kwargs):
479
480
481
        '''Parser for .clntab file'''

        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
482
        self.d["samples"].d["original_names"] = [file_path]
483
        self.d["samples"].d["producer"] = ["EC-NGS central pipeline"]
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
        
        listw = []
        listc = []
        total_size = 0
        
        fichier = open(file_path,"r")
        for ligne in fichier:
            if "clonotype" in ligne:
                header_map = ligne.replace('\n', '').split('\t')
            else :
                tab = AccessedDict()
                for index, data in enumerate(ligne.split('\t')):
                    tab[header_map[index]] = data
                        
                w=Window(1)
499
500
                w.d["seg"] = {}
                
501
                #w.window=tab["sequence.seq id"] #use sequence id as window for .clntab data
502
                w.d["id"]=tab["sequence.raw nt seq"] #use sequence as window for .clntab data
503
504
                s = int(tab["sequence.size"])
                total_size += s
505
                w.d["reads"] = [ s ]
506

507
508
                w.d["germline"] = tab["sequence.V-GENE and allele"][:3] #system ...
                
509
                w.d["sequence"] = tab["sequence.raw nt seq"]
510
                w.d["seg"]["5"]=tab["sequence.V-GENE and allele"].split('=')[0]
511
                if (tab["sequence.D-GENE and allele"] != "") :
512
513
                    w.d["seg"]["4"]=tab["sequence.D-GENE and allele"].split('=')[0]
                w.d["seg"]["3"]=tab["sequence.J-GENE and allele"].split('=')[0]
514
515
516
517

                # use sequence.JUNCTION to colorize output (this is not exactly V/J !)
                junction = tab.get("sequence.JUNCTION.raw nt seq")
                position = w.d["sequence"].find(junction)
518
                
519
                if position >= 0:
520
521
                    w.d["seg"]["3start"] = position + len(junction)
                    w.d["seg"]["5end"] = position 
522
                else:
523
524
525
526
527
528
                    w.d["seg"]["3start"] = 0
                    w.d["seg"]["5end"] = len(w.d["sequence"])
                w.d["seg"]["3end"]=0
                w.d["seg"]["5start"]=0
                w.d["seg"]["4end"]=0
                w.d["seg"]["4start"]=0
529
                w.d["seg"]["cdr3"] = tab["sequence.JUNCTION.raw nt seq"][3:-3]
530
                    
531
                listw.append((w , w.d["reads"][0]))
532
533

                raw_clonotype = tab.get("clonotype")
534
535
                w.d["name"]=raw_clonotype # w.d["seg"]["5"] + " -x/y/-z " + w.d["seg"]["3"]

536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
                clonotype = raw_clonotype.split(' ')
                if (len(clonotype) > 1) :
                    listc.append((w, raw_clonotype))
                
                #keep data that has not already been stored
                for header in tab.not_accessed_keys():
                    w.d["_"+header] = [tab[header]]

        #sort by sequence_size
        listw = sorted(listw, key=itemgetter(1), reverse=True)
        #sort by clonotype
        listc = sorted(listc, key=itemgetter(1))
        
        #generate data "top"
        for index in range(len(listw)):
            listw[index][0].d["top"]=index+1
552
            self.d["clones"].append(listw[index][0])
553
        
554
555
        self.d["reads"].d["segmented"] = [total_size]
        self.d["reads"].d["total"] = [total_size]
556
557
558
559

        
    def toJson(self, obj):
        '''Serializer for json module'''
560
        if isinstance(obj, ListWindows) or isinstance(obj, Window) or isinstance(obj, Samples) or isinstance(obj, Reads):
561
562
563
            result = {}
            for key in obj.d :
                result[key]= obj.d[key]
Marc Duez's avatar
Marc Duez committed
564
                
565
566
            return result
            raise TypeError(repr(obj) + " fail !") 
567
        
Marc Duez's avatar
Marc Duez committed
568
        else:
569
570
571
572
573
574
            result = {}
            for key in obj :
                result[key]= obj[key]
            
            return result
            raise TypeError(repr(obj) + " fail !") 
575
576
577

    def toPython(self, obj_dict):
        '''Reverse serializer for json module'''
578
        if "samples" in obj_dict:
579
            obj = ListWindows()
580
            obj.d=obj_dict
581
582
            return obj

583
        if "id" in obj_dict:
584
            obj = Window(1)
585
            obj.d=obj_dict
586
            return obj
Marc Duez's avatar
Marc Duez committed
587
        
588
589
        if "total" in obj_dict:
            obj = Reads()
590
            obj.d=obj_dict
591
592
            return obj
            
Marc Duez's avatar
Marc Duez committed
593
594
        if "original_names" in obj_dict:
            obj = Samples()
595
            obj.d=obj_dict
596
            return obj
Marc Duez's avatar
Marc Duez committed
597
            
598
        return obj_dict
599
600
601
602
603
604
        



### some data used for test
w1 = Window(1)
605
w1.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
606
w2 = Window(1)
607
w2.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
608
w3 = Window(1)
609
w3.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
610
w4 = Window(1)
611
w4.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
612
613
614


w5 = Window(1)
615
w5.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
616
w6 = Window(1)
617
w6.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
618
619

lw1 = ListWindows()
620
lw1.d["timestamp"] = 'ts'
621
lw1.d["reads"] = json.loads('{"total": [30], "segmented": [25], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
622
623
lw1.d["clones"].append(w5)
lw1.d["clones"].append(w6)
624
625

w7 = Window(1)
626
w7.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
627
w8 = Window(1)
628
w8.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
629
630

lw2 = ListWindows()
631
lw2.d["timestamp"] = 'ts'
632
lw2.d["reads"] = json.loads('{"total": [40], "segmented": [34], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
633
634
lw2.d["clones"].append(w7)
lw2.d["clones"].append(w8)
635
636
637
638
639
640

    
    

 
def main():
Mathieu Giraud's avatar
Mathieu Giraud committed
641
    print("#", ' '.join(sys.argv))
642
643
644
645
646
647
648

    DESCRIPTION = 'Vidjil utility to parse and regroup list of clones of different timepoints or origins'
    
    #### Argument parser (argparse)

    parser = argparse.ArgumentParser(description= DESCRIPTION,
                                    epilog='''Example:
Mathieu Giraud's avatar
Mathieu Giraud committed
649
  python2 %(prog)s --germline IGH out/Diag/vidjil.data out/MRD-1/vidjil.data out/MRD-2/vidjil.data''',
650
651
652
653
654
655
                                    formatter_class=argparse.RawTextHelpFormatter)


    group_options = parser.add_argument_group() # title='Options and parameters')

    group_options.add_argument('--test', action='store_true', help='run self-tests')
656
    group_options.add_argument('--multi', action='store_true', help='merge different systems from a same timepoint (deprecated, do not use)')
657
    
658
    group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
659
    group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')
660

Marc Duez's avatar
Marc Duez committed
661
    group_options.add_argument('--output', '-o', type=str, default='fused.vidjil', help='output file (%(default)s)')
662
663
    group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)')

664
    group_options.add_argument('--first', '-f', type=int, default=0, help='take only into account the first FIRST files (0 for all) (%(default)s)')
665

666
667
668
669
670
671
672
673
674
675
676
677
    parser.add_argument('file', nargs='+', help='''input files (.vidjil/.cnltab)''')
  
    args = parser.parse_args()
    # print args

    if args.test:
        import doctest
        doctest.testmod(verbose = True)
        sys.exit(0)

    jlist_fused = None

Mathieu Giraud's avatar
Mathieu Giraud committed
678
679
    print("### fuse.py -- " + DESCRIPTION)
    print()
680

681
682
683
684
685
686
    files = args.file
    if args.first:
        if len(files) > args.first:
            print("! %d files were given. We take into account the first %d files." % (len(files), args.first))
        files = files[:args.first]

Marc Duez's avatar
Marc Duez committed
687
688
    #filtre
    f = []
689
    for path_name in files:
Marc Duez's avatar
Marc Duez committed
690
691
692
693
694
        jlist = ListWindows()
        jlist.load(path_name, args.pipeline)
        f += jlist.getTop(args.top)
    f = sorted(set(f))
    
695
    if args.multi:
696
        for path_name in files:
697
            jlist = ListWindows()
698
            jlist.load(path_name, args.pipeline)
Marc Duez's avatar
Marc Duez committed
699
            jlist.build_stat()
700
            
Mathieu Giraud's avatar
Mathieu Giraud committed
701
            print("\t", jlist, end=' ')
702
703
704
705
706

            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused * jlist
707
                
Mathieu Giraud's avatar
Mathieu Giraud committed
708
            print('\t==> merge to', jlist_fused)
709
        jlist_fused.d["system_segmented"] = ordered(jlist_fused.d["system_segmented"], key=lambda sys: ((GERMLINES_ORDER + [sys]).index(sys), sys))
710
        
711
    else:
Mathieu Giraud's avatar
Mathieu Giraud committed
712
        print("### Read and merge input files")
713
        for path_name in files:
714
            jlist = ListWindows()
715
            jlist.load(path_name, args.pipeline)
Marc Duez's avatar
Marc Duez committed
716
717
            jlist.build_stat()
            jlist.filter(f)
Marc Duez's avatar
Marc Duez committed
718

719
720
721
722
            w1 = Window(1)
            w2 = Window(2)
            w3 = w1+w2
            
Mathieu Giraud's avatar
Mathieu Giraud committed
723
            print("\t", jlist, end=' ')
724
725
726
727
728
            # Merge lists
            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused + jlist
729
            
Mathieu Giraud's avatar
Mathieu Giraud committed
730
            print('\t==> merge to', jlist_fused)
731

732
    if args.compress:
Mathieu Giraud's avatar
Mathieu Giraud committed
733
734
        print()
        print("### Select point names")
735
        l = jlist_fused.d["samples"].d["original_names"]
736
        ll = interesting_substrings(l)
Mathieu Giraud's avatar
Mathieu Giraud committed
737
738
        print("  <==", l)
        print("  ==>", ll)
739
        jlist_fused.d["samples"].d["names"] = ll
740
    
Mathieu Giraud's avatar
Mathieu Giraud committed
741
    print()
742
    if not args.multi:
743
        jlist_fused.cut(args.top, jlist_fused.d["samples"].d["number"])
Mathieu Giraud's avatar
Mathieu Giraud committed
744
745
    print("\t", jlist_fused) 
    print()
746

747
    #compute similarity matrix
748
749
750
751
752
753
754
755
756
757
758
759
    if len(jlist_fused.d["clones"]) < SIMILARITY_LIMIT :
        fasta = ""
        for i in range(len(jlist_fused.d["clones"])) :
            fasta += ">>" + str(i) + "\n"
            fasta += jlist_fused.d["clones"][i].d["id"] + "\n"
        fasta_file = tempfile.NamedTemporaryFile(delete=False)
        fasta_file.write(fasta)
        try:
            out = subprocess.check_output([TOOL_SIMILARITY, "-j", fasta_file.name])
            jlist_fused.d["similarity"] = json.loads(out)
        except OSError:
            print("! failed: %s" % TOOL_SIMILARITY)
760
761
        finally:
            os.unlink(fasta_file.name)
762
763
    else : 
        jlist_fused.d["similarity"] = [];
764
        
Mathieu Giraud's avatar
Mathieu Giraud committed
765
    print("### Save merged file")
766
    jlist_fused.save_json(args.output)
767
    
768
769
770
771
    
    
if  __name__ =='__main__':
    main()