fuse.py 23.6 KB
Newer Older
1
2
3
4
5
#!/usr/bin/env python
# -*- coding: utf-8 -*-

### fuse.py - Vidjil utility to parse and regroup list of clones of different timepoints or origins

6
7
8
9
10
11
12
13
#  This file is part of Vidjil <http://www.vidjil.org>,
#  High-throughput Analysis of V(D)J Immune Repertoire.
#  Copyright (C) 2011, 2012, 2013, 2014, 2015 by Bonsai bioinformatics 
#  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
#  Contributors: 
#      Marc Duez <marc.duez@vidjil.org>
#      Mathieu Giraud <mathieu.giraud@vidjil.org>
#      The Vidjil Team <contact@vidjil.org>
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#
#  "Vidjil" is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  "Vidjil" is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>

Mathieu Giraud's avatar
Mathieu Giraud committed
28
from __future__ import print_function
29

30
import check_python_version
31
import sys
32
33
34
35
import json
import argparse
import time
import copy
36
import os.path
Marc Duez's avatar
Marc Duez committed
37
import datetime
38
import subprocess
39
from operator import itemgetter
40
from utils import *
41

42
VIDJIL_JSON_VERSION = "2014.10"
43
FUSE_VERSION = "vidjil fuse"
44

45
46
GERMLINES_ORDER = ['TRA', 'TRB', 'TRG', 'TRD', 'DD', 'IGH', 'DHJH', 'IJK', 'IJL'] 

47
48
49
####

class Window:
50
    # Should be renamed "Clone"
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
    '''storage class for sequence informations
    with some function to group sequence informations
    
    >>> str(w1)
    '<window : [5] 3 aaa>'
    
    >>> str(w3)
    '<window : [8] 4 aaa>'
    
    >>> str(w1 + w3)
    '<window : [5, 8] 3 aaa>'
    
    
    check if other information are conserved
    
    >>> (w2 + w4).d["test"]
    [0, 'plop']
    
    '''
    
    ### init Window with the minimum data required
    def __init__(self, size):
        self.d={}
74
        self.d["id"] = ""
Mathieu Giraud's avatar
Mathieu Giraud committed
75
        self.d["top"] = sys.maxsize
76
        self.d["reads"] = []
77
        for i in range(size):
78
            self.d["reads"].append(0)
79
80
81
        
        
    ### 
82
83
    def __iadd__(self, other):
        ### Not used now
84
        """Add other.reads to self.reads in-place, without extending lists"""
85

86
87
        assert(len(self.d['reads']) == len(other.d['reads']))
        self.d['reads'] = [my + her for (my, her) in zip(self.d['reads'], other.d['reads'])]
88
89
90

        return self
        
91
    def __add__(self, other):
92
        """Concat two windows, extending lists such as 'reads'"""
93
        #data we don't need to duplicate
94
        myList = [ "seg", "top", "id", "sequence", "name", "id", "stats", "germline"]
95
96
        obj = Window(1)
        
97
98
99
100
        concatenate_with_padding(obj.d,
                                 self.d, len(self.d["reads"]),
                                 other.d, len(other.d["reads"]),
                                 myList)
101
102
103
104
105
106
107
108
109
110
111
112
113
                    
        #keep other data who don't need to be concat
        if other.d["top"] < self.d["top"] :
            for key in other.d :
                if key in myList :
                    obj.d[key] = other.d[key]
        else :
            for key in self.d :
                if key in myList :
                    obj.d[key] = self.d[key]
        
        return obj
        
114
115
116
117
118
    def latex(self, point=0, base=10000):
        reads = self.d["reads"][point]
        ratio = float(reads)/base
        return r"   &   & %7d & %5.2f%% & %-50s \\ %% %s" % (reads, ratio * 100, 
                                                           self.d["name"] if 'name' in self.d else self.d["id"], self.d["id"])
119

120
121
    ### print essential info about Window
    def __str__(self):
Mathieu Giraud's avatar
Mathieu Giraud committed
122
        return "<window : %s %s %s>" % ( self.d["reads"], '*' if self.d["top"] == sys.maxsize else self.d["top"], self.d["id"])
123
        
Marc Duez's avatar
Marc Duez committed
124
class Samples: 
125

Marc Duez's avatar
Marc Duez committed
126
127
    def __init__(self):
        self.d={}
128
        self.d["number"] = 1
Marc Duez's avatar
Marc Duez committed
129
130
131
132
        self.d["original_names"] = [""]

    def __add__(self, other):
        obj=Samples()
133

134
135
136
137
138
        concatenate_with_padding(obj.d, 
                                 self.d, self.d['number'], 
                                 other.d, other.d['number'],
                                 ['number'])

139
        obj.d["number"] =  int(self.d["number"]) + int(other.d["number"])
Marc Duez's avatar
Marc Duez committed
140
141
        
        return obj
142

143
144
145
    def __str__(self):
        return "<Samples: %s>" % self.d

146
147
148
149
150
151
152
class Reads: 

    def __init__(self):
        self.d={}
        self.d["total"] = ["0"]
        self.d["segmented"] = ["0"]
        self.d["germline"] = {}
153
        self.d['distribution'] = {}
154
155
156

    def __add__(self, other):
        obj=Reads()
157
158
159
160
161

        concatenate_with_padding(obj.d['germline'], 
                                 self.d['germline'], len(self.d['total']),
                                 other.d['germline'], len(other.d['total']),
                                 ['total'])
162
163
164
165
        concatenate_with_padding(obj.d['distribution'],
                                 self.d['distribution'], len(self.d['total']),
                                 other.d['distribution'], len(other.d['total']),
                                 ['total'])
166
167
168
169
                
        obj.d["total"] = self.d["total"] + other.d["total"]
        obj.d["segmented"] = self.d["segmented"] + other.d["segmented"]
        return obj
170
171
172

    def __str__(self):
        return "<Reads: %s>" % self.d
173
        
174
175
176
177
class OtherWindows:
    
    """Aggregate counts of windows that are discarded (due to too small 'top') for each point into several 'others-' windows."""

178
179
    def __init__(self, length, ranges = None):
        self.ranges = ranges if ranges is not None else [1000, 100, 10, 1]
180
181
182
183
184
185
186
187
188
189
        self.length = length
        self.sizes = {}

        for r in self.ranges:
            self.sizes[r] = [0 for i in range(self.length)]


    def __iadd__(self, window):
        """Add to self a window. The different points may land into different 'others-' windows."""

190
        for i, s in enumerate(window.d["reads"]):
191
192
193
194
195
            for r in self.ranges:
                if s >= r:
                     break
            self.sizes[r][i] += s
            ## TODO: add seg_stat
196
197
            
        # print window, '-->', self.sizes
198
199
200
201
202
203
204
        return self

    def __iter__(self):
        """Yield the 'others-' windows"""

        for r in self.ranges:
            w = Window(self.length)
205
            w.d['reads'] = self.sizes[r]
206
207
208
            w.d['window'] = 'others-%d' % r
            w.d['top'] = 0

Mathieu Giraud's avatar
Mathieu Giraud committed
209
            print('  --[others]-->', w)
210
            yield w
211
212
213
214
215
        
class ListWindows:
    '''storage class for sequences informations 
    
    >>> lw1.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
216
    <ListWindows: [25] 2>
217
218
219
220
    <window : [5] 3 aaa>
    <window : [12] 2 bbb>
    
    >>> lw2.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
221
    <ListWindows: [34] 2>
222
223
224
    <window : [8] 4 aaa>
    <window : [2] 8 ccc>
    
Mathieu Giraud's avatar
Mathieu Giraud committed
225
    >>> lw3 = lw1 + lw2
226
    >>> lw3.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
227
    <ListWindows: [25, 34] 3>
228
229
230
    <window : [12, 0] 2 bbb>
    <window : [5, 8] 3 aaa>
    <window : [0, 2] 8 ccc>
231
232
233
234
235
236
237
238
239
240
    >>> lw3.build_stat()
    >>> lw3.d['reads'].d['distribution']['0.1']
    [17, 8]
    >>> lw3.d['reads'].d['distribution']['0.01']
    [0, 2]
    >>> lw3.d['reads'].d['distribution']['0.001']
    [0, 0]
    >>> lw3.d['reads'].d['distribution']['0.0001']
    [0, 0]

241
242
243
244
245
    '''
    
    def __init__(self):
        '''init ListWindows with the minimum data required'''
        self.d={}
246
247
        self.d["samples"] = Samples()
        self.d["reads"] = Reads()
248
249
        self.d["clones"] = []
        self.d["clusters"] = []
250
        self.d["germlines"] = {}
251
        
252
253
254
255
        self.d["vidjil_json_version"] = VIDJIL_JSON_VERSION
        self.d["producer"] = FUSE_VERSION
        self.d["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
256
    def __str__(self):
257
258
259
260
261
262
263
264
265
266
267
268
269
270
        return "<ListWindows: %s %d>" % ( self.d["reads"].d["segmented"], len(self) )

    # Iterator and access functions
    def __iter__(self):
        return self.d["clones"].__iter__()

    def __getitem__(self, i):
        ### not efficient !
        for clone in self:
            if clone.d["id"] == i:
                return clone

    def __len__(self):
        return len(self.d["clones"])
271
272
273

    ### print info about each Windows stored 
    def info(self):
Mathieu Giraud's avatar
Mathieu Giraud committed
274
        print(self)
275
        for clone in self:
Mathieu Giraud's avatar
Mathieu Giraud committed
276
            print(clone)
277
278
279
280

    ### check vidjil_json_version
    def check_version(self, filepath):
        if "vidjil_json_version" in self.d:
281
            if self.d["vidjil_json_version"] <= VIDJIL_JSON_VERSION:
282
283
284
                return
        raise IOError ("File '%s' is too old -- please regenerate it with a newer version of Vidjil" % filepath)
        
285
286
    ### compute statistics about clones
    def build_stat(self):
287
        ranges = [.1, .01, .001, .0001, .00001, .000001, .0000001]
288
        result = [[0 for col in range(len(self.d['reads'].d["segmented"]))] for row in range(len(ranges))]
289

290
291
        for clone in self:
            for i, s in enumerate(clone.d["reads"]):
292
                for r in range(len(ranges)):
293
                    if s*1. / self.d['reads'].d['segmented'][i] >= ranges[r]:
294
295
296
297
                        break 
                result[r][i] += s
                
        for r in range(len(ranges)):
298
299
            ratio_in_string = '{0:.10f}'.format(ranges[r]).rstrip('0')
            self.d['reads'].d['distribution'][ratio_in_string] = result[r]
300
301
302
303
            
        #TODO V/D/J distrib and more 
        
        
304
305
306
    ### save / load to .json
    def save_json(self, output):
        '''save ListWindows in .json format''' 
Mathieu Giraud's avatar
Mathieu Giraud committed
307
        print("==>", output)
308
309
        with open(output, "w") as file:
            json.dump(self, file, indent=2, default=self.toJson)
310
311
312
313
314
315
316
317

    def load(self, file_path, *args, **kwargs):
        if not '.clntab' in file_path:
            self.load_vidjil(file_path, *args, **kwargs)
        else:
            self.load_clntab(file_path, *args, **kwargs)
        
    def load_vidjil(self, file_path, pipeline, verbose=True):
318
319
320
321
322
        '''init listWindows with data file
        Detects and selects the parser according to the file extension.'''

        # name = file_path.split("/")[-1]
        extension = file_path.split('.')[-1]
323
324

        if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
325
            print("<==", file_path, "\t", end=' ')
326
        
327
        with open(file_path, "r") as file:
328
329
330
331
                tmp = json.load(file, object_hook=self.toPython)     
                self.d=tmp.d
                self.check_version(file_path)
        
332
333
        if 'distribution' not in self.d['reads'].d:
            self.d['reads'].d['distribution'] = {}
334
335
336
        if pipeline: 
            # renaming, private pipeline
            f = '/'.join(file_path.split('/')[2:-1])
337
            if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
338
                print("[%s]" % f)
339
340
341

        else:
            f = file_path
342
            if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
343
                print()
344
        
Marc Duez's avatar
Marc Duez committed
345
        time = os.path.getmtime(file_path)
346
        self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")]
347

Marc Duez's avatar
Marc Duez committed
348
349
350
    def getTop(self, top):
        result = []
        
351
352
353
        for clone in self:
            if clone.d["top"] <= top :
                result.append(clone.d["id"])
Marc Duez's avatar
Marc Duez committed
354
355
356
357
358
        return result
        
    def filter(self, f):
        r = []
        
Marc Duez's avatar
Marc Duez committed
359
        reverseList = {}
360
361
        for i,w in enumerate(self.d["clones"]) :
            reverseList[w.d["id"]] = i
Marc Duez's avatar
Marc Duez committed
362
363
364
        
        #filteredList = { k: reverseList[k] for k in f }
        filteredList = dict(filter(lambda t: t[0] in f, reverseList.items()))
Marc Duez's avatar
Marc Duez committed
365
        
Marc Duez's avatar
Marc Duez committed
366
        for i in filteredList:
367
            r.append(self.d["clones"][filteredList[i]])
Marc Duez's avatar
Marc Duez committed
368
        
369
        self.d["clones"] = r
Marc Duez's avatar
Marc Duez committed
370
        
371
372
373
374
    ### 
    def __add__(self, other): 
        '''Combine two ListWindows into a unique ListWindows'''
        obj = ListWindows()
375
376
        l1 = len(self.d["reads"].d['segmented'])
        l2 = len(other.d["reads"].d['segmented'])
377

378
379
380
        concatenate_with_padding(obj.d, 
                                 self.d, l1,
                                 other.d, l2,
381
382
                                 ["clones", "links", "germlines",
                                  "vidjil_json_version"])
383
        
384
        obj.d["clones"]=self.fuseWindows(self.d["clones"], other.d["clones"], l1, l2)
385
386
        obj.d["samples"] = self.d["samples"] + other.d["samples"]
        obj.d["reads"] = self.d["reads"] + other.d["reads"]
387
        
388
389
        return obj
        
390
391
392
393
394
395
    ###
    def __mul__(self, other):
        
        for i in range(len(self.d["reads_segmented"])):
            self.d["reads_segmented"][i] += other.d["reads_segmented"][i] 
        
396
        self.d["clones"] += other.d["clones"]
397
398
399
400
401
402
        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        
        self.d["system_segmented"].update(other.d["system_segmented"])
        
        return self
        
403
    ### 
404
    def fuseWindows(self, w1, w2, l1, l2) :
405
        #store data in dict with "id" as key
406
407
        dico1 = {}
        for i in range(len(w1)) :
408
            dico1[ w1[i].d["id"] ] = w1[i]
409
410
411

        dico2 = {}
        for i in range(len(w2)) :
412
            dico2[ w2[i].d["id"] ] = w2[i]
413

414
        #concat data with the same key ("id")
415
416
417
418
419
420
        #if some data are missing we concat with an empty Windows()
        dico3 = {}
        for key in dico1 :
            if key in dico2 :
                dico3[key] = dico1[key] + dico2[key]
            else :
421
                w=Window(l2)
422
423
424
425
                dico3[key] = dico1[key] + w

        for key in dico2 :
            if key not in dico1 :
426
                w=Window(l1)
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
                dico3[key] = w + dico2[key]
        
        
        #sort by top
        tab = []
        for key in dico3 :
            tab.append((dico3[key], dico3[key].d["top"]))
        tab = sorted(tab, key=itemgetter(1))
        
        #store back in a List
        result=[]
        for i in range(len(tab)) :
            result.append(tab[i][0])
        
        return result
        
    
444
445
    def cut(self, limit, nb_points):
        '''Remove information from sequence/windows who never enter in the most represented sequences. Put this information in 'other' windows.'''
446

447
        length = len(self)
448
        w=[]
449
450
451

        others = OtherWindows(nb_points)

452
453
454
        for clone in self:
            if (int(clone.d["top"]) < limit or limit == 0) :
                w.append(clone)
455
            #else:
456
                #others += clone
457

458
        self.d["clones"] = w #+ list(others) 
459

Mathieu Giraud's avatar
Mathieu Giraud committed
460
        print("### Cut merged file, keeping window in the top %d for at least one point" % limit)
461
462
        return self
        
463
    def load_clntab(self, file_path, *args, **kwargs):
464
465
466
        '''Parser for .clntab file'''

        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
467
        self.d["samples"].d["original_names"] = [file_path]
468
        self.d["samples"].d["producer"] = ["EC-NGS central pipeline"]
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
        
        listw = []
        listc = []
        total_size = 0
        
        fichier = open(file_path,"r")
        for ligne in fichier:
            if "clonotype" in ligne:
                header_map = ligne.replace('\n', '').split('\t')
            else :
                tab = AccessedDict()
                for index, data in enumerate(ligne.split('\t')):
                    tab[header_map[index]] = data
                        
                w=Window(1)
484
485
                w.d["seg"] = {}
                
486
                #w.window=tab["sequence.seq id"] #use sequence id as window for .clntab data
487
                w.d["id"]=tab["sequence.raw nt seq"] #use sequence as window for .clntab data
488
489
                s = int(tab["sequence.size"])
                total_size += s
490
                w.d["reads"] = [ s ]
491

492
493
                w.d["germline"] = tab["sequence.V-GENE and allele"][:3] #system ...
                
494
                w.d["sequence"] = tab["sequence.raw nt seq"]
495
                w.d["seg"]["5"]=tab["sequence.V-GENE and allele"].split('=')[0]
496
                if (tab["sequence.D-GENE and allele"] != "") :
497
498
                    w.d["seg"]["4"]=tab["sequence.D-GENE and allele"].split('=')[0]
                w.d["seg"]["3"]=tab["sequence.J-GENE and allele"].split('=')[0]
499
500
501
502

                # use sequence.JUNCTION to colorize output (this is not exactly V/J !)
                junction = tab.get("sequence.JUNCTION.raw nt seq")
                position = w.d["sequence"].find(junction)
503
                
504
                if position >= 0:
505
506
                    w.d["seg"]["3start"] = position + len(junction)
                    w.d["seg"]["5end"] = position 
507
                else:
508
509
510
511
512
513
                    w.d["seg"]["3start"] = 0
                    w.d["seg"]["5end"] = len(w.d["sequence"])
                w.d["seg"]["3end"]=0
                w.d["seg"]["5start"]=0
                w.d["seg"]["4end"]=0
                w.d["seg"]["4start"]=0
514
                w.d["seg"]["cdr3"] = tab["sequence.JUNCTION.raw nt seq"][3:-3]
515
                    
516
                listw.append((w , w.d["reads"][0]))
517
518

                raw_clonotype = tab.get("clonotype")
519
520
                w.d["name"]=raw_clonotype # w.d["seg"]["5"] + " -x/y/-z " + w.d["seg"]["3"]

521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
                clonotype = raw_clonotype.split(' ')
                if (len(clonotype) > 1) :
                    listc.append((w, raw_clonotype))
                
                #keep data that has not already been stored
                for header in tab.not_accessed_keys():
                    w.d["_"+header] = [tab[header]]

        #sort by sequence_size
        listw = sorted(listw, key=itemgetter(1), reverse=True)
        #sort by clonotype
        listc = sorted(listc, key=itemgetter(1))
        
        #generate data "top"
        for index in range(len(listw)):
            listw[index][0].d["top"]=index+1
537
            self.d["clones"].append(listw[index][0])
538
        
539
540
        self.d["reads"].d["segmented"] = [total_size]
        self.d["reads"].d["total"] = [total_size]
541
542
543
544

        
    def toJson(self, obj):
        '''Serializer for json module'''
545
        if isinstance(obj, ListWindows) or isinstance(obj, Window) or isinstance(obj, Samples) or isinstance(obj, Reads):
546
547
548
            result = {}
            for key in obj.d :
                result[key]= obj.d[key]
Marc Duez's avatar
Marc Duez committed
549
                
550
551
            return result
            raise TypeError(repr(obj) + " fail !") 
552
        
Marc Duez's avatar
Marc Duez committed
553
        else:
554
555
556
557
558
559
            result = {}
            for key in obj :
                result[key]= obj[key]
            
            return result
            raise TypeError(repr(obj) + " fail !") 
560
561
562

    def toPython(self, obj_dict):
        '''Reverse serializer for json module'''
563
        if "samples" in obj_dict:
564
            obj = ListWindows()
565
            obj.d=obj_dict
566
567
            return obj

568
        if "id" in obj_dict:
569
            obj = Window(1)
570
            obj.d=obj_dict
571
            return obj
Marc Duez's avatar
Marc Duez committed
572
        
573
574
        if "total" in obj_dict:
            obj = Reads()
575
            obj.d=obj_dict
576
577
            return obj
            
Marc Duez's avatar
Marc Duez committed
578
579
        if "original_names" in obj_dict:
            obj = Samples()
580
            obj.d=obj_dict
581
            return obj
Marc Duez's avatar
Marc Duez committed
582
            
583
        return obj_dict
584
585
586
587
588
589
        



### some data used for test
w1 = Window(1)
590
w1.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
591
w2 = Window(1)
592
w2.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
593
w3 = Window(1)
594
w3.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
595
w4 = Window(1)
596
w4.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
597
598
599


w5 = Window(1)
600
w5.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
601
w6 = Window(1)
602
w6.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
603
604

lw1 = ListWindows()
605
lw1.d["timestamp"] = 'ts'
606
lw1.d["reads"] = json.loads('{"total": [30], "segmented": [25], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
607
608
lw1.d["clones"].append(w5)
lw1.d["clones"].append(w6)
609
610

w7 = Window(1)
611
w7.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
612
w8 = Window(1)
613
w8.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
614
615

lw2 = ListWindows()
616
lw2.d["timestamp"] = 'ts'
617
lw2.d["reads"] = json.loads('{"total": [40], "segmented": [34], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
618
619
lw2.d["clones"].append(w7)
lw2.d["clones"].append(w8)
620
621
622
623
624
625

    
    

 
def main():
Mathieu Giraud's avatar
Mathieu Giraud committed
626
    print("#", ' '.join(sys.argv))
627
628
629
630
631
632
633

    DESCRIPTION = 'Vidjil utility to parse and regroup list of clones of different timepoints or origins'
    
    #### Argument parser (argparse)

    parser = argparse.ArgumentParser(description= DESCRIPTION,
                                    epilog='''Example:
Mathieu Giraud's avatar
Mathieu Giraud committed
634
  python2 %(prog)s --germline IGH out/Diag/vidjil.data out/MRD-1/vidjil.data out/MRD-2/vidjil.data''',
635
636
637
638
639
640
                                    formatter_class=argparse.RawTextHelpFormatter)


    group_options = parser.add_argument_group() # title='Options and parameters')

    group_options.add_argument('--test', action='store_true', help='run self-tests')
641
    group_options.add_argument('--multi', action='store_true', help='merge different systems from a same timepoint (deprecated, do not use)')
642
    
643
    group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
644
    group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')
645

Marc Duez's avatar
Marc Duez committed
646
    group_options.add_argument('--output', '-o', type=str, default='fused.vidjil', help='output file (%(default)s)')
647
648
649
650
651
652
653
654
655
656
657
658
659
660
    group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)')

    parser.add_argument('file', nargs='+', help='''input files (.vidjil/.cnltab)''')
  
    args = parser.parse_args()
    # print args

    if args.test:
        import doctest
        doctest.testmod(verbose = True)
        sys.exit(0)

    jlist_fused = None

Mathieu Giraud's avatar
Mathieu Giraud committed
661
662
    print("### fuse.py -- " + DESCRIPTION)
    print()
663

Marc Duez's avatar
Marc Duez committed
664
665
666
667
668
669
670
671
    #filtre
    f = []
    for path_name in args.file:
        jlist = ListWindows()
        jlist.load(path_name, args.pipeline)
        f += jlist.getTop(args.top)
    f = sorted(set(f))
    
672
    if args.multi:
673
674
        for path_name in args.file:
            jlist = ListWindows()
675
            jlist.load(path_name, args.pipeline)
Marc Duez's avatar
Marc Duez committed
676
            jlist.build_stat()
677
            
Mathieu Giraud's avatar
Mathieu Giraud committed
678
            print("\t", jlist, end=' ')
679
680
681
682
683

            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused * jlist
684
                
Mathieu Giraud's avatar
Mathieu Giraud committed
685
            print('\t==> merge to', jlist_fused)
686
        jlist_fused.d["system_segmented"] = ordered(jlist_fused.d["system_segmented"], key=lambda sys: ((GERMLINES_ORDER + [sys]).index(sys), sys))
687
        
688
    else:
Mathieu Giraud's avatar
Mathieu Giraud committed
689
        print("### Read and merge input files")
690
691
        for path_name in args.file:
            jlist = ListWindows()
692
            jlist.load(path_name, args.pipeline)
Marc Duez's avatar
Marc Duez committed
693
694
            jlist.build_stat()
            jlist.filter(f)
Marc Duez's avatar
Marc Duez committed
695

696
697
698
699
            w1 = Window(1)
            w2 = Window(2)
            w3 = w1+w2
            
Mathieu Giraud's avatar
Mathieu Giraud committed
700
            print("\t", jlist, end=' ')
701
702
703
704
705
            # Merge lists
            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused + jlist
706
            
Mathieu Giraud's avatar
Mathieu Giraud committed
707
            print('\t==> merge to', jlist_fused)
708

709
    if args.compress:
Mathieu Giraud's avatar
Mathieu Giraud committed
710
711
        print()
        print("### Select point names")
712
        l = jlist_fused.d["samples"].d["original_names"]
713
        ll = interesting_substrings(l)
Mathieu Giraud's avatar
Mathieu Giraud committed
714
715
        print("  <==", l)
        print("  ==>", ll)
716
        jlist_fused.d["samples"].d["names"] = ll
717
    
Mathieu Giraud's avatar
Mathieu Giraud committed
718
    print()
719
    if not args.multi:
720
        jlist_fused.cut(args.top, jlist_fused.d["samples"].d["number"])
Mathieu Giraud's avatar
Mathieu Giraud committed
721
722
    print("\t", jlist_fused) 
    print()
723

724
725
726
727
728
729
730
731
732
    #compute similarity matrix
    fasta = ""
    for i in range(len(jlist_fused.d["clones"])) :
        fasta += ">>" + str(i) + "\n"
        fasta += jlist_fused.d["clones"][i].d["id"] + "\n"
    fasta_file = open("tmp", 'w')
    fasta_file.write(fasta)
    jlist_fused.d["similarity"] = json.loads(subprocess.check_output(["../algo/tools/similarity", "-j", "tmp"]))
    
Mathieu Giraud's avatar
Mathieu Giraud committed
733
    print("### Save merged file")
734
735
736
737
738
739
    jlist_fused.save_json(args.output)

    
    
if  __name__ =='__main__':
    main()