#!/usr/bin/env python
# -*- coding: utf-8 -*-

### fuse.py - Vidjil utility to parse and regroup list of clones of different timepoints or origins

#  This file is part of Vidjil <http://www.vidjil.org>,
#  High-throughput Analysis of V(D)J Immune Repertoire.
#  Copyright (C) 2011-2017 by Bonsai bioinformatics
#  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
#  Contributors: 
#      Marc Duez <marc.duez@vidjil.org>
#      Mathieu Giraud <mathieu.giraud@vidjil.org>
#      The Vidjil Team <contact@vidjil.org>
#
#  "Vidjil" is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  "Vidjil" is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>

from __future__ import print_function

import check_python_version
import sys
import json
import argparse
import os.path
import os
import datetime
import subprocess
import tempfile
from operator import itemgetter, le
from utils import *
from defs import *
from collections import defaultdict

from vidjilparser import VidjilParser

FUSE_VERSION = "vidjil fuse"

TOOL_SIMILARITY = "../algo/tools/similarity"
SIMILARITY_LIMIT = 100

GERMLINES_ORDER = ['TRA', 'TRB', 'TRG', 'TRD', 'DD', 'IGH', 'DHJH', 'IJK', 'IJL'] 
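# Locus display order, used in --multi mode to sort the per-locus counts ('system_segmented')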

####

class Window:
    # Should be renamed "Clone"
    '''storage class for sequence information,
    with some functions to group sequence information
    
    >>> str(w1)
    '<window : [5] 3 aaa>'
    
    >>> str(w3)
    '<window : [8] 4 aaa>'
    
    >>> str(w1 + w3)
    '<window : [5, 8] 3 aaa>'
    
    
    check that other information is conserved
    
    >>> (w2 + w4).d["test"]
    [0, 'plop']
    
    '''
    
    ### init Window with the minimum data required
    def __init__(self, size):
        self.d={}
        self.d["id"] = ""
        self.d["top"] = sys.maxsize
        self.d["reads"] = []
        for i in range(size):
            self.d["reads"].append(0)
        
        
    ### 
    def __iadd__(self, other):
        ### Not used now
        """Add other.reads to self.reads in-place, without extending lists"""

        assert(len(self.d['reads']) == len(other.d['reads']))
        self.d['reads'] = [my + her for (my, her) in zip(self.d['reads'], other.d['reads'])]

        return self
        
    def __add__(self, other):
        """Concat two windows, extending lists such as 'reads'"""
        # per-clone data that is not concatenated across samples
        myList = [ "seg", "top", "id", "sequence", "name", "stats", "germline"]
        obj = Window(1)
        
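        # concatenate_with_padding (from utils) merges the per-sample lists of both
        # clones into obj.d, padding with zeros where a key exists on only one side;
        # the keys listed in myList are skipped here and handled below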
        concatenate_with_padding(obj.d,
                                 self.d, len(self.d["reads"]),
                                 other.d, len(other.d["reads"]),
                                 myList)
                    
        # keep the per-clone data (keys in myList) from the clone with the best (lowest) 'top'
        if other.d["top"] < self.d["top"] :
            for key in other.d :
                if key in myList :
                    obj.d[key] = other.d[key]
        else :
            for key in self.d :
                if key in myList :
                    obj.d[key] = self.d[key]
        
        return obj
        
    def get_nb_reads(self, cid, point=0):
        return self[cid]["reads"][point]

    def latex(self, point=0, base_germline=10000, base=10000, tag=''):
        reads = self.d["reads"][point]
        ratio_germline = float(reads)/base_germline
        ratio = float(reads)/base
        return r"   &   & %7d & %5.2f\%% & of %-4s & %5.2f\%% & %-50s \\ %% %4s %s" % (reads, ratio_germline * 100, self.d['germline'], ratio * 100,
                                                                  self.d["name"] if 'name' in self.d else self.d["id"],
                                                                  tag, self.d["id"])

    ### print essential info about Window
    def __str__(self):
        return "<window : %s %s %s>" % ( self.d["reads"], '*' if self.d["top"] == sys.maxsize else self.d["top"], self.d["id"])
        
class Samples: 

    def __init__(self):
        self.d={}
        self.d["number"] = 1
        self.d["original_names"] = [""]

    def __add__(self, other):
        obj=Samples()

        concatenate_with_padding(obj.d, 
                                 self.d, self.d['number'], 
                                 other.d, other.d['number'],
                                 ['number'])

        obj.d["number"] =  int(self.d["number"]) + int(other.d["number"])
        
        return obj

    def __str__(self):
        return "<Samples: %s>" % self.d

class Diversity: 

    keys = ["index_H_entropy", "index_E_equitability", "index_Ds_diversity"]

    def __init__(self, data=None):
        self.d={}

        for k in self.keys:
            if data is None or k not in data:
                self.d[k] = ["na"]
            else:
                self.d[k]= [data[k]]

    def __add__(self, other):
        for k in self.keys:
            self.d[k].append(other.d[k][0])
        return self

    def __str__(self):
        return "<Diversity: %s>" % self.d
        
class Reads: 

    def __init__(self):
        self.d={}
        self.d["total"] = ["0"]
        self.d["segmented"] = ["0"]
        self.d["germline"] = {}
        self.d['distribution'] = {}

    def __add__(self, other):
        obj=Reads()

        concatenate_with_padding(obj.d['germline'], 
                                 self.d['germline'], len(self.d['total']),
                                 other.d['germline'], len(other.d['total']),
                                 ['total'])
        concatenate_with_padding(obj.d['distribution'],
                                 self.d['distribution'], len(self.d['total']),
                                 other.d['distribution'], len(other.d['total']),
                                 ['total'])
                
        obj.d["total"] = self.d["total"] + other.d["total"]
        obj.d["segmented"] = self.d["segmented"] + other.d["segmented"]
        return obj

    def __str__(self):
        return "<Reads: %s>" % self.d
        
class OtherWindows:
    
    """Aggregate counts of windows that are discarded (due to too small 'top') for each point into several 'others-' windows."""

    def __init__(self, length, ranges = None):
        self.ranges = ranges if ranges is not None else [1000, 100, 10, 1]
        self.length = length
        self.sizes = {}

        for r in self.ranges:
            self.sizes[r] = [0 for i in range(self.length)]


    def __iadd__(self, window):
        """Add to self a window. The different points may land into different 'others-' windows."""

        for i, s in enumerate(window.d["reads"]):
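            # take the first threshold r (ranges are decreasing) with s >= r;
            # e.g. with the default ranges, a point with 37 reads is counted
            # in the 'others-10' window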
            for r in self.ranges:
                if s >= r:
                    break
            self.sizes[r][i] += s
            ## TODO: add seg_stat
            
        # print window, '-->', self.sizes
        return self

    def __iter__(self):
        """Yield the 'others-' windows"""

        for r in self.ranges:
            w = Window(self.length)
            w.d['reads'] = self.sizes[r]
            w.d['window'] = 'others-%d' % r
            w.d['top'] = 0

            print('  --[others]-->', w)
            yield w
        
class ListWindows(VidjilJson):
    '''storage class for sequence information
    
    >>> lw1.info()
    <ListWindows: [25] 2>
    <window : [5] 3 aaa>
    <window : [12] 2 bbb>
    
    >>> lw2.info()
    <ListWindows: [34] 2>
    <window : [8] 4 aaa>
    <window : [2] 8 ccc>
    
    >>> lw3 = lw1 + lw2
    >>> lw3.info()
    <ListWindows: [25, 34] 3>
    <window : [12, 0] 2 bbb>
    <window : [5, 8] 3 aaa>
    <window : [0, 2] 8 ccc>
    >>> lw3.build_stat()
    >>> lw3.d['reads'].d['distribution']['0.1']
    [17, 8]
    >>> lw3.d['reads'].d['distribution']['0.01']
    [0, 2]
    >>> lw3.d['reads'].d['distribution']['0.001']
    [0, 0]
    >>> lw3.d['reads'].d['distribution']['0.0001']
    [0, 0]

    '''
    
    def __init__(self):
        '''init ListWindows with the minimum data required'''
        self.d={}
        self.d["samples"] = Samples()
        self.d["reads"] = Reads()
        self.d["clones"] = []
        self.d["clusters"] = []
        self.d["germlines"] = {}
        
        self.d["vidjil_json_version"] = VIDJIL_JSON_VERSION
        self.d["producer"] = FUSE_VERSION
        self.d["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
    def __str__(self):
        return "<ListWindows: %s %d>" % ( self.d["reads"].d["segmented"], len(self) )

    # Iterator and access functions
    def __iter__(self):
        return self.d["clones"].__iter__()

    def __getitem__(self, i):
        ### not efficient !
        for clone in self:
            if clone.d["id"] == i:
                return clone

    def __len__(self):
        return len(self.d["clones"])

    ### print info about each Window stored
    def info(self):
        print(self)
        for clone in self:
            print(clone)
        
    ### compute statistics about clones
    def build_stat(self):
        ranges = [.1, .01, .001, .0001, .00001, .000001, .0000001]
        result = [[0 for col in range(len(self.d['reads'].d["segmented"]))] for row in range(len(ranges))]
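        # each clone read count s at point i is added to the bucket of the first
        # (largest) ratio r such that s / segmented[i] >= r; e.g. 5 reads out of
        # 25 segmented reads (20%) are counted in the '0.1' bucket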

        for clone in self:
            for i, s in enumerate(clone.d["reads"]):
                for r in range(len(ranges)):
                    if s*1. / self.d['reads'].d['segmented'][i] >= ranges[r]:
                        break 
                result[r][i] += s
                
        for r in range(len(ranges)):
            ratio_in_string = '{0:.10f}'.format(ranges[r]).rstrip('0')
            self.d['reads'].d['distribution'][ratio_in_string] = result[r]
            
        #TODO V/D/J distrib and more 
        
        
    ### save / load to .json
    def save_json(self, output):
        '''save ListWindows in .json format''' 
        print("==>", output)
        with open(output, "w") as f:
            json.dump(self, f, indent=2, default=self.toJson)

    def load(self, string, *args, **kwargs):
        '''
        if not '.clntab' in file_path:
            self.load_vidjil(file_path, *args, **kwargs)
        else:
            self.load_clntab(file_path, *args, **kwargs)
        '''
        self.load_vidjil(string, *args, **kwargs)
        
    def load_vidjil(self, string, pipeline, verbose=True):
        '''init ListWindows from a .vidjil JSON string'''
        
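        # toPython() (defined below) is passed as object_hook so that the plain JSON
        # dictionaries are rebuilt into ListWindows/Window/Reads/Samples objects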
        tmp = json.loads(string, object_hook=self.toPython)
        self.d=tmp.d
        # Be robust against 'null' values for clones
        if not self.d["clones"]:
            self.d["clones"] = []
        #self.check_version(file_path)

        if "diversity" in self.d.keys():
            self.d["diversity"] = Diversity(self.d["diversity"])
        else:
            self.d["diversity"] = Diversity()
        
        if 'distribution' not in self.d['reads'].d:
            self.d['reads'].d['distribution'] = {}
        '''
        if pipeline: 
            # renaming, private pipeline
            f = '/'.join(file_path.split('/')[2:-1])
            if verbose:
                print("[%s]" % f)

        else:
            f = file_path
            if verbose:
                print()
        
        time = os.path.getmtime(file_path)
        self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")]
        '''

        self.id_lengths = defaultdict(int)

        print("%%")
        for clone in self:
            self.id_lengths[len(clone.d['id'])] += 1
        print("%% lengths .vidjil -> ", self.id_lengths)
        try:
            print("%% run_v ->", self.d["samples"].d["producer"], self.d["samples"].d["run_timestamp"])
        except KeyError:
            pass

    def getTop(self, top):
        result = []
        
        for clone in self:
            if clone.d["top"] <= top :
                result.append(clone.d["id"])
        return result
        
    def filter(self, f):
        r = []
        
        reverseList = {}
        for i,w in enumerate(self.d["clones"]) :
            reverseList[w.d["id"]] = i
        
        #filteredList = { k: reverseList[k] for k in f }
        filteredList = dict(filter(lambda t: t[0] in f, reverseList.items()))
        
        for i in filteredList:
            r.append(self.d["clones"][filteredList[i]])
        
        self.d["clones"] = r
        
    ### 
    def __add__(self, other): 
        '''Combine two ListWindows into a unique ListWindows'''
        obj = ListWindows()
        l1 = len(self.d["reads"].d['segmented'])
        l2 = len(other.d["reads"].d['segmented'])

        concatenate_with_padding(obj.d, 
                                 self.d, l1,
                                 other.d, l2,
                                 ["clones", "links", "germlines",
                                  "vidjil_json_version"])
        
        obj.d["clones"]=self.fuseWindows(self.d["clones"], other.d["clones"], l1, l2)
        obj.d["samples"] = self.d["samples"] + other.d["samples"]
        obj.d["reads"] = self.d["reads"] + other.d["reads"]
        obj.d["diversity"] = self.d["diversity"] + other.d["diversity"]
        
        return obj
        
    ###
    def __mul__(self, other):
        
        for i in range(len(self.d["reads_segmented"])):
            self.d["reads_segmented"][i] += other.d["reads_segmented"][i] 
        
        self.d["clones"] += other.d["clones"]
        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        
        self.d["system_segmented"].update(other.d["system_segmented"])
        
        return self
        
    ### 
    def fuseWindows(self, w1, w2, l1, l2) :
        #store data in dict with "id" as key
        dico1 = {}
        for i in range(len(w1)) :
            dico1[ w1[i].d["id"] ] = w1[i]

        dico2 = {}
        for i in range(len(w2)) :
            dico2[ w2[i].d["id"] ] = w2[i]

        #concat data with the same key ("id")
        # if a clone is missing in one file, concatenate it with an empty Window() of the right number of points
        dico3 = {}
        for key in dico1 :
            if key in dico2 :
                dico3[key] = dico1[key] + dico2[key]
            else :
                w=Window(l2)
                dico3[key] = dico1[key] + w

        for key in dico2 :
            if key not in dico1 :
                w=Window(l1)
                dico3[key] = w + dico2[key]
        
        
        #sort by top
        tab = []
        for key in dico3 :
            tab.append((dico3[key], dico3[key].d["top"]))
        tab = sorted(tab, key=itemgetter(1))
        
        #store back in a List
        result=[]
        for i in range(len(tab)) :
            result.append(tab[i][0])
        
        return result
        
    
    def cut(self, limit, nb_points):
        '''Remove clones that never enter the 'limit' most represented clones; their information can be gathered into 'others-' windows.'''

        w=[]

        others = OtherWindows(nb_points)

        for clone in self:
            if (int(clone.d["top"]) < limit or limit == 0) :
                w.append(clone)
            #else:
                #others += clone

        self.d["clones"] = w #+ list(others) 

        print("### Cut merged file, keeping windows in the top %d for at least one point" % limit)
        return self
        
    def load_clntab(self, file_path, *args, **kwargs):
        '''Parser for .clntab file'''

        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        self.d["samples"].d["original_names"] = [file_path]
        self.d["samples"].d["producer"] = ["EC-NGS central pipeline"]
        
        listw = []
        listc = []
        total_size = 0
        
        fichier = open(file_path,"r")
        for ligne in fichier:
            if "clonotype" in ligne:
                header_map = ligne.replace('\n', '').split('\t')
            else :
                tab = AccessedDict()
                for index, data in enumerate(ligne.split('\t')):
                    tab[header_map[index]] = data
                        
                w=Window(1)
                w.d["seg"] = {}
                
                #w.window=tab["sequence.seq id"] #use sequence id as window for .clntab data
                w.d["id"]=tab["sequence.raw nt seq"] #use sequence as window for .clntab data
                s = int(tab["sequence.size"])
                total_size += s
                w.d["reads"] = [ s ]

                w.d["germline"] = tab["sequence.V-GENE and allele"][:3] #system ...
                
                w.d["sequence"] = tab["sequence.raw nt seq"]
                w.d["seg"]["5"]=tab["sequence.V-GENE and allele"].split('=')[0]
                if (tab["sequence.D-GENE and allele"] != "") :
                    w.d["seg"]["4"]=tab["sequence.D-GENE and allele"].split('=')[0]
                w.d["seg"]["3"]=tab["sequence.J-GENE and allele"].split('=')[0]

                # use sequence.JUNCTION to colorize output (this is not exactly V/J !)
                junction = tab.get("sequence.JUNCTION.raw nt seq")
                position = w.d["sequence"].find(junction)
                
                if position >= 0:
                    w.d["seg"]["3start"] = position + len(junction)
                    w.d["seg"]["5end"] = position 
                else:
                    w.d["seg"]["3start"] = 0
                    w.d["seg"]["5end"] = len(w.d["sequence"])
                w.d["seg"]["3end"]=0
                w.d["seg"]["5start"]=0
                w.d["seg"]["4end"]=0
                w.d["seg"]["4start"]=0
                w.d["seg"]["cdr3"] = tab["sequence.JUNCTION.raw nt seq"][3:-3]
                    
                listw.append((w , w.d["reads"][0]))

                raw_clonotype = tab.get("clonotype")
                w.d["name"]=raw_clonotype # w.d["seg"]["5"] + " -x/y/-z " + w.d["seg"]["3"]

                clonotype = raw_clonotype.split(' ')
                if (len(clonotype) > 1) :
                    listc.append((w, raw_clonotype))
                
                #keep data that has not already been stored
                for header in tab.not_accessed_keys():
                    w.d["_"+header] = [tab[header]]

        #sort by sequence_size
        listw = sorted(listw, key=itemgetter(1), reverse=True)
        #sort by clonotype
        listc = sorted(listc, key=itemgetter(1))
        
        #generate data "top"
        for index in range(len(listw)):
            listw[index][0].d["top"]=index+1
            self.d["clones"].append(listw[index][0])
        
        self.d["reads"].d["segmented"] = [total_size]
        self.d["reads"].d["total"] = [total_size]

        
    def toJson(self, obj):
        '''Serializer for json module'''
        if isinstance(obj, ListWindows) or isinstance(obj, Window) or isinstance(obj, Samples) or isinstance(obj, Reads) or isinstance(obj, Diversity):
            result = {}
            for key in obj.d :
                result[key]= obj.d[key]
                
            return result
            raise TypeError(repr(obj) + " fail !") 
        
        else:
            result = {}
            for key in obj :
                result[key]= obj[key]
            
            return result
            raise TypeError(repr(obj) + " fail !") 

    def toPython(self, obj_dict):
        '''Reverse serializer for json module'''
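        # dispatch on a characteristic key to decide which class to rebuild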
        if "samples" in obj_dict:
            obj = ListWindows()
            obj.d=obj_dict
            return obj

        if "id" in obj_dict:
            obj = Window(1)
            obj.d=obj_dict
            return obj
        
        if "total" in obj_dict:
            obj = Reads()
            obj.d=obj_dict
            return obj
            
        if "original_names" in obj_dict:
            obj = Samples()
            obj.d=obj_dict
            return obj
            
        return obj_dict
        



### some data used for test
w1 = Window(1)
w1.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
w2 = Window(1)
w2.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
w3 = Window(1)
w3.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
w4 = Window(1)
w4.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }


w5 = Window(1)
w5.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
w6 = Window(1)
w6.d ={"id" : "bbb", "reads" : [12], "top" : 2 }

lw1 = ListWindows()
lw1.d["timestamp"] = 'ts'
lw1.d["reads"] = json.loads('{"total": [30], "segmented": [25], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
lw1.d["clones"].append(w5)
lw1.d["clones"].append(w6)
lw1.d["diversity"] = Diversity()


w7 = Window(1)
w7.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
w8 = Window(1)
w8.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }

lw2 = ListWindows()
lw2.d["timestamp"] = 'ts'
lw2.d["reads"] = json.loads('{"total": [40], "segmented": [34], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
lw2.d["clones"].append(w7)
lw2.d["clones"].append(w8)
lw2.d["diversity"] = Diversity()
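# The objects above (w1..w8, lw1, lw2) are the fixtures referenced by the doctests;
# doctest.testmod() picks them up when fuse.py is run with --test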

    
    

 
def main():
    print("#", ' '.join(sys.argv))

    DESCRIPTION = 'Vidjil utility to parse and regroup list of clones of different timepoints or origins'
    
    #### Argument parser (argparse)

    parser = argparse.ArgumentParser(description= DESCRIPTION,
                                    epilog='''Example:
  python2 %(prog)s --germline IGH out/Diag/vidjil.data out/MRD-1/vidjil.data out/MRD-2/vidjil.data''',
                                    formatter_class=argparse.RawTextHelpFormatter)


    group_options = parser.add_argument_group() # title='Options and parameters')

    group_options.add_argument('--test', action='store_true', help='run self-tests')
    group_options.add_argument('--multi', action='store_true', help='merge different systems from a same timepoint (deprecated, do not use)')
    
    group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
    group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')

    group_options.add_argument('--output', '-o', type=str, default='fused.vidjil', help='output file (%(default)s)')
    group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)')

    group_options.add_argument('--first', '-f', type=int, default=0, help='take into account only the first FIRST files (0 for all) (%(default)s)')

    parser.add_argument('file', nargs='+', help='''input files (.vidjil/.clntab)''')
  
    args = parser.parse_args()
    # print args

    if args.test:
        import doctest
        doctest.testmod(verbose = True)
        sys.exit(0)

    jlist_fused = None

    print("### fuse.py -- " + DESCRIPTION)
    print()

    files = args.file
    if args.first:
        if len(files) > args.first:
            print("! %d files were given. We take into account the first %d files." % (len(files), args.first))
        files = files[:args.first]

    # filter
    f = []
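    # First pass: collect the ids of the clones ranked in the top args.top
    # in at least one input file (see the second pass below)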

    vparser = VidjilParser()
    vparser.addPrefix('clones', 'clones.item.top', le, args.top)
    for path_name in files:
        json_clones = vparser.extract(path_name)
        clones = json.loads(json_clones)
        f += [c['id'] for c in clones["clones"]]
    f = sorted(set(f))
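    # Second pass: extract from each input file only these clones, together
    # with 'reads', 'samples' and 'vidjil_json_version'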
    
    vparser.reset()
    vparser.addPrefix('reads')
    vparser.addPrefix('clones.item', 'clones.item.id', (lambda x, y: x in y), f)
    vparser.addPrefix('samples')
    vparser.addPrefix('vidjil_json_version')
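    # --multi merges several systems from the same sample (ListWindows.__mul__),
    # whereas the default mode fuses different samples point by point (ListWindows.__add__)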
    if args.multi:
        for path_name in files:
            json_reads = vparser.extract(path_name)
            jlist = ListWindows()
            jlist.load(json_reads, args.pipeline)
            #jlist.build_stat()
            
            print("\t", jlist, end=' ')

            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused * jlist
                
            print('\t==> merge to', jlist_fused)
        jlist_fused.d["system_segmented"] = ordered(jlist_fused.d["system_segmented"], key=lambda sys: ((GERMLINES_ORDER + [sys]).index(sys), sys))
        
    else:
        print("### Read and merge input files")
        for path_name in files:
            json_reads = vparser.extract(path_name)
            jlist = ListWindows()
            jlist.load(json_reads, args.pipeline)
            #jlist.build_stat()
            #jlist.filter(f)

            w1 = Window(1)
            w2 = Window(2)
            w3 = w1+w2
            
            print("\t", jlist, end=' ')
            # Merge lists
            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused + jlist
            
            print('\t==> merge to', jlist_fused)

    if args.compress:
        print()
        print("### Select point names")
        l = jlist_fused.d["samples"].d["original_names"]
        ll = interesting_substrings(l)
        print("  <==", l)
        print("  ==>", ll)
        jlist_fused.d["samples"].d["names"] = ll
    
    print()
    if not args.multi:
        jlist_fused.cut(args.top, jlist_fused.d["samples"].d["number"])
    print("\t", jlist_fused) 
    print()

    #compute similarity matrix
    if len(jlist_fused.d["clones"]) < SIMILARITY_LIMIT :
        fasta = ""
        for i in range(len(jlist_fused.d["clones"])) :
            fasta += ">>" + str(i) + "\n"
            fasta += jlist_fused.d["clones"][i].d["id"] + "\n"
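        # the clone windows are written to a temporary fasta-like file and passed to
        # the external TOOL_SIMILARITY binary; its JSON output is stored under 'similarity'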
        fasta_file = tempfile.NamedTemporaryFile(mode="w", delete=False)
        fasta_file.write(fasta)
        fasta_file.close()   # flush the sequences to disk before calling the external tool
        try:
            out = subprocess.check_output([TOOL_SIMILARITY, "-j", fasta_file.name])
            jlist_fused.d["similarity"] = json.loads(out)
        except OSError:
            print("! failed: %s" % TOOL_SIMILARITY)
        finally:
            os.unlink(fasta_file.name)
    else : 
        jlist_fused.d["similarity"] = []
        
    print("### Save merged file")
    jlist_fused.save_json(args.output)
    
    
    
if __name__ == '__main__':
    main()