fuse.py 24.5 KB
Newer Older
1
2
3
4
5
#!/usr/bin/env python
# -*- coding: utf-8 -*-

### fuse.py - Vidjil utility to parse and regroup list of clones of different timepoints or origins

6
7
8
9
10
11
12
13
#  This file is part of Vidjil <http://www.vidjil.org>,
#  High-throughput Analysis of V(D)J Immune Repertoire.
#  Copyright (C) 2011, 2012, 2013, 2014, 2015 by Bonsai bioinformatics 
#  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
#  Contributors: 
#      Marc Duez <marc.duez@vidjil.org>
#      Mathieu Giraud <mathieu.giraud@vidjil.org>
#      The Vidjil Team <contact@vidjil.org>
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#
#  "Vidjil" is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  "Vidjil" is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>

Mathieu Giraud's avatar
Mathieu Giraud committed
28
from __future__ import print_function
29

30
import check_python_version
31
import sys
32
33
34
35
import json
import argparse
import time
import copy
36
import os.path
Mikaël Salson's avatar
Mikaël Salson committed
37
import os
Marc Duez's avatar
Marc Duez committed
38
import datetime
39
import subprocess
40
import tempfile
41
from operator import itemgetter
42
from utils import *
43
from defs import *
44
from collections import defaultdict
45

46
FUSE_VERSION = "vidjil fuse"
47

48
TOOL_SIMILARITY = "../algo/tools/similarity"
49
SIMILARITY_LIMIT = 1000
50

51
52
GERMLINES_ORDER = ['TRA', 'TRB', 'TRG', 'TRD', 'DD', 'IGH', 'DHJH', 'IJK', 'IJL'] 

53
54
55
####

class Window:
56
    # Should be renamed "Clone"
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
    '''storage class for sequence informations
    with some function to group sequence informations
    
    >>> str(w1)
    '<window : [5] 3 aaa>'
    
    >>> str(w3)
    '<window : [8] 4 aaa>'
    
    >>> str(w1 + w3)
    '<window : [5, 8] 3 aaa>'
    
    
    check if other information are conserved
    
    >>> (w2 + w4).d["test"]
    [0, 'plop']
    
    '''
    
    ### init Window with the minimum data required
    def __init__(self, size):
        self.d={}
80
        self.d["id"] = ""
Mathieu Giraud's avatar
Mathieu Giraud committed
81
        self.d["top"] = sys.maxsize
82
        self.d["reads"] = []
83
        for i in range(size):
84
            self.d["reads"].append(0)
85
86
87
        
        
    ### 
88
89
    def __iadd__(self, other):
        ### Not used now
90
        """Add other.reads to self.reads in-place, without extending lists"""
91

92
93
        assert(len(self.d['reads']) == len(other.d['reads']))
        self.d['reads'] = [my + her for (my, her) in zip(self.d['reads'], other.d['reads'])]
94
95
96

        return self
        
97
    def __add__(self, other):
98
        """Concat two windows, extending lists such as 'reads'"""
99
        #data we don't need to duplicate
100
        myList = [ "seg", "top", "id", "sequence", "name", "id", "stats", "germline"]
101
102
        obj = Window(1)
        
103
104
105
106
        concatenate_with_padding(obj.d,
                                 self.d, len(self.d["reads"]),
                                 other.d, len(other.d["reads"]),
                                 myList)
107
108
109
110
111
112
113
114
115
116
117
118
119
                    
        #keep other data who don't need to be concat
        if other.d["top"] < self.d["top"] :
            for key in other.d :
                if key in myList :
                    obj.d[key] = other.d[key]
        else :
            for key in self.d :
                if key in myList :
                    obj.d[key] = self.d[key]
        
        return obj
        
120
121
122
    def get_nb_reads(self, id, point=0):
        return self[id]["reads"][point]

123
    def latex(self, point=0, base_germline=10000, base=10000, tag=''):
124
        reads = self.d["reads"][point]
125
        ratio_germline = float(reads)/base_germline
126
        ratio = float(reads)/base
127
        return r"   &   & %7d & %5.2f\%% & of %-4s & %5.2f\%% & %-50s \\ %% %4s %s" % (reads, ratio_germline * 100, self.d['germline'], ratio * 100,
128
129
                                                                  self.d["name"] if 'name' in self.d else self.d["id"],
                                                                  tag, self.d["id"])
130

131
132
    ### print essential info about Window
    def __str__(self):
Mathieu Giraud's avatar
Mathieu Giraud committed
133
        return "<window : %s %s %s>" % ( self.d["reads"], '*' if self.d["top"] == sys.maxsize else self.d["top"], self.d["id"])
134
        
Marc Duez's avatar
Marc Duez committed
135
class Samples: 
136

Marc Duez's avatar
Marc Duez committed
137
138
    def __init__(self):
        self.d={}
139
        self.d["number"] = 1
Marc Duez's avatar
Marc Duez committed
140
141
142
143
        self.d["original_names"] = [""]

    def __add__(self, other):
        obj=Samples()
144

145
146
147
148
149
        concatenate_with_padding(obj.d, 
                                 self.d, self.d['number'], 
                                 other.d, other.d['number'],
                                 ['number'])

150
        obj.d["number"] =  int(self.d["number"]) + int(other.d["number"])
Marc Duez's avatar
Marc Duez committed
151
152
        
        return obj
153

154
155
156
    def __str__(self):
        return "<Samples: %s>" % self.d

157
158
159
160
161
162
163
class Reads: 

    def __init__(self):
        self.d={}
        self.d["total"] = ["0"]
        self.d["segmented"] = ["0"]
        self.d["germline"] = {}
164
        self.d['distribution'] = {}
165
166
167

    def __add__(self, other):
        obj=Reads()
168
169
170
171
172

        concatenate_with_padding(obj.d['germline'], 
                                 self.d['germline'], len(self.d['total']),
                                 other.d['germline'], len(other.d['total']),
                                 ['total'])
173
174
175
176
        concatenate_with_padding(obj.d['distribution'],
                                 self.d['distribution'], len(self.d['total']),
                                 other.d['distribution'], len(other.d['total']),
                                 ['total'])
177
178
179
180
                
        obj.d["total"] = self.d["total"] + other.d["total"]
        obj.d["segmented"] = self.d["segmented"] + other.d["segmented"]
        return obj
181
182
183

    def __str__(self):
        return "<Reads: %s>" % self.d
184
        
185
186
187
188
class OtherWindows:
    
    """Aggregate counts of windows that are discarded (due to too small 'top') for each point into several 'others-' windows."""

189
190
    def __init__(self, length, ranges = None):
        self.ranges = ranges if ranges is not None else [1000, 100, 10, 1]
191
192
193
194
195
196
197
198
199
200
        self.length = length
        self.sizes = {}

        for r in self.ranges:
            self.sizes[r] = [0 for i in range(self.length)]


    def __iadd__(self, window):
        """Add to self a window. The different points may land into different 'others-' windows."""

201
        for i, s in enumerate(window.d["reads"]):
202
203
204
205
206
            for r in self.ranges:
                if s >= r:
                     break
            self.sizes[r][i] += s
            ## TODO: add seg_stat
207
208
            
        # print window, '-->', self.sizes
209
210
211
212
213
214
215
        return self

    def __iter__(self):
        """Yield the 'others-' windows"""

        for r in self.ranges:
            w = Window(self.length)
216
            w.d['reads'] = self.sizes[r]
217
218
219
            w.d['window'] = 'others-%d' % r
            w.d['top'] = 0

Mathieu Giraud's avatar
Mathieu Giraud committed
220
            print('  --[others]-->', w)
221
            yield w
222
        
223
class ListWindows(VidjilJson):
224
225
226
    '''storage class for sequences informations 
    
    >>> lw1.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
227
    <ListWindows: [25] 2>
228
229
230
231
    <window : [5] 3 aaa>
    <window : [12] 2 bbb>
    
    >>> lw2.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
232
    <ListWindows: [34] 2>
233
234
235
    <window : [8] 4 aaa>
    <window : [2] 8 ccc>
    
Mathieu Giraud's avatar
Mathieu Giraud committed
236
    >>> lw3 = lw1 + lw2
237
    >>> lw3.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
238
    <ListWindows: [25, 34] 3>
239
240
241
    <window : [12, 0] 2 bbb>
    <window : [5, 8] 3 aaa>
    <window : [0, 2] 8 ccc>
242
243
244
245
246
247
248
249
250
251
    >>> lw3.build_stat()
    >>> lw3.d['reads'].d['distribution']['0.1']
    [17, 8]
    >>> lw3.d['reads'].d['distribution']['0.01']
    [0, 2]
    >>> lw3.d['reads'].d['distribution']['0.001']
    [0, 0]
    >>> lw3.d['reads'].d['distribution']['0.0001']
    [0, 0]

252
253
254
255
256
    '''
    
    def __init__(self):
        '''init ListWindows with the minimum data required'''
        self.d={}
257
258
        self.d["samples"] = Samples()
        self.d["reads"] = Reads()
259
260
        self.d["clones"] = []
        self.d["clusters"] = []
261
        self.d["germlines"] = {}
262
        
263
264
265
266
        self.d["vidjil_json_version"] = VIDJIL_JSON_VERSION
        self.d["producer"] = FUSE_VERSION
        self.d["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
267
    def __str__(self):
268
269
270
271
272
273
274
275
276
277
278
279
280
281
        return "<ListWindows: %s %d>" % ( self.d["reads"].d["segmented"], len(self) )

    # Iterator and access functions
    def __iter__(self):
        return self.d["clones"].__iter__()

    def __getitem__(self, i):
        ### not efficient !
        for clone in self:
            if clone.d["id"] == i:
                return clone

    def __len__(self):
        return len(self.d["clones"])
282
283
284

    ### print info about each Windows stored 
    def info(self):
Mathieu Giraud's avatar
Mathieu Giraud committed
285
        print(self)
286
        for clone in self:
Mathieu Giraud's avatar
Mathieu Giraud committed
287
            print(clone)
288
        
289
290
    ### compute statistics about clones
    def build_stat(self):
291
        ranges = [.1, .01, .001, .0001, .00001, .000001, .0000001]
292
        result = [[0 for col in range(len(self.d['reads'].d["segmented"]))] for row in range(len(ranges))]
293

294
295
        for clone in self:
            for i, s in enumerate(clone.d["reads"]):
296
                for r in range(len(ranges)):
297
                    if s*1. / self.d['reads'].d['segmented'][i] >= ranges[r]:
298
299
300
301
                        break 
                result[r][i] += s
                
        for r in range(len(ranges)):
302
303
            ratio_in_string = '{0:.10f}'.format(ranges[r]).rstrip('0')
            self.d['reads'].d['distribution'][ratio_in_string] = result[r]
304
305
306
307
            
        #TODO V/D/J distrib and more 
        
        
308
309
310
    ### save / load to .json
    def save_json(self, output):
        '''save ListWindows in .json format''' 
Mathieu Giraud's avatar
Mathieu Giraud committed
311
        print("==>", output)
312
313
        with open(output, "w") as file:
            json.dump(self, file, indent=2, default=self.toJson)
314
315
316
317
318
319
320
321

    def load(self, file_path, *args, **kwargs):
        if not '.clntab' in file_path:
            self.load_vidjil(file_path, *args, **kwargs)
        else:
            self.load_clntab(file_path, *args, **kwargs)
        
    def load_vidjil(self, file_path, pipeline, verbose=True):
322
323
324
325
326
        '''init listWindows with data file
        Detects and selects the parser according to the file extension.'''

        # name = file_path.split("/")[-1]
        extension = file_path.split('.')[-1]
327
328

        if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
329
            print("<==", file_path, "\t", end=' ')
330
        
331
        with open(file_path, "r") as file:
332
333
                tmp = json.load(file, object_hook=self.toPython)     
                self.d=tmp.d
334
335
336
                # Be robust against 'null' values for clones
                if not self.d["clones"]:
                    self.d["clones"] = []
337
338
                self.check_version(file_path)
        
339
340
        if 'distribution' not in self.d['reads'].d:
            self.d['reads'].d['distribution'] = {}
341
342
343
        if pipeline: 
            # renaming, private pipeline
            f = '/'.join(file_path.split('/')[2:-1])
344
            if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
345
                print("[%s]" % f)
346
347
348

        else:
            f = file_path
349
            if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
350
                print()
351
        
Marc Duez's avatar
Marc Duez committed
352
        time = os.path.getmtime(file_path)
353
        self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")]
354

355
356
357
358
359
360
361
362
363
364
365
        self.id_lengths = defaultdict(int)

        print("%%")
        for clone in self:
            self.id_lengths[len(clone.d['id'])] += 1
        print ("%% lengths .vidjil -> ", self.id_lengths)
        try:
            print("%% run_v ->", self.d["samples"].d["producer"], self.d["samples"].d["run_timestamp"])
        except KeyError:
            pass

Marc Duez's avatar
Marc Duez committed
366
367
368
    def getTop(self, top):
        result = []
        
369
370
371
        for clone in self:
            if clone.d["top"] <= top :
                result.append(clone.d["id"])
Marc Duez's avatar
Marc Duez committed
372
373
374
375
376
        return result
        
    def filter(self, f):
        r = []
        
Marc Duez's avatar
Marc Duez committed
377
        reverseList = {}
378
379
        for i,w in enumerate(self.d["clones"]) :
            reverseList[w.d["id"]] = i
Marc Duez's avatar
Marc Duez committed
380
381
382
        
        #filteredList = { k: reverseList[k] for k in f }
        filteredList = dict(filter(lambda t: t[0] in f, reverseList.items()))
Marc Duez's avatar
Marc Duez committed
383
        
Marc Duez's avatar
Marc Duez committed
384
        for i in filteredList:
385
            r.append(self.d["clones"][filteredList[i]])
Marc Duez's avatar
Marc Duez committed
386
        
387
        self.d["clones"] = r
Marc Duez's avatar
Marc Duez committed
388
        
389
390
391
392
    ### 
    def __add__(self, other): 
        '''Combine two ListWindows into a unique ListWindows'''
        obj = ListWindows()
393
394
        l1 = len(self.d["reads"].d['segmented'])
        l2 = len(other.d["reads"].d['segmented'])
395

396
397
398
        concatenate_with_padding(obj.d, 
                                 self.d, l1,
                                 other.d, l2,
399
400
                                 ["clones", "links", "germlines",
                                  "vidjil_json_version"])
401
        
402
        obj.d["clones"]=self.fuseWindows(self.d["clones"], other.d["clones"], l1, l2)
403
404
        obj.d["samples"] = self.d["samples"] + other.d["samples"]
        obj.d["reads"] = self.d["reads"] + other.d["reads"]
405
        
406
407
        return obj
        
408
409
410
411
412
413
    ###
    def __mul__(self, other):
        
        for i in range(len(self.d["reads_segmented"])):
            self.d["reads_segmented"][i] += other.d["reads_segmented"][i] 
        
414
        self.d["clones"] += other.d["clones"]
415
416
417
418
419
420
        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        
        self.d["system_segmented"].update(other.d["system_segmented"])
        
        return self
        
421
    ### 
422
    def fuseWindows(self, w1, w2, l1, l2) :
423
        #store data in dict with "id" as key
424
425
        dico1 = {}
        for i in range(len(w1)) :
426
            dico1[ w1[i].d["id"] ] = w1[i]
427
428
429

        dico2 = {}
        for i in range(len(w2)) :
430
            dico2[ w2[i].d["id"] ] = w2[i]
431

432
        #concat data with the same key ("id")
433
434
435
436
437
438
        #if some data are missing we concat with an empty Windows()
        dico3 = {}
        for key in dico1 :
            if key in dico2 :
                dico3[key] = dico1[key] + dico2[key]
            else :
439
                w=Window(l2)
440
441
442
443
                dico3[key] = dico1[key] + w

        for key in dico2 :
            if key not in dico1 :
444
                w=Window(l1)
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
                dico3[key] = w + dico2[key]
        
        
        #sort by top
        tab = []
        for key in dico3 :
            tab.append((dico3[key], dico3[key].d["top"]))
        tab = sorted(tab, key=itemgetter(1))
        
        #store back in a List
        result=[]
        for i in range(len(tab)) :
            result.append(tab[i][0])
        
        return result
        
    
462
463
    def cut(self, limit, nb_points):
        '''Remove information from sequence/windows who never enter in the most represented sequences. Put this information in 'other' windows.'''
464

465
        length = len(self)
466
        w=[]
467
468
469

        others = OtherWindows(nb_points)

470
471
472
        for clone in self:
            if (int(clone.d["top"]) < limit or limit == 0) :
                w.append(clone)
473
            #else:
474
                #others += clone
475

476
        self.d["clones"] = w #+ list(others) 
477

Mathieu Giraud's avatar
Mathieu Giraud committed
478
        print("### Cut merged file, keeping window in the top %d for at least one point" % limit)
479
480
        return self
        
481
    def load_clntab(self, file_path, *args, **kwargs):
482
483
484
        '''Parser for .clntab file'''

        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
485
        self.d["samples"].d["original_names"] = [file_path]
486
        self.d["samples"].d["producer"] = ["EC-NGS central pipeline"]
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
        
        listw = []
        listc = []
        total_size = 0
        
        fichier = open(file_path,"r")
        for ligne in fichier:
            if "clonotype" in ligne:
                header_map = ligne.replace('\n', '').split('\t')
            else :
                tab = AccessedDict()
                for index, data in enumerate(ligne.split('\t')):
                    tab[header_map[index]] = data
                        
                w=Window(1)
502
503
                w.d["seg"] = {}
                
504
                #w.window=tab["sequence.seq id"] #use sequence id as window for .clntab data
505
                w.d["id"]=tab["sequence.raw nt seq"] #use sequence as window for .clntab data
506
507
                s = int(tab["sequence.size"])
                total_size += s
508
                w.d["reads"] = [ s ]
509

510
511
                w.d["germline"] = tab["sequence.V-GENE and allele"][:3] #system ...
                
512
                w.d["sequence"] = tab["sequence.raw nt seq"]
513
                w.d["seg"]["5"]=tab["sequence.V-GENE and allele"].split('=')[0]
514
                if (tab["sequence.D-GENE and allele"] != "") :
515
516
                    w.d["seg"]["4"]=tab["sequence.D-GENE and allele"].split('=')[0]
                w.d["seg"]["3"]=tab["sequence.J-GENE and allele"].split('=')[0]
517
518
519
520

                # use sequence.JUNCTION to colorize output (this is not exactly V/J !)
                junction = tab.get("sequence.JUNCTION.raw nt seq")
                position = w.d["sequence"].find(junction)
521
                
522
                if position >= 0:
523
524
                    w.d["seg"]["3start"] = position + len(junction)
                    w.d["seg"]["5end"] = position 
525
                else:
526
527
528
529
530
531
                    w.d["seg"]["3start"] = 0
                    w.d["seg"]["5end"] = len(w.d["sequence"])
                w.d["seg"]["3end"]=0
                w.d["seg"]["5start"]=0
                w.d["seg"]["4end"]=0
                w.d["seg"]["4start"]=0
532
                w.d["seg"]["cdr3"] = tab["sequence.JUNCTION.raw nt seq"][3:-3]
533
                    
534
                listw.append((w , w.d["reads"][0]))
535
536

                raw_clonotype = tab.get("clonotype")
537
538
                w.d["name"]=raw_clonotype # w.d["seg"]["5"] + " -x/y/-z " + w.d["seg"]["3"]

539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
                clonotype = raw_clonotype.split(' ')
                if (len(clonotype) > 1) :
                    listc.append((w, raw_clonotype))
                
                #keep data that has not already been stored
                for header in tab.not_accessed_keys():
                    w.d["_"+header] = [tab[header]]

        #sort by sequence_size
        listw = sorted(listw, key=itemgetter(1), reverse=True)
        #sort by clonotype
        listc = sorted(listc, key=itemgetter(1))
        
        #generate data "top"
        for index in range(len(listw)):
            listw[index][0].d["top"]=index+1
555
            self.d["clones"].append(listw[index][0])
556
        
557
558
        self.d["reads"].d["segmented"] = [total_size]
        self.d["reads"].d["total"] = [total_size]
559
560
561
562

        
    def toJson(self, obj):
        '''Serializer for json module'''
563
        if isinstance(obj, ListWindows) or isinstance(obj, Window) or isinstance(obj, Samples) or isinstance(obj, Reads):
564
565
566
            result = {}
            for key in obj.d :
                result[key]= obj.d[key]
Marc Duez's avatar
Marc Duez committed
567
                
568
569
            return result
            raise TypeError(repr(obj) + " fail !") 
570
        
Marc Duez's avatar
Marc Duez committed
571
        else:
572
573
574
575
576
577
            result = {}
            for key in obj :
                result[key]= obj[key]
            
            return result
            raise TypeError(repr(obj) + " fail !") 
578
579
580

    def toPython(self, obj_dict):
        '''Reverse serializer for json module'''
581
        if "samples" in obj_dict:
582
            obj = ListWindows()
583
            obj.d=obj_dict
584
585
            return obj

586
        if "id" in obj_dict:
587
            obj = Window(1)
588
            obj.d=obj_dict
589
            return obj
Marc Duez's avatar
Marc Duez committed
590
        
591
592
        if "total" in obj_dict:
            obj = Reads()
593
            obj.d=obj_dict
594
595
            return obj
            
Marc Duez's avatar
Marc Duez committed
596
597
        if "original_names" in obj_dict:
            obj = Samples()
598
            obj.d=obj_dict
599
            return obj
Marc Duez's avatar
Marc Duez committed
600
            
601
        return obj_dict
602
603
604
605
606
607
        



### some data used for test
w1 = Window(1)
608
w1.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
609
w2 = Window(1)
610
w2.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
611
w3 = Window(1)
612
w3.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
613
w4 = Window(1)
614
w4.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
615
616
617


w5 = Window(1)
618
w5.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
619
w6 = Window(1)
620
w6.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
621
622

lw1 = ListWindows()
623
lw1.d["timestamp"] = 'ts'
624
lw1.d["reads"] = json.loads('{"total": [30], "segmented": [25], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
625
626
lw1.d["clones"].append(w5)
lw1.d["clones"].append(w6)
627
628

w7 = Window(1)
629
w7.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
630
w8 = Window(1)
631
w8.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
632
633

lw2 = ListWindows()
634
lw2.d["timestamp"] = 'ts'
635
lw2.d["reads"] = json.loads('{"total": [40], "segmented": [34], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
636
637
lw2.d["clones"].append(w7)
lw2.d["clones"].append(w8)
638
639
640
641
642
643

    
    

 
def main():
Mathieu Giraud's avatar
Mathieu Giraud committed
644
    print("#", ' '.join(sys.argv))
645
646
647
648
649
650
651

    DESCRIPTION = 'Vidjil utility to parse and regroup list of clones of different timepoints or origins'
    
    #### Argument parser (argparse)

    parser = argparse.ArgumentParser(description= DESCRIPTION,
                                    epilog='''Example:
Mathieu Giraud's avatar
Mathieu Giraud committed
652
  python2 %(prog)s --germline IGH out/Diag/vidjil.data out/MRD-1/vidjil.data out/MRD-2/vidjil.data''',
653
654
655
656
657
658
                                    formatter_class=argparse.RawTextHelpFormatter)


    group_options = parser.add_argument_group() # title='Options and parameters')

    group_options.add_argument('--test', action='store_true', help='run self-tests')
659
    group_options.add_argument('--multi', action='store_true', help='merge different systems from a same timepoint (deprecated, do not use)')
660
    
661
    group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
662
    group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')
663

Marc Duez's avatar
Marc Duez committed
664
    group_options.add_argument('--output', '-o', type=str, default='fused.vidjil', help='output file (%(default)s)')
665
666
667
668
669
670
671
672
673
674
675
676
677
678
    group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)')

    parser.add_argument('file', nargs='+', help='''input files (.vidjil/.cnltab)''')
  
    args = parser.parse_args()
    # print args

    if args.test:
        import doctest
        doctest.testmod(verbose = True)
        sys.exit(0)

    jlist_fused = None

Mathieu Giraud's avatar
Mathieu Giraud committed
679
680
    print("### fuse.py -- " + DESCRIPTION)
    print()
681

Marc Duez's avatar
Marc Duez committed
682
683
684
685
686
687
688
689
    #filtre
    f = []
    for path_name in args.file:
        jlist = ListWindows()
        jlist.load(path_name, args.pipeline)
        f += jlist.getTop(args.top)
    f = sorted(set(f))
    
690
    if args.multi:
691
692
        for path_name in args.file:
            jlist = ListWindows()
693
            jlist.load(path_name, args.pipeline)
Marc Duez's avatar
Marc Duez committed
694
            jlist.build_stat()
695
            
Mathieu Giraud's avatar
Mathieu Giraud committed
696
            print("\t", jlist, end=' ')
697
698
699
700
701

            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused * jlist
702
                
Mathieu Giraud's avatar
Mathieu Giraud committed
703
            print('\t==> merge to', jlist_fused)
704
        jlist_fused.d["system_segmented"] = ordered(jlist_fused.d["system_segmented"], key=lambda sys: ((GERMLINES_ORDER + [sys]).index(sys), sys))
705
        
706
    else:
Mathieu Giraud's avatar
Mathieu Giraud committed
707
        print("### Read and merge input files")
708
709
        for path_name in args.file:
            jlist = ListWindows()
710
            jlist.load(path_name, args.pipeline)
Marc Duez's avatar
Marc Duez committed
711
712
            jlist.build_stat()
            jlist.filter(f)
Marc Duez's avatar
Marc Duez committed
713

714
715
716
717
            w1 = Window(1)
            w2 = Window(2)
            w3 = w1+w2
            
Mathieu Giraud's avatar
Mathieu Giraud committed
718
            print("\t", jlist, end=' ')
719
720
721
722
723
            # Merge lists
            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused + jlist
724
            
Mathieu Giraud's avatar
Mathieu Giraud committed
725
            print('\t==> merge to', jlist_fused)
726

727
    if args.compress:
Mathieu Giraud's avatar
Mathieu Giraud committed
728
729
        print()
        print("### Select point names")
730
        l = jlist_fused.d["samples"].d["original_names"]
731
        ll = interesting_substrings(l)
Mathieu Giraud's avatar
Mathieu Giraud committed
732
733
        print("  <==", l)
        print("  ==>", ll)
734
        jlist_fused.d["samples"].d["names"] = ll
735
    
Mathieu Giraud's avatar
Mathieu Giraud committed
736
    print()
737
    if not args.multi:
738
        jlist_fused.cut(args.top, jlist_fused.d["samples"].d["number"])
Mathieu Giraud's avatar
Mathieu Giraud committed
739
740
    print("\t", jlist_fused) 
    print()
741

742
    #compute similarity matrix
743
744
745
746
747
748
749
750
751
752
753
754
755
756
    if len(jlist_fused.d["clones"]) < SIMILARITY_LIMIT :
        fasta = ""
        for i in range(len(jlist_fused.d["clones"])) :
            fasta += ">>" + str(i) + "\n"
            fasta += jlist_fused.d["clones"][i].d["id"] + "\n"
        fasta_file = tempfile.NamedTemporaryFile(delete=False)
        fasta_file.write(fasta)
        try:
            out = subprocess.check_output([TOOL_SIMILARITY, "-j", fasta_file.name])
            jlist_fused.d["similarity"] = json.loads(out)
        except OSError:
            print("! failed: %s" % TOOL_SIMILARITY)
        
        
Mathieu Giraud's avatar
Mathieu Giraud committed
757
    print("### Save merged file")
758
    jlist_fused.save_json(args.output)
Mikaël Salson's avatar
Mikaël Salson committed
759
    os.unlink(fasta_file.name)
760
    
761
762
763
764
    
    
if  __name__ =='__main__':
    main()