fuse.py 27.7 KB
Newer Older
1
2
3
4
5
#!/usr/bin/env python
# -*- coding: utf-8 -*-

### fuse.py - Vidjil utility to parse and regroup list of clones of different timepoints or origins

6
7
8
9
10
11
12
13
#  This file is part of Vidjil <http://www.vidjil.org>,
#  High-throughput Analysis of V(D)J Immune Repertoire.
#  Copyright (C) 2011, 2012, 2013, 2014, 2015 by Bonsai bioinformatics 
#  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
#  Contributors: 
#      Marc Duez <marc.duez@vidjil.org>
#      Mathieu Giraud <mathieu.giraud@vidjil.org>
#      The Vidjil Team <contact@vidjil.org>
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#
#  "Vidjil" is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  "Vidjil" is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>

import collections
import json
import argparse
import sys
import time
import copy
34
import os.path
Marc Duez's avatar
Marc Duez committed
35
import datetime
36
37
38
from operator import itemgetter


39
VIDJIL_JSON_VERSION = "2014.10"
40
FUSE_VERSION = "vidjil fuse"
41

42
43
44
45
46
47
GERMLINES_ORDER = ['TRA', 'TRB', 'TRG', 'TRD', 'DD', 'IGH', 'DHJH', 'IJK', 'IJL'] 

def ordered(d, key=None):
    '''sorts a dictionary into an OrderedDict'''
    return collections.OrderedDict([(k, d[k]) for k in sorted(d, key=key)])

48
49
50
51
52
53
54
55
56
57
58
####


def concatenate_with_padding(d, 
                             d1, d1_size, 
                             d2, d2_size,
                             ignore_keys=[]):
        '''Concatenate two dictionaries d1 and d2 into d
        The dictionaries d1 and d2 store several values that are lists with d1_size and d2_size elements,
        and the resulting dictionary will store values that are lists with size d1_size + d2_size elements.
        Pads with lists [0, ... 0] data that appear in either only d1 or only d2.
59
        The values that are not lists are ignored (but this should not happen).
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80

        >>> d = {}
        >>> d1 = { 'a': [1, 2], 'b': [11, 22], 'z':17 }
        >>> d2 = { 'a': [3, 4, 5], 'c': [333, 444, 555] } 
        >>> concatenate_with_padding(d, d1, 2, d2, 5, ['z'])
        >>> d
        {'a': [1, 2, 3, 4, 5], 'c': [0, 0, 333, 444, 555], 'b': [11, 22, 0, 0, 0, 0, 0]}
        '''

        t1=[]
        t2=[]
        
        for i in range(d1_size):
            t1.append(0)
    
        for i in range(d2_size):
            t2.append(0)
    
        for key in d1:
            if key in ignore_keys:
                continue
81
82
83
            if type(d1[key]) is not list:
                continue

84
85
86
87
88
89
90
            d[key] = d1[key]
            if key not in d2 :
                d[key] += t2

        for key in d2:
            if key in ignore_keys:
                continue
91
92
93
            if type(d2[key]) is not list:
                continue

94
95
96
97
98
99
100
            if key not in d :
                d[key] = t1 + d2[key]
            else :
                d[key] = d[key] + d2[key]



101
102
103
####

class Window:
104
    # Should be renamed "Clone"
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
    '''storage class for sequence informations
    with some function to group sequence informations
    
    >>> str(w1)
    '<window : [5] 3 aaa>'
    
    >>> str(w3)
    '<window : [8] 4 aaa>'
    
    >>> str(w1 + w3)
    '<window : [5, 8] 3 aaa>'
    
    
    check if other information are conserved
    
    >>> (w2 + w4).d["test"]
    [0, 'plop']
    
    '''
    
    ### init Window with the minimum data required
    def __init__(self, size):
        self.d={}
128
        self.d["id"] = ""
129
        self.d["top"] = sys.maxint
130
        self.d["reads"] = []
131
        for i in range(size):
132
            self.d["reads"].append(0)
133
134
135
        
        
    ### 
136
137
    def __iadd__(self, other):
        ### Not used now
138
        """Add other.reads to self.reads in-place, without extending lists"""
139

140
141
        assert(len(self.d['reads']) == len(other.d['reads']))
        self.d['reads'] = [my + her for (my, her) in zip(self.d['reads'], other.d['reads'])]
142
143
144

        return self
        
145
    def __add__(self, other):
146
        """Concat two windows, extending lists such as 'reads'"""
147
        #data we don't need to duplicate
148
        myList = [ "seg", "top", "id", "sequence", "name", "id", "stats", "germline"]
149
150
        obj = Window(1)
        
151
152
153
154
        concatenate_with_padding(obj.d,
                                 self.d, len(self.d["reads"]),
                                 other.d, len(other.d["reads"]),
                                 myList)
155
156
157
158
159
160
161
162
163
164
165
166
167
                    
        #keep other data who don't need to be concat
        if other.d["top"] < self.d["top"] :
            for key in other.d :
                if key in myList :
                    obj.d[key] = other.d[key]
        else :
            for key in self.d :
                if key in myList :
                    obj.d[key] = self.d[key]
        
        return obj
        
168
169
170
171
172
    def latex(self, point=0, base=10000):
        reads = self.d["reads"][point]
        ratio = float(reads)/base
        return r"   &   & %7d & %5.2f%% & %-50s \\ %% %s" % (reads, ratio * 100, 
                                                           self.d["name"] if 'name' in self.d else self.d["id"], self.d["id"])
173

174
175
    ### print essential info about Window
    def __str__(self):
176
        return "<window : %s %s %s>" % ( self.d["reads"], '*' if self.d["top"] == sys.maxint else self.d["top"], self.d["id"])
177
        
Marc Duez's avatar
Marc Duez committed
178
class Samples: 
179

Marc Duez's avatar
Marc Duez committed
180
181
    def __init__(self):
        self.d={}
182
        self.d["number"] = 1
Marc Duez's avatar
Marc Duez committed
183
184
185
186
        self.d["original_names"] = [""]

    def __add__(self, other):
        obj=Samples()
187

188
189
190
191
192
        concatenate_with_padding(obj.d, 
                                 self.d, self.d['number'], 
                                 other.d, other.d['number'],
                                 ['number'])

193
        obj.d["number"] =  int(self.d["number"]) + int(other.d["number"])
Marc Duez's avatar
Marc Duez committed
194
195
        
        return obj
196

197
198
199
    def __str__(self):
        return "<Samples: %s>" % self.d

200
201
202
203
204
205
206
207
208
209
class Reads: 

    def __init__(self):
        self.d={}
        self.d["total"] = ["0"]
        self.d["segmented"] = ["0"]
        self.d["germline"] = {}

    def __add__(self, other):
        obj=Reads()
210
211
212
213
214

        concatenate_with_padding(obj.d['germline'], 
                                 self.d['germline'], len(self.d['total']),
                                 other.d['germline'], len(other.d['total']),
                                 ['total'])
215
216
217
218
                
        obj.d["total"] = self.d["total"] + other.d["total"]
        obj.d["segmented"] = self.d["segmented"] + other.d["segmented"]
        return obj
219
220
221

    def __str__(self):
        return "<Reads: %s>" % self.d
222
        
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
class OtherWindows:
    
    """Aggregate counts of windows that are discarded (due to too small 'top') for each point into several 'others-' windows."""

    def __init__(self, length, ranges = [1000, 100, 10, 1]):
        self.ranges = ranges
        self.length = length
        self.sizes = {}

        for r in self.ranges:
            self.sizes[r] = [0 for i in range(self.length)]


    def __iadd__(self, window):
        """Add to self a window. The different points may land into different 'others-' windows."""

239
        for i, s in enumerate(window.d["reads"]):
240
241
242
243
244
            for r in self.ranges:
                if s >= r:
                     break
            self.sizes[r][i] += s
            ## TODO: add seg_stat
245
246
            
        # print window, '-->', self.sizes
247
248
249
250
251
252
253
        return self

    def __iter__(self):
        """Yield the 'others-' windows"""

        for r in self.ranges:
            w = Window(self.length)
254
            w.d['reads'] = self.sizes[r]
255
256
257
258
259
            w.d['window'] = 'others-%d' % r
            w.d['top'] = 0

            print '  --[others]-->', w
            yield w
260
261
262
263
264
        
class ListWindows:
    '''storage class for sequences informations 
    
    >>> lw1.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
265
    <ListWindows: [25] 2>
266
267
268
269
    <window : [5] 3 aaa>
    <window : [12] 2 bbb>
    
    >>> lw2.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
270
    <ListWindows: [34] 2>
271
272
273
    <window : [8] 4 aaa>
    <window : [2] 8 ccc>
    
Mathieu Giraud's avatar
Mathieu Giraud committed
274
    >>> lw3 = lw1 + lw2
275
    >>> lw3.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
276
    <ListWindows: [25, 34] 3>
277
278
279
280
281
282
283
284
285
    <window : [12, 0] 2 bbb>
    <window : [5, 8] 3 aaa>
    <window : [0, 2] 8 ccc>
    
    '''
    
    def __init__(self):
        '''init ListWindows with the minimum data required'''
        self.d={}
286
287
        self.d["samples"] = Samples()
        self.d["reads"] = Reads()
288
289
        self.d["clones"] = []
        self.d["clusters"] = []
290
        self.d["germlines"] = {}
291
        
292
293
294
295
        self.d["vidjil_json_version"] = VIDJIL_JSON_VERSION
        self.d["producer"] = FUSE_VERSION
        self.d["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
296
    def __str__(self):
297
298
299
300
301
302
303
304
305
306
307
308
309
310
        return "<ListWindows: %s %d>" % ( self.d["reads"].d["segmented"], len(self) )

    # Iterator and access functions
    def __iter__(self):
        return self.d["clones"].__iter__()

    def __getitem__(self, i):
        ### not efficient !
        for clone in self:
            if clone.d["id"] == i:
                return clone

    def __len__(self):
        return len(self.d["clones"])
311
312
313
314

    ### print info about each Windows stored 
    def info(self):
        print self
315
316
        for clone in self:
            print clone
317
318
319
320

    ### check vidjil_json_version
    def check_version(self, filepath):
        if "vidjil_json_version" in self.d:
321
            if self.d["vidjil_json_version"] <= VIDJIL_JSON_VERSION:
322
323
324
                return
        raise IOError ("File '%s' is too old -- please regenerate it with a newer version of Vidjil" % filepath)
        
325
326
327
    ### compute statistics about clones
    def build_stat(self):
        ranges = [1000, 100, 10, 1]
328
        result = [[0 for col in range(len(self.d['reads'].d["segmented"]))] for row in range(len(ranges))]
329

330
331
        for clone in self:
            for i, s in enumerate(clone.d["reads"]):
332
333
334
335
336
337
338
339
340
341
342
343
                for r in range(len(ranges)):
                    if s >= ranges[r]:
                        break 
                result[r][i] += s
                
        print result
        for r in range(len(ranges)):
            self.d["reads-distribution-"+str(ranges[r])] = result[r]
            
        #TODO V/D/J distrib and more 
        
        
344
345
346
347
348
349
350
    ### save / load to .json
    def save_json(self, output):
        '''save ListWindows in .json format''' 
        print "==>", output
        with open(output, "w") as file:
            json.dump(self, file, indent=2, default=self.toJson)
            
351
    def load(self, file_path, pipeline, verbose=True):
352
353
354
355
356
        '''init listWindows with data file
        Detects and selects the parser according to the file extension.'''

        # name = file_path.split("/")[-1]
        extension = file_path.split('.')[-1]
357
358
359

        if verbose:
            print "<==", file_path, "\t",
360
        
361
        try:
362
363
364
365
366
367
        
            with open(file_path, "r") as file:
                tmp = json.load(file, object_hook=self.toPython)     
                self.d=tmp.d
                self.check_version(file_path)
        
368
        except Exception: 
369
            self.load_clntab(file_path)
370
        pass
371
        
372
373
374
        if pipeline: 
            # renaming, private pipeline
            f = '/'.join(file_path.split('/')[2:-1])
375
376
            if verbose:
                print "[%s]" % f
377
378
379

        else:
            f = file_path
380
381
            if verbose:
                print
382
        
Marc Duez's avatar
Marc Duez committed
383
        time = os.path.getmtime(file_path)
384
        self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")]
385

Marc Duez's avatar
Marc Duez committed
386
387
388
    def getTop(self, top):
        result = []
        
389
390
391
        for clone in self:
            if clone.d["top"] <= top :
                result.append(clone.d["id"])
Marc Duez's avatar
Marc Duez committed
392
393
394
395
396
        return result
        
    def filter(self, f):
        r = []
        
Marc Duez's avatar
Marc Duez committed
397
        reverseList = {}
398
399
        for i,w in enumerate(self.d["clones"]) :
            reverseList[w.d["id"]] = i
Marc Duez's avatar
Marc Duez committed
400
401
402
        
        #filteredList = { k: reverseList[k] for k in f }
        filteredList = dict(filter(lambda t: t[0] in f, reverseList.items()))
Marc Duez's avatar
Marc Duez committed
403
        
Marc Duez's avatar
Marc Duez committed
404
        for i in filteredList:
405
            r.append(self.d["clones"][filteredList[i]])
Marc Duez's avatar
Marc Duez committed
406
        
407
        self.d["clones"] = r
Marc Duez's avatar
Marc Duez committed
408
        
409
410
411
412
    ### 
    def __add__(self, other): 
        '''Combine two ListWindows into a unique ListWindows'''
        obj = ListWindows()
413
414
        l1 = len(self.d["reads"].d['segmented'])
        l2 = len(other.d["reads"].d['segmented'])
415

416
417
418
        concatenate_with_padding(obj.d, 
                                 self.d, l1,
                                 other.d, l2,
419
420
                                 ["clones", "links", "germlines",
                                  "vidjil_json_version"])
421
        
422
        obj.d["clones"]=self.fuseWindows(self.d["clones"], other.d["clones"], l1, l2)
423
424
        obj.d["samples"] = self.d["samples"] + other.d["samples"]
        obj.d["reads"] = self.d["reads"] + other.d["reads"]
425
426
427
428
429
        obj.d["germlines"] = []
        if "germlines" in self.d: 
            obj.d["germlines"] += self.d["germlines"].items() 
        if "germlines" in other.d: 
            obj.d["germlines"] += other.d["germlines"].items() 
430
        
431
432
        return obj
        
433
434
435
436
437
438
    ###
    def __mul__(self, other):
        
        for i in range(len(self.d["reads_segmented"])):
            self.d["reads_segmented"][i] += other.d["reads_segmented"][i] 
        
439
        self.d["clones"] += other.d["clones"]
440
441
442
443
444
445
        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        
        self.d["system_segmented"].update(other.d["system_segmented"])
        
        return self
        
446
    ### 
447
    def fuseWindows(self, w1, w2, l1, l2) :
448
        #store data in dict with "id" as key
449
450
        dico1 = {}
        for i in range(len(w1)) :
451
            dico1[ w1[i].d["id"] ] = w1[i]
452
453
454

        dico2 = {}
        for i in range(len(w2)) :
455
            dico2[ w2[i].d["id"] ] = w2[i]
456

457
        #concat data with the same key ("id")
458
459
460
461
462
463
        #if some data are missing we concat with an empty Windows()
        dico3 = {}
        for key in dico1 :
            if key in dico2 :
                dico3[key] = dico1[key] + dico2[key]
            else :
464
                w=Window(l2)
465
466
467
468
                dico3[key] = dico1[key] + w

        for key in dico2 :
            if key not in dico1 :
469
                w=Window(l1)
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
                dico3[key] = w + dico2[key]
        
        
        #sort by top
        tab = []
        for key in dico3 :
            tab.append((dico3[key], dico3[key].d["top"]))
        tab = sorted(tab, key=itemgetter(1))
        
        #store back in a List
        result=[]
        for i in range(len(tab)) :
            result.append(tab[i][0])
        
        return result
        
    
487
488
    def cut(self, limit, nb_points):
        '''Remove information from sequence/windows who never enter in the most represented sequences. Put this information in 'other' windows.'''
489

490
        length = len(self)
491
        w=[]
492
493
494

        others = OtherWindows(nb_points)

495
496
497
        for clone in self:
            if (int(clone.d["top"]) < limit or limit == 0) :
                w.append(clone)
498
            #else:
499
                #others += clone
500

501
        self.d["clones"] = w #+ list(others) 
502
503
504
505
506
507
508
509

        print "### Cut merged file, keeping window in the top %d for at least one point" % limit
        return self
        
    def load_clntab(self, file_path):
        '''Parser for .clntab file'''

        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
510
        self.d["samples"].d["original_names"] = [file_path]
511
        self.d["samples"].d["producer"] = ["EC-NGS central pipeline"]
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
        
        listw = []
        listc = []
        total_size = 0
        
        fichier = open(file_path,"r")
        for ligne in fichier:
            if "clonotype" in ligne:
                header_map = ligne.replace('\n', '').split('\t')
            else :
                tab = AccessedDict()
                for index, data in enumerate(ligne.split('\t')):
                    tab[header_map[index]] = data
                        
                w=Window(1)
527
528
                w.d["seg"] = {}
                
529
                #w.window=tab["sequence.seq id"] #use sequence id as window for .clntab data
530
                w.d["id"]=tab["sequence.raw nt seq"] #use sequence as window for .clntab data
531
532
                s = int(tab["sequence.size"])
                total_size += s
533
                w.d["reads"] = [ s ]
534

535
536
                w.d["germline"] = tab["sequence.V-GENE and allele"][:3] #system ...
                
537
                w.d["sequence"] = tab["sequence.raw nt seq"]
538
                w.d["seg"]["5"]=tab["sequence.V-GENE and allele"].split('=')[0]
539
                if (tab["sequence.D-GENE and allele"] != "") :
540
541
                    w.d["seg"]["4"]=tab["sequence.D-GENE and allele"].split('=')[0]
                w.d["seg"]["3"]=tab["sequence.J-GENE and allele"].split('=')[0]
542
543
544
545

                # use sequence.JUNCTION to colorize output (this is not exactly V/J !)
                junction = tab.get("sequence.JUNCTION.raw nt seq")
                position = w.d["sequence"].find(junction)
546
                
547
                if position >= 0:
548
549
                    w.d["seg"]["3start"] = position + len(junction)
                    w.d["seg"]["5end"] = position 
550
                else:
551
552
553
554
555
556
                    w.d["seg"]["3start"] = 0
                    w.d["seg"]["5end"] = len(w.d["sequence"])
                w.d["seg"]["3end"]=0
                w.d["seg"]["5start"]=0
                w.d["seg"]["4end"]=0
                w.d["seg"]["4start"]=0
557
                w.d["seg"]["cdr3"] = tab["sequence.JUNCTION.raw nt seq"]
558
                    
559
                listw.append((w , w.d["reads"][0]))
560
561

                raw_clonotype = tab.get("clonotype")
562
563
                w.d["name"]=raw_clonotype # w.d["seg"]["5"] + " -x/y/-z " + w.d["seg"]["3"]

564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
                clonotype = raw_clonotype.split(' ')
                if (len(clonotype) > 1) :
                    listc.append((w, raw_clonotype))
                
                #keep data that has not already been stored
                for header in tab.not_accessed_keys():
                    w.d["_"+header] = [tab[header]]

        #sort by sequence_size
        listw = sorted(listw, key=itemgetter(1), reverse=True)
        #sort by clonotype
        listc = sorted(listc, key=itemgetter(1))
        
        #generate data "top"
        for index in range(len(listw)):
            listw[index][0].d["top"]=index+1
580
            self.d["clones"].append(listw[index][0])
581
        
582
583
        self.d["reads"].d["segmented"] = [total_size]
        self.d["reads"].d["total"] = [total_size]
584
585
586
587

        
    def toJson(self, obj):
        '''Serializer for json module'''
588
        if isinstance(obj, ListWindows) or isinstance(obj, Window) or isinstance(obj, Samples) or isinstance(obj, Reads):
589
590
591
            result = {}
            for key in obj.d :
                result[key]= obj.d[key]
Marc Duez's avatar
Marc Duez committed
592
                
593
594
            return result
            raise TypeError(repr(obj) + " fail !") 
595
        
Marc Duez's avatar
Marc Duez committed
596
        else:
597
598
599
600
601
602
            result = {}
            for key in obj :
                result[key]= obj[key]
            
            return result
            raise TypeError(repr(obj) + " fail !") 
603
604
605

    def toPython(self, obj_dict):
        '''Reverse serializer for json module'''
606
        if "samples" in obj_dict:
607
            obj = ListWindows()
608
            obj.d=obj_dict
609
610
            return obj

611
        if "id" in obj_dict:
612
            obj = Window(1)
613
            obj.d=obj_dict
614
            return obj
Marc Duez's avatar
Marc Duez committed
615
        
616
617
        if "total" in obj_dict:
            obj = Reads()
618
            obj.d=obj_dict
619
620
            return obj
            
Marc Duez's avatar
Marc Duez committed
621
622
        if "original_names" in obj_dict:
            obj = Samples()
623
            obj.d=obj_dict
624
            return obj
Marc Duez's avatar
Marc Duez committed
625
            
626
        return obj_dict
627
628
629
630
631
632
633
634
635
636
637
638
639
640
        


######
def common_substring(l):
    '''Return the longest common substring among the strings in the list
    >>> common_substring(['abcdfffff', 'ghhhhhhhhhbcd'])
    'bcd'
    >>> common_substring(['abcdfffff', 'ghhhhhhhhh'])
    ''
    >>> common_substring(['b-abc-123', 'tuvwxyz-abc-321', 'tuvwxyz-abc-456', 'd-abc-789'])
    '-abc-'
    '''

641
642
643
644
    table = []
    for s in l:
        # adds in table all substrings of s - duplicate substrings in s are added only once
        table += set(s[j:k] for j in range(len(s)) for k in range(j+1, len(s)+1))
645
646
647
648
649
650
651
652
653
654

    # sort substrings by length (descending)
    table = sorted(table, cmp=lambda x,y: cmp(len(y), len(x)))
    # get the position of duplicates and get the first one (longest)
    duplicates=[i for i, x in enumerate(table) if table.count(x) == len(l)]
    if len(duplicates) > 0:
        return table[duplicates[0]]
    else:
        return ""

655
def interesting_substrings(l, target_length=6, substring_replacement='-'):
656
657
    '''Return a list with intersting substrings.
    Now it removes common prefixes and suffixes, and then the longest 
Marc Duez's avatar
merge    
Marc Duez committed
658
659
    common substring. 
    But it stops removing once all lengths are at most 'target_length'.
660

Marc Duez's avatar
merge    
Marc Duez committed
661
    >>> interesting_substrings(['ec-3--bla', 'ec-512-bla', 'ec-47-bla'], target_length=0)
662
    ['3-', '512', '47']
663
    >>> interesting_substrings(['ec-A-seq-1-bla', 'ec-B-seq-512-bla', 'ec-C-seq-21-bla'], target_length=0, substring_replacement='')
664
    ['A1', 'B512', 'C21']
665
666
    >>> interesting_substrings(['ec-A-seq-1-bla', 'ec-B-seq-512-bla', 'ec-C-seq-21-bla'], target_length=0)
    ['A-1', 'B-512', 'C-21']
Marc Duez's avatar
merge    
Marc Duez committed
667
668
    >>> interesting_substrings(['ec-A-seq-1-bla', 'ec-B-seq-512-bla', 'ec-C-seq-21-bla'], target_length=9)
    ['A-seq-1', 'B-seq-512', 'C-seq-21']
669
670
671
672
673
    '''

    if not l:
        return {}

Marc Duez's avatar
merge    
Marc Duez committed
674
675
676
    if max(map (len, l)) <= target_length:
        return l

677
678
    min_length = min(map (len, l))

Marc Duez's avatar
merge    
Marc Duez committed
679
680
    ### Remove prefixes

681
682
683
684
685
686
687
    common_prefix = 0
    for i in range(min_length):
        if all(map(lambda x: x[i] == l[0][i], l)):
            common_prefix = i+1
        else:
            break

Marc Duez's avatar
merge    
Marc Duez committed
688
689
690
691
692
693
694
    substrings = [x[common_prefix:] for x in l]

    if max(map (len, substrings)) <= target_length:
        return substrings

    ### Remove suffixes

695
696
697
698
699
700
    common_suffix = 0
    for i in range(min_length - common_prefix):
        if all(map(lambda x: x[-(i+1)] == l[0][-(i+1)], l)):
            common_suffix = i
        else:
            break
Marc Duez's avatar
merge    
Marc Duez committed
701
702
703
704
705
706
707

    substrings = [x[common_prefix:-(common_suffix+1)] for x in l]            

    if max(map (len, substrings)) <= target_length:
        return substrings

    ### Remove the longest common substring
WebTogz's avatar
WebTogz committed
708
709
    
    #Have to replace '' by '_' if the removal have place between 2 substrings 
Marc Duez's avatar
merge    
Marc Duez committed
710

711
    common = common_substring(substrings)
712
713
    if common:
        substrings = [s.replace(common, substring_replacement) for s in substrings]
Marc Duez's avatar
merge    
Marc Duez committed
714

715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
    return substrings
    
    # ### Build dict
    # substrings = {}
    # for x in l:
    #     substrings[x] = x[common_prefix:-(common_suffix+1)]
    # return substrings

 
class AccessedDict(dict):
    '''Dictionary providing a .not_accessed_keys() method
    Note that access with .get(key) are not tracked.

    >>> d = AccessedDict({1:11, 2:22, 3: 33, 4: 44})

    >>> d[1], d[3]
    (11, 33)

    >>> list(d.not_accessed_keys())
    [2, 4]
    '''

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        self.accessed_keys = []

    def __getitem__(self, key):
        self.accessed_keys.append(key)
        return dict.__getitem__(self, key)

    def not_accessed_keys(self):
        for key in self.keys():
            if key in self.accessed_keys:
                continue
            yield key





### some data used for test
w1 = Window(1)
757
w1.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
758
w2 = Window(1)
759
w2.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
760
w3 = Window(1)
761
w3.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
762
w4 = Window(1)
763
w4.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
764
765
766


w5 = Window(1)
767
w5.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
768
w6 = Window(1)
769
w6.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
770
771

lw1 = ListWindows()
772
lw1.d["timestamp"] = 'ts'
Mathieu Giraud's avatar
Mathieu Giraud committed
773
lw1.d["reads"] = json.loads('{"total": [30], "segmented": [25], "germline": {} }', object_hook=lw1.toPython)
774
775
lw1.d["clones"].append(w5)
lw1.d["clones"].append(w6)
776
777

w7 = Window(1)
778
w7.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
779
w8 = Window(1)
780
w8.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
781
782

lw2 = ListWindows()
783
lw2.d["timestamp"] = 'ts'
Mathieu Giraud's avatar
Mathieu Giraud committed
784
lw2.d["reads"] = json.loads('{"total": [40], "segmented": [34], "germline": {} }', object_hook=lw1.toPython)
785
786
lw2.d["clones"].append(w7)
lw2.d["clones"].append(w8)
787
788
789
790
791
792
793
794
795
796
797
798
799
800

    
    

 
def main():
    print "#", ' '.join(sys.argv)

    DESCRIPTION = 'Vidjil utility to parse and regroup list of clones of different timepoints or origins'
    
    #### Argument parser (argparse)

    parser = argparse.ArgumentParser(description= DESCRIPTION,
                                    epilog='''Example:
Mathieu Giraud's avatar
Mathieu Giraud committed
801
  python2 %(prog)s --germline IGH out/Diag/vidjil.data out/MRD-1/vidjil.data out/MRD-2/vidjil.data''',
802
803
804
805
806
807
                                    formatter_class=argparse.RawTextHelpFormatter)


    group_options = parser.add_argument_group() # title='Options and parameters')

    group_options.add_argument('--test', action='store_true', help='run self-tests')
808
    group_options.add_argument('--multi', action='store_true', help='merge different systems from a same timepoint (deprecated, do not use)')
809
    
810
    group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
811
    group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')
812

Marc Duez's avatar
Marc Duez committed
813
    group_options.add_argument('--output', '-o', type=str, default='fused.vidjil', help='output file (%(default)s)')
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
    group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)')

    parser.add_argument('file', nargs='+', help='''input files (.vidjil/.cnltab)''')
  
    args = parser.parse_args()
    # print args

    if args.test:
        import doctest
        doctest.testmod(verbose = True)
        sys.exit(0)

    jlist_fused = None

    print "### fuse.py -- " + DESCRIPTION
    print

Marc Duez's avatar
Marc Duez committed
831
832
833
834
835
836
837
838
    #filtre
    f = []
    for path_name in args.file:
        jlist = ListWindows()
        jlist.load(path_name, args.pipeline)
        f += jlist.getTop(args.top)
    f = sorted(set(f))
    
839
    if args.multi:
840
841
        for path_name in args.file:
            jlist = ListWindows()
842
            jlist.load(path_name, args.pipeline)
Marc Duez's avatar
Marc Duez committed
843
            jlist.build_stat()
844
845
846
847
848
849
850
            
            print "\t", jlist,

            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused * jlist
851
                
852
            print '\t==> merge to', jlist_fused
853
        jlist_fused.d["system_segmented"] = ordered(jlist_fused.d["system_segmented"], key=lambda sys: ((GERMLINES_ORDER + [sys]).index(sys), sys))
854
        
855
856
857
858
    else:
        print "### Read and merge input files"
        for path_name in args.file:
            jlist = ListWindows()
859
            jlist.load(path_name, args.pipeline)
Marc Duez's avatar
Marc Duez committed
860
861
            jlist.build_stat()
            jlist.filter(f)
Marc Duez's avatar
Marc Duez committed
862

863
864
865
866
867
868
869
870
871
872
            w1 = Window(1)
            w2 = Window(2)
            w3 = w1+w2
            
            print "\t", jlist,
            # Merge lists
            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused + jlist
873
            
874
            print '\t==> merge to', jlist_fused
875

876
877
    if args.compress:
        print
878
879
        print "### Select point names"
        l = jlist_fused.d["samples"].d["original_names"]
880
881
882
        ll = interesting_substrings(l)
        print "  <==", l
        print "  ==>", ll
883
        jlist_fused.d["samples"].d["names"] = ll
884
885
    
    print
886
    if not args.multi:
887
        jlist_fused.cut(args.top, jlist_fused.d["samples"].d["number"])
888
889
890
891
892
893
894
895
896
897
    print "\t", jlist_fused 
    print

    print "### Save merged file"
    jlist_fused.save_json(args.output)

    
    
if  __name__ =='__main__':
    main()