fuse.py 27.1 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#!/usr/bin/env python
# -*- coding: utf-8 -*-

### fuse.py - Vidjil utility to parse and regroup list of clones of different timepoints or origins

#  This file is part of "Vidjil" <http://bioinfo.lifl.fr/vidjil>
#  Copyright (C) 2011, 2012, 2013, 2014 by Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille
#
#  "Vidjil" is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  "Vidjil" is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>

import collections
import json
import argparse
import sys
import time
import copy
28
import os.path
Marc Duez's avatar
Marc Duez committed
29
import datetime
30
31
32
from operator import itemgetter


33
VIDJIL_JSON_VERSION = "2014.10"
34

35
36
37
38
39
40
GERMLINES_ORDER = ['TRA', 'TRB', 'TRG', 'TRD', 'DD', 'IGH', 'DHJH', 'IJK', 'IJL'] 

def ordered(d, key=None):
    '''sorts a dictionary into an OrderedDict'''
    return collections.OrderedDict([(k, d[k]) for k in sorted(d, key=key)])

41
42
43
44
45
46
47
48
49
50
51
####


def concatenate_with_padding(d, 
                             d1, d1_size, 
                             d2, d2_size,
                             ignore_keys=[]):
        '''Concatenate two dictionaries d1 and d2 into d
        The dictionaries d1 and d2 store several values that are lists with d1_size and d2_size elements,
        and the resulting dictionary will store values that are lists with size d1_size + d2_size elements.
        Pads with lists [0, ... 0] data that appear in either only d1 or only d2.
52
        The values that are not lists are ignored (but this should not happen).
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73

        >>> d = {}
        >>> d1 = { 'a': [1, 2], 'b': [11, 22], 'z':17 }
        >>> d2 = { 'a': [3, 4, 5], 'c': [333, 444, 555] } 
        >>> concatenate_with_padding(d, d1, 2, d2, 5, ['z'])
        >>> d
        {'a': [1, 2, 3, 4, 5], 'c': [0, 0, 333, 444, 555], 'b': [11, 22, 0, 0, 0, 0, 0]}
        '''

        t1=[]
        t2=[]
        
        for i in range(d1_size):
            t1.append(0)
    
        for i in range(d2_size):
            t2.append(0)
    
        for key in d1:
            if key in ignore_keys:
                continue
74
75
76
            if type(d1[key]) is not list:
                continue

77
78
79
80
81
82
83
            d[key] = d1[key]
            if key not in d2 :
                d[key] += t2

        for key in d2:
            if key in ignore_keys:
                continue
84
85
86
            if type(d2[key]) is not list:
                continue

87
88
89
90
91
92
93
            if key not in d :
                d[key] = t1 + d2[key]
            else :
                d[key] = d[key] + d2[key]



94
95
96
####

class Window:
97
    # Should be renamed "Clone"
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
    '''storage class for sequence informations
    with some function to group sequence informations
    
    >>> str(w1)
    '<window : [5] 3 aaa>'
    
    >>> str(w3)
    '<window : [8] 4 aaa>'
    
    >>> str(w1 + w3)
    '<window : [5, 8] 3 aaa>'
    
    
    check if other information are conserved
    
    >>> (w2 + w4).d["test"]
    [0, 'plop']
    
    '''
    
    ### init Window with the minimum data required
    def __init__(self, size):
        self.d={}
121
        self.d["id"] = ""
122
        self.d["top"] = sys.maxint
123
        self.d["reads"] = []
124
        for i in range(size):
125
            self.d["reads"].append(0)
126
127
128
        
        
    ### 
129
130
    def __iadd__(self, other):
        ### Not used now
131
        """Add other.reads to self.reads in-place, without extending lists"""
132

133
134
        assert(len(self.d['reads']) == len(other.d['reads']))
        self.d['reads'] = [my + her for (my, her) in zip(self.d['reads'], other.d['reads'])]
135
136
137

        return self
        
138
    def __add__(self, other):
139
        """Concat two windows, extending lists such as 'reads'"""
140
        #data we don't need to duplicate
141
        myList = [ "seg", "top", "id", "sequence", "name", "id", "stats", "germline"]
142
143
        obj = Window(1)
        
144
145
146
147
        concatenate_with_padding(obj.d,
                                 self.d, len(self.d["reads"]),
                                 other.d, len(other.d["reads"]),
                                 myList)
148
149
150
151
152
153
154
155
156
157
158
159
160
                    
        #keep other data who don't need to be concat
        if other.d["top"] < self.d["top"] :
            for key in other.d :
                if key in myList :
                    obj.d[key] = other.d[key]
        else :
            for key in self.d :
                if key in myList :
                    obj.d[key] = self.d[key]
        
        return obj
        
161
    def latex(self, point=0):
162
        return r"   &   & %7d & %-50s \\ %% %s" % (self.d["reads"][0], self.d["name"] if 'name' in self.d else self.d["id"], self.d["id"])
163

164
165
    ### print essential info about Window
    def __str__(self):
166
        return "<window : %s %s %s>" % ( self.d["reads"], '*' if self.d["top"] == sys.maxint else self.d["top"], self.d["id"])
167
        
Marc Duez's avatar
Marc Duez committed
168
class Samples: 
169

Marc Duez's avatar
Marc Duez committed
170
171
    def __init__(self):
        self.d={}
172
        self.d["number"] = 1
Marc Duez's avatar
Marc Duez committed
173
174
175
176
        self.d["original_names"] = [""]

    def __add__(self, other):
        obj=Samples()
177

178
179
180
181
182
        concatenate_with_padding(obj.d, 
                                 self.d, self.d['number'], 
                                 other.d, other.d['number'],
                                 ['number'])

183
        obj.d["number"] =  int(self.d["number"]) + int(other.d["number"])
Marc Duez's avatar
Marc Duez committed
184
185
        
        return obj
186

187
188
189
    def __str__(self):
        return "<Samples: %s>" % self.d

190
191
192
193
194
195
196
197
198
199
class Reads: 

    def __init__(self):
        self.d={}
        self.d["total"] = ["0"]
        self.d["segmented"] = ["0"]
        self.d["germline"] = {}

    def __add__(self, other):
        obj=Reads()
200
201
202
203
204

        concatenate_with_padding(obj.d['germline'], 
                                 self.d['germline'], len(self.d['total']),
                                 other.d['germline'], len(other.d['total']),
                                 ['total'])
205
206
207
208
                
        obj.d["total"] = self.d["total"] + other.d["total"]
        obj.d["segmented"] = self.d["segmented"] + other.d["segmented"]
        return obj
209
210
211

    def __str__(self):
        return "<Reads: %s>" % self.d
212
        
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
class OtherWindows:
    
    """Aggregate counts of windows that are discarded (due to too small 'top') for each point into several 'others-' windows."""

    def __init__(self, length, ranges = [1000, 100, 10, 1]):
        self.ranges = ranges
        self.length = length
        self.sizes = {}

        for r in self.ranges:
            self.sizes[r] = [0 for i in range(self.length)]


    def __iadd__(self, window):
        """Add to self a window. The different points may land into different 'others-' windows."""

229
        for i, s in enumerate(window.d["reads"]):
230
231
232
233
234
            for r in self.ranges:
                if s >= r:
                     break
            self.sizes[r][i] += s
            ## TODO: add seg_stat
235
236
            
        # print window, '-->', self.sizes
237
238
239
240
241
242
243
        return self

    def __iter__(self):
        """Yield the 'others-' windows"""

        for r in self.ranges:
            w = Window(self.length)
244
            w.d['reads'] = self.sizes[r]
245
246
247
248
249
            w.d['window'] = 'others-%d' % r
            w.d['top'] = 0

            print '  --[others]-->', w
            yield w
250
251
252
253
254
        
class ListWindows:
    '''storage class for sequences informations 
    
    >>> lw1.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
255
    <ListWindows: [25] 2>
256
257
258
259
    <window : [5] 3 aaa>
    <window : [12] 2 bbb>
    
    >>> lw2.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
260
    <ListWindows: [34] 2>
261
262
263
    <window : [8] 4 aaa>
    <window : [2] 8 ccc>
    
Mathieu Giraud's avatar
Mathieu Giraud committed
264
    >>> lw3 = lw1 + lw2
265
    >>> lw3.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
266
    <ListWindows: [25, 34] 3>
267
268
269
270
271
272
273
274
275
    <window : [12, 0] 2 bbb>
    <window : [5, 8] 3 aaa>
    <window : [0, 2] 8 ccc>
    
    '''
    
    def __init__(self):
        '''init ListWindows with the minimum data required'''
        self.d={}
276
277
        self.d["samples"] = Samples()
        self.d["reads"] = Reads()
278
279
        self.d["clones"] = []
        self.d["clusters"] = []
280
        self.d["germlines"] = {}
281
282
        
    def __str__(self):
283
284
285
286
287
288
289
290
291
292
293
294
295
296
        return "<ListWindows: %s %d>" % ( self.d["reads"].d["segmented"], len(self) )

    # Iterator and access functions
    def __iter__(self):
        return self.d["clones"].__iter__()

    def __getitem__(self, i):
        ### not efficient !
        for clone in self:
            if clone.d["id"] == i:
                return clone

    def __len__(self):
        return len(self.d["clones"])
297
298
299
300

    ### print info about each Windows stored 
    def info(self):
        print self
301
302
        for clone in self:
            print clone
303
304
305
306

    ### check vidjil_json_version
    def check_version(self, filepath):
        if "vidjil_json_version" in self.d:
307
            if self.d["vidjil_json_version"] <= VIDJIL_JSON_VERSION:
308
309
310
                return
        raise IOError ("File '%s' is too old -- please regenerate it with a newer version of Vidjil" % filepath)
        
311
312
313
    ### compute statistics about clones
    def build_stat(self):
        ranges = [1000, 100, 10, 1]
314
        result = [[0 for col in range(len(self.d['reads'].d["segmented"]))] for row in range(len(ranges))]
315

316
317
        for clone in self:
            for i, s in enumerate(clone.d["reads"]):
318
319
320
321
322
323
324
325
326
327
328
329
                for r in range(len(ranges)):
                    if s >= ranges[r]:
                        break 
                result[r][i] += s
                
        print result
        for r in range(len(ranges)):
            self.d["reads-distribution-"+str(ranges[r])] = result[r]
            
        #TODO V/D/J distrib and more 
        
        
330
331
332
333
334
335
336
    ### save / load to .json
    def save_json(self, output):
        '''save ListWindows in .json format''' 
        print "==>", output
        with open(output, "w") as file:
            json.dump(self, file, indent=2, default=self.toJson)
            
337
    def load(self, file_path, pipeline, verbose=True):
338
339
340
341
342
        '''init listWindows with data file
        Detects and selects the parser according to the file extension.'''

        # name = file_path.split("/")[-1]
        extension = file_path.split('.')[-1]
343
344
345

        if verbose:
            print "<==", file_path, "\t",
346
        
347
        try:
348
349
350
351
352
353
        
            with open(file_path, "r") as file:
                tmp = json.load(file, object_hook=self.toPython)     
                self.d=tmp.d
                self.check_version(file_path)
        
354
        except Exception: 
355
            self.load_clntab(file_path)
356
        pass
357
        
358
359
360
        if pipeline: 
            # renaming, private pipeline
            f = '/'.join(file_path.split('/')[2:-1])
361
362
            if verbose:
                print "[%s]" % f
363
364
365

        else:
            f = file_path
366
367
            if verbose:
                print
368
        
Marc Duez's avatar
Marc Duez committed
369
370
        time = os.path.getmtime(file_path)
        self.d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")]
371

Marc Duez's avatar
Marc Duez committed
372
373
374
    def getTop(self, top):
        result = []
        
375
376
377
        for clone in self:
            if clone.d["top"] <= top :
                result.append(clone.d["id"])
Marc Duez's avatar
Marc Duez committed
378
379
380
381
382
        return result
        
    def filter(self, f):
        r = []
        
Marc Duez's avatar
Marc Duez committed
383
        reverseList = {}
384
385
        for i,w in enumerate(self.d["clones"]) :
            reverseList[w.d["id"]] = i
Marc Duez's avatar
Marc Duez committed
386
387
388
        
        #filteredList = { k: reverseList[k] for k in f }
        filteredList = dict(filter(lambda t: t[0] in f, reverseList.items()))
Marc Duez's avatar
Marc Duez committed
389
        
Marc Duez's avatar
Marc Duez committed
390
        for i in filteredList:
391
            r.append(self.d["clones"][filteredList[i]])
Marc Duez's avatar
Marc Duez committed
392
        
393
        self.d["clones"] = r
Marc Duez's avatar
Marc Duez committed
394
        
395
396
397
398
    ### 
    def __add__(self, other): 
        '''Combine two ListWindows into a unique ListWindows'''
        obj = ListWindows()
399
400
        l1 = len(self.d["reads"].d['segmented'])
        l2 = len(other.d["reads"].d['segmented'])
401

402
403
404
        concatenate_with_padding(obj.d, 
                                 self.d, l1,
                                 other.d, l2,
405
406
                                 ["clones", "links", "germlines",
                                  "vidjil_json_version"])
407
        
408
        obj.d["clones"]=self.fuseWindows(self.d["clones"], other.d["clones"], l1, l2)
409
410
        obj.d["samples"] = self.d["samples"] + other.d["samples"]
        obj.d["reads"] = self.d["reads"] + other.d["reads"]
Marc Duez's avatar
Marc Duez committed
411
        obj.d["timestamp"] = self.d["timestamp"]
412
        obj.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
413
414
        obj.d["germlines"] = dict(self.d["germlines"].items() + other.d["germlines"].items())
        
415
416
        return obj
        
417
418
419
420
421
422
    ###
    def __mul__(self, other):
        
        for i in range(len(self.d["reads_segmented"])):
            self.d["reads_segmented"][i] += other.d["reads_segmented"][i] 
        
423
        self.d["clones"] += other.d["clones"]
424
425
426
427
428
429
        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        
        self.d["system_segmented"].update(other.d["system_segmented"])
        
        return self
        
430
    ### 
431
    def fuseWindows(self, w1, w2, l1, l2) :
432
        #store data in dict with "id" as key
433
434
        dico1 = {}
        for i in range(len(w1)) :
435
            dico1[ w1[i].d["id"] ] = w1[i]
436
437
438

        dico2 = {}
        for i in range(len(w2)) :
439
            dico2[ w2[i].d["id"] ] = w2[i]
440

441
        #concat data with the same key ("id")
442
443
444
445
446
447
        #if some data are missing we concat with an empty Windows()
        dico3 = {}
        for key in dico1 :
            if key in dico2 :
                dico3[key] = dico1[key] + dico2[key]
            else :
448
                w=Window(l2)
449
450
451
452
                dico3[key] = dico1[key] + w

        for key in dico2 :
            if key not in dico1 :
453
                w=Window(l1)
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
                dico3[key] = w + dico2[key]
        
        
        #sort by top
        tab = []
        for key in dico3 :
            tab.append((dico3[key], dico3[key].d["top"]))
        tab = sorted(tab, key=itemgetter(1))
        
        #store back in a List
        result=[]
        for i in range(len(tab)) :
            result.append(tab[i][0])
        
        return result
        
    
471
472
    def cut(self, limit, nb_points):
        '''Remove information from sequence/windows who never enter in the most represented sequences. Put this information in 'other' windows.'''
473

474
        length = len(self)
475
        w=[]
476
477
478

        others = OtherWindows(nb_points)

479
480
481
        for clone in self:
            if (int(clone.d["top"]) < limit or limit == 0) :
                w.append(clone)
482
            #else:
483
                #others += clone
484

485
        self.d["clones"] = w #+ list(others) 
486
487
488
489
490
491
492
493

        print "### Cut merged file, keeping window in the top %d for at least one point" % limit
        return self
        
    def load_clntab(self, file_path):
        '''Parser for .clntab file'''

        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
494
        self.d["samples"].d["original_names"] = [file_path]
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
        
        listw = []
        listc = []
        total_size = 0
        
        fichier = open(file_path,"r")
        for ligne in fichier:
            if "clonotype" in ligne:
                header_map = ligne.replace('\n', '').split('\t')
            else :
                tab = AccessedDict()
                for index, data in enumerate(ligne.split('\t')):
                    tab[header_map[index]] = data
                        
                w=Window(1)
510
511
                w.d["seg"] = {}
                
512
                #w.window=tab["sequence.seq id"] #use sequence id as window for .clntab data
513
                w.d["id"]=tab["sequence.raw nt seq"] #use sequence as window for .clntab data
514
515
                s = int(tab["sequence.size"])
                total_size += s
516
                w.d["reads"] = [ s ]
517

518
519
                w.d["germline"] = tab["sequence.V-GENE and allele"][:3] #system ...
                
520
                w.d["sequence"] = tab["sequence.raw nt seq"]
521
                w.d["seg"]["5"]=tab["sequence.V-GENE and allele"].split('=')[0]
522
                if (tab["sequence.D-GENE and allele"] != "") :
523
524
                    w.d["seg"]["4"]=tab["sequence.D-GENE and allele"].split('=')[0]
                w.d["seg"]["3"]=tab["sequence.J-GENE and allele"].split('=')[0]
525
526
527
528

                # use sequence.JUNCTION to colorize output (this is not exactly V/J !)
                junction = tab.get("sequence.JUNCTION.raw nt seq")
                position = w.d["sequence"].find(junction)
529
                
530
                if position >= 0:
531
532
                    w.d["seg"]["3start"] = position + len(junction)
                    w.d["seg"]["5end"] = position 
533
                else:
534
535
536
537
538
539
540
                    w.d["seg"]["3start"] = 0
                    w.d["seg"]["5end"] = len(w.d["sequence"])
                w.d["name"]=w.d["seg"]["5"] + " -x/y/-z " + w.d["seg"]["3"]
                w.d["seg"]["3end"]=0
                w.d["seg"]["5start"]=0
                w.d["seg"]["4end"]=0
                w.d["seg"]["4start"]=0
541
                    
542
                listw.append((w , w.d["reads"][0]))
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560

                raw_clonotype = tab.get("clonotype")
                clonotype = raw_clonotype.split(' ')
                if (len(clonotype) > 1) :
                    listc.append((w, raw_clonotype))
                
                #keep data that has not already been stored
                for header in tab.not_accessed_keys():
                    w.d["_"+header] = [tab[header]]

        #sort by sequence_size
        listw = sorted(listw, key=itemgetter(1), reverse=True)
        #sort by clonotype
        listc = sorted(listc, key=itemgetter(1))
        
        #generate data "top"
        for index in range(len(listw)):
            listw[index][0].d["top"]=index+1
561
            self.d["clones"].append(listw[index][0])
562
        
563
564
        self.d["reads"].d["segmented"] = [total_size]
        self.d["reads"].d["total"] = [total_size]
565
566
567
568

        
    def toJson(self, obj):
        '''Serializer for json module'''
569
        if isinstance(obj, ListWindows) or isinstance(obj, Window) or isinstance(obj, Samples) or isinstance(obj, Reads):
570
571
572
            result = {}
            for key in obj.d :
                result[key]= obj.d[key]
Marc Duez's avatar
Marc Duez committed
573
                
574
575
            return result
            raise TypeError(repr(obj) + " fail !") 
576
        
Marc Duez's avatar
Marc Duez committed
577
        else:
578
579
580
581
582
583
            result = {}
            for key in obj :
                result[key]= obj[key]
            
            return result
            raise TypeError(repr(obj) + " fail !") 
584
585
586

    def toPython(self, obj_dict):
        '''Reverse serializer for json module'''
587
        if "samples" in obj_dict:
588
589
            obj = ListWindows()
            for key in obj_dict :
590
                obj.d[key]=obj_dict[key]
591
592
            return obj

593
        if "id" in obj_dict:
594
595
            obj = Window(1)
            for key in obj_dict :
596
                obj.d[key]=obj_dict[key]
597
            return obj
Marc Duez's avatar
Marc Duez committed
598
        
599
600
601
602
603
604
        if "total" in obj_dict:
            obj = Reads()
            for key in obj_dict :
                obj.d[key]=obj_dict[key]
            return obj
            
Marc Duez's avatar
Marc Duez committed
605
606
        if "original_names" in obj_dict:
            obj = Samples()
607
            for key in obj_dict :
Marc Duez's avatar
Marc Duez committed
608
                obj.d[key]=obj_dict[key]
609
            return obj
Marc Duez's avatar
Marc Duez committed
610
611
612
613
614
            
        res = {}
        for key in obj_dict :
            res[key]=obj_dict[key]
        return res
615
616
617
618
619
620
621
622
623
624
625
626
627
628
        


######
def common_substring(l):
    '''Return the longest common substring among the strings in the list
    >>> common_substring(['abcdfffff', 'ghhhhhhhhhbcd'])
    'bcd'
    >>> common_substring(['abcdfffff', 'ghhhhhhhhh'])
    ''
    >>> common_substring(['b-abc-123', 'tuvwxyz-abc-321', 'tuvwxyz-abc-456', 'd-abc-789'])
    '-abc-'
    '''

629
630
631
632
    table = []
    for s in l:
        # adds in table all substrings of s - duplicate substrings in s are added only once
        table += set(s[j:k] for j in range(len(s)) for k in range(j+1, len(s)+1))
633
634
635
636
637
638
639
640
641
642

    # sort substrings by length (descending)
    table = sorted(table, cmp=lambda x,y: cmp(len(y), len(x)))
    # get the position of duplicates and get the first one (longest)
    duplicates=[i for i, x in enumerate(table) if table.count(x) == len(l)]
    if len(duplicates) > 0:
        return table[duplicates[0]]
    else:
        return ""

643
def interesting_substrings(l, target_length=6, substring_replacement='-'):
644
645
    '''Return a list with intersting substrings.
    Now it removes common prefixes and suffixes, and then the longest 
Marc Duez's avatar
merge    
Marc Duez committed
646
647
    common substring. 
    But it stops removing once all lengths are at most 'target_length'.
648

Marc Duez's avatar
merge    
Marc Duez committed
649
    >>> interesting_substrings(['ec-3--bla', 'ec-512-bla', 'ec-47-bla'], target_length=0)
650
    ['3-', '512', '47']
651
    >>> interesting_substrings(['ec-A-seq-1-bla', 'ec-B-seq-512-bla', 'ec-C-seq-21-bla'], target_length=0, substring_replacement='')
652
    ['A1', 'B512', 'C21']
653
654
    >>> interesting_substrings(['ec-A-seq-1-bla', 'ec-B-seq-512-bla', 'ec-C-seq-21-bla'], target_length=0)
    ['A-1', 'B-512', 'C-21']
Marc Duez's avatar
merge    
Marc Duez committed
655
656
    >>> interesting_substrings(['ec-A-seq-1-bla', 'ec-B-seq-512-bla', 'ec-C-seq-21-bla'], target_length=9)
    ['A-seq-1', 'B-seq-512', 'C-seq-21']
657
658
659
660
661
    '''

    if not l:
        return {}

Marc Duez's avatar
merge    
Marc Duez committed
662
663
664
    if max(map (len, l)) <= target_length:
        return l

665
666
    min_length = min(map (len, l))

Marc Duez's avatar
merge    
Marc Duez committed
667
668
    ### Remove prefixes

669
670
671
672
673
674
675
    common_prefix = 0
    for i in range(min_length):
        if all(map(lambda x: x[i] == l[0][i], l)):
            common_prefix = i+1
        else:
            break

Marc Duez's avatar
merge    
Marc Duez committed
676
677
678
679
680
681
682
    substrings = [x[common_prefix:] for x in l]

    if max(map (len, substrings)) <= target_length:
        return substrings

    ### Remove suffixes

683
684
685
686
687
688
    common_suffix = 0
    for i in range(min_length - common_prefix):
        if all(map(lambda x: x[-(i+1)] == l[0][-(i+1)], l)):
            common_suffix = i
        else:
            break
Marc Duez's avatar
merge    
Marc Duez committed
689
690
691
692
693
694
695

    substrings = [x[common_prefix:-(common_suffix+1)] for x in l]            

    if max(map (len, substrings)) <= target_length:
        return substrings

    ### Remove the longest common substring
WebTogz's avatar
WebTogz committed
696
697
    
    #Have to replace '' by '_' if the removal have place between 2 substrings 
Marc Duez's avatar
merge    
Marc Duez committed
698

699
    common = common_substring(substrings)
700
701
    if common:
        substrings = [s.replace(common, substring_replacement) for s in substrings]
Marc Duez's avatar
merge    
Marc Duez committed
702

703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
    return substrings
    
    # ### Build dict
    # substrings = {}
    # for x in l:
    #     substrings[x] = x[common_prefix:-(common_suffix+1)]
    # return substrings

 
class AccessedDict(dict):
    '''Dictionary providing a .not_accessed_keys() method
    Note that access with .get(key) are not tracked.

    >>> d = AccessedDict({1:11, 2:22, 3: 33, 4: 44})

    >>> d[1], d[3]
    (11, 33)

    >>> list(d.not_accessed_keys())
    [2, 4]
    '''

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        self.accessed_keys = []

    def __getitem__(self, key):
        self.accessed_keys.append(key)
        return dict.__getitem__(self, key)

    def not_accessed_keys(self):
        for key in self.keys():
            if key in self.accessed_keys:
                continue
            yield key





### some data used for test
w1 = Window(1)
745
w1.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
746
w2 = Window(1)
747
w2.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
748
w3 = Window(1)
749
w3.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
750
w4 = Window(1)
751
w4.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
752
753
754


w5 = Window(1)
755
w5.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
756
w6 = Window(1)
757
w6.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
758
759

lw1 = ListWindows()
Mathieu Giraud's avatar
Mathieu Giraud committed
760
lw1.d["reads"] = json.loads('{"total": [30], "segmented": [25] }', object_hook=lw1.toPython)
761
762
lw1.d["clones"].append(w5)
lw1.d["clones"].append(w6)
763
764

w7 = Window(1)
765
w7.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
766
w8 = Window(1)
767
w8.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
768
769

lw2 = ListWindows()
Mathieu Giraud's avatar
Mathieu Giraud committed
770
lw2.d["reads"] = json.loads('{"total": [40], "segmented": [34] }', object_hook=lw1.toPython)
771
772
lw2.d["clones"].append(w7)
lw2.d["clones"].append(w8)
773
774
775
776
777
778
779
780
781
782
783
784
785
786

    
    

 
def main():
    print "#", ' '.join(sys.argv)

    DESCRIPTION = 'Vidjil utility to parse and regroup list of clones of different timepoints or origins'
    
    #### Argument parser (argparse)

    parser = argparse.ArgumentParser(description= DESCRIPTION,
                                    epilog='''Example:
Mathieu Giraud's avatar
Mathieu Giraud committed
787
  python2 %(prog)s --germline IGH out/Diag/vidjil.data out/MRD-1/vidjil.data out/MRD-2/vidjil.data''',
788
789
790
791
792
793
                                    formatter_class=argparse.RawTextHelpFormatter)


    group_options = parser.add_argument_group() # title='Options and parameters')

    group_options.add_argument('--test', action='store_true', help='run self-tests')
794
    group_options.add_argument('--multi', action='store_true', help='merge different systems from a same timepoint (deprecated, do not use)')
795
    
796
    group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
797
    group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')
798

Marc Duez's avatar
Marc Duez committed
799
    group_options.add_argument('--output', '-o', type=str, default='fused.vidjil', help='output file (%(default)s)')
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
    group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)')

    parser.add_argument('file', nargs='+', help='''input files (.vidjil/.cnltab)''')
  
    args = parser.parse_args()
    # print args

    if args.test:
        import doctest
        doctest.testmod(verbose = True)
        sys.exit(0)

    jlist_fused = None

    print "### fuse.py -- " + DESCRIPTION
    print

Marc Duez's avatar
Marc Duez committed
817
818
819
820
821
822
823
824
    #filtre
    f = []
    for path_name in args.file:
        jlist = ListWindows()
        jlist.load(path_name, args.pipeline)
        f += jlist.getTop(args.top)
    f = sorted(set(f))
    
825
    if args.multi:
826
827
        for path_name in args.file:
            jlist = ListWindows()
828
            jlist.load(path_name, args.pipeline)
Marc Duez's avatar
Marc Duez committed
829
            jlist.build_stat()
830
831
832
833
834
835
836
            
            print "\t", jlist,

            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused * jlist
837
                
838
            print '\t==> merge to', jlist_fused
839
        jlist_fused.d["system_segmented"] = ordered(jlist_fused.d["system_segmented"], key=lambda sys: ((GERMLINES_ORDER + [sys]).index(sys), sys))
840
        
841
842
843
844
    else:
        print "### Read and merge input files"
        for path_name in args.file:
            jlist = ListWindows()
845
            jlist.load(path_name, args.pipeline)
Marc Duez's avatar
Marc Duez committed
846
847
            jlist.build_stat()
            jlist.filter(f)
Marc Duez's avatar
Marc Duez committed
848

849
850
851
852
853
854
855
856
857
858
            w1 = Window(1)
            w2 = Window(2)
            w3 = w1+w2
            
            print "\t", jlist,
            # Merge lists
            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused + jlist
859
            
860
            print '\t==> merge to', jlist_fused
861

862
863
    if args.compress:
        print
864
865
        print "### Select point names"
        l = jlist_fused.d["samples"].d["original_names"]
866
867
868
        ll = interesting_substrings(l)
        print "  <==", l
        print "  ==>", ll
869
        jlist_fused.d["samples"].d["names"] = ll
870
871
    
    print
872
    if not args.multi:
873
        jlist_fused.cut(args.top, jlist_fused.d["samples"].d["number"])
874
875
876
877
878
879
880
881
882
883
    print "\t", jlist_fused 
    print

    print "### Save merged file"
    jlist_fused.save_json(args.output)

    
    
if  __name__ =='__main__':
    main()