#!/usr/bin/env python
# -*- coding: utf-8 -*-

### fuse.py - Vidjil utility to parse and regroup lists of clones from different timepoints or origins

#  This file is part of "Vidjil" <http://bioinfo.lifl.fr/vidjil>
#  Copyright (C) 2011, 2012, 2013, 2014 by Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille
#
#  "Vidjil" is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  "Vidjil" is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>

import collections
import json
import argparse
import sys
import time
import copy
import os.path
import datetime
from operator import itemgetter


VIDJIL_JSON_VERSION = "2014.09"

GERMLINES_ORDER = ['TRA', 'TRB', 'TRG', 'TRD', 'DD', 'IGH', 'DHJH', 'IJK', 'IJL'] 

def ordered(d, key=None):
    '''sorts a dictionary into an OrderedDict'''
    return collections.OrderedDict([(k, d[k]) for k in sorted(d, key=key)])
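# A small usage sketch (hypothetical values), mirroring the call made in main():
#   ordered({'IGK': 5, 'IGH': 12}, key=lambda sys: ((GERMLINES_ORDER + [sys]).index(sys), sys))
# returns an OrderedDict with 'IGH' first, then systems unknown to GERMLINES_ORDER
# in alphabetical order.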

####


def concatenate_with_padding(d, 
                             d1, d1_size, 
                             d2, d2_size,
                             ignore_keys=[]):
        '''Concatenate two dictionaries d1 and d2 into d
        The dictionaries d1 and d2 store several values that are lists with d1_size and d2_size elements,
        and the resulting dictionary stores values that are lists with d1_size + d2_size elements.
        Values that appear in only one of d1 and d2 are padded with zeros ([0, ... 0]) for the missing points.

        >>> d = {}
        >>> d1 = { 'a': [1, 2], 'b': [11, 22], 'z':17 }
        >>> d2 = { 'a': [3, 4, 5], 'c': [333, 444, 555] } 
        >>> concatenate_with_padding(d, d1, 2, d2, 5, ['z'])
        >>> d
        {'a': [1, 2, 3, 4, 5], 'c': [0, 0, 333, 444, 555], 'b': [11, 22, 0, 0, 0, 0, 0]}
        '''

        t1=[]
        t2=[]
        
        for i in range(d1_size):
            t1.append(0)
    
        for i in range(d2_size):
            t2.append(0)
    
        for key in d1:
            if key in ignore_keys:
                continue
            if key not in d2 :
                d[key] = d1[key] + t2
            else :
                d[key] = d1[key]

        for key in d2:
            if key in ignore_keys:
                continue
            if key not in d :
                d[key] = t1 + d2[key]
            else :
                d[key] = d[key] + d2[key]



####

class Window:
    # Should be renamed "Clone"
    '''storage class for sequence information
    with some functions to group sequence information
    
    >>> str(w1)
    '<window : [5] 3 aaa>'
    
    >>> str(w3)
    '<window : [8] 4 aaa>'
    
    >>> str(w1 + w3)
    '<window : [5, 8] 3 aaa>'
    
    
    check that other information is preserved
    
    >>> (w2 + w4).d["test"]
    [0, 'plop']
    
    '''
    
    ### init Window with the minimum data required
    def __init__(self, size):
        self.d={}
        self.d["id"] = ""
        self.d["top"] = sys.maxint
        self.d["reads"] = []
        for i in range(size):
            self.d["reads"].append(0)
        
        
    ### 
    def __iadd__(self, other):
        ### Not used now
        """Add other.reads to self.reads in-place, without extending lists"""

        assert(len(self.d['reads']) == len(other.d['reads']))
        self.d['reads'] = [my + her for (my, her) in zip(self.d['reads'], other.d['reads'])]

        return self
        
    def __add__(self, other):
        """Concat two windows, extending lists such as 'reads'"""
        #data that is not concatenated: it is taken from the window with the smallest 'top' (see below)
        myList = [ "seg", "top", "id", "sequence", "name", "stats", "germline"]
        obj = Window(1)
        
        t1 = []
        t2 = []
        
        for i in range(len(self.d["reads"])):
            t1.append(0)
        for i in range(len(other.d["reads"])):
            t2.append(0)
            
        #concat data; if some data is missing we use an empty buffer t1/t2
        #with the same size as the number of missing values
        for key in self.d :
            if key not in myList :
                if key not in other.d :
                    obj.d[key] = self.d[key] + t2
                else :
                    obj.d[key] = self.d[key]
        
        for key in other.d :
            if key not in myList :
                if key not in obj.d :
                    obj.d[key] = t1 + other.d[key]
                else :
                    obj.d[key] = obj.d[key] + other.d[key]
                    
        #keep the data that is not concatenated, taking it from the window with the smallest 'top'
        if other.d["top"] < self.d["top"] :
            for key in other.d :
                if key in myList :
                    obj.d[key] = other.d[key]
        else :
            for key in self.d :
                if key in myList :
                    obj.d[key] = self.d[key]
        
        return obj
        
    ### print essential info about Window
    def __str__(self):
        return "<window : %s %s %s>" % ( self.d["reads"], '*' if self.d["top"] == sys.maxint else self.d["top"], self.d["id"])
        
class Samples: 

    def __init__(self):
        self.d={}
        self.d["number"] = "1"
        self.d["original_names"] = [""]

    def __add__(self, other):
        obj=Samples()

        concatenate_with_padding(obj.d, 
                                 self.d, self.d['number'], 
                                 other.d, other.d['number'],
                                 ['number'])

        obj.d["number"] =  int(self.d["number"]) + int(other.d["number"])
        
        return obj

    def __str__(self):
        return "<Samples: %s>" % self.d

class Reads: 

    def __init__(self):
        self.d={}
        self.d["total"] = ["0"]
        self.d["segmented"] = ["0"]
        self.d["germline"] = {}

    def __add__(self, other):
        obj=Reads()

        concatenate_with_padding(obj.d['germline'], 
                                 self.d['germline'], len(self.d['total']),
                                 other.d['germline'], len(other.d['total']),
                                 ['total'])
                
        obj.d["total"] = self.d["total"] + other.d["total"]
        obj.d["segmented"] = self.d["segmented"] + other.d["segmented"]
        return obj

    def __str__(self):
        return "<Reads: %s>" % self.d
        
class OtherWindows:
    
    """Aggregate counts of windows that are discarded (due to too small 'top') for each point into several 'others-' windows."""

    def __init__(self, length, ranges = [1000, 100, 10, 1]):
        self.ranges = ranges
        self.length = length
        self.sizes = {}

        for r in self.ranges:
            self.sizes[r] = [0 for i in range(self.length)]


    def __iadd__(self, window):
        """Add to self a window. The different points may land into different 'others-' windows."""

        for i, s in enumerate(window.d["reads"]):
            for r in self.ranges:
                if s >= r:
                     break
            self.sizes[r][i] += s
            ## TODO: add seg_stat
            
        # print window, '-->', self.sizes
        return self

    def __iter__(self):
        """Yield the 'others-' windows"""

        for r in self.ranges:
            w = Window(self.length)
            w.d['reads'] = self.sizes[r]
            w.d['window'] = 'others-%d' % r
            w.d['top'] = 0

            print '  --[others]-->', w
            yield w
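# Usage sketch (hypothetical numbers; note that cut() currently keeps this
# aggregation disabled): with two points, a discarded clone whose reads are
# [150, 3] adds 150 to the 'others-100' window at point 0 and 3 to the
# 'others-1' window at point 1:
#   others = OtherWindows(2)
#   others += clone           # for each discarded clone
#   for w in others:          # yields one aggregated 'others-*' Window per range
#       ...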
        
class ListWindows:
    '''storage class for a list of windows (clones) and their information
    
    >>> lw1.info()
    <ListWindows: [25] 2>
    <window : [5] 3 aaa>
    <window : [12] 2 bbb>
    
    >>> lw2.info()
    <ListWindows: [34] 2>
    <window : [8] 4 aaa>
    <window : [2] 8 ccc>
    
    >>> lw3 = lw1 + lw2
    >>> lw3.info()
    <ListWindows: [25, 34] 3>
    <window : [12, 0] 2 bbb>
    <window : [5, 8] 3 aaa>
    <window : [0, 2] 8 ccc>
    
    '''
    
    def __init__(self):
        '''init ListWindows with the minimum data required'''
        self.d={}
        self.d["samples"] = [Samples()]
        self.d["reads"] = ""
        self.d["clones"] = []
        self.d["clusters"] = []
        
    def __str__(self):
        return "<ListWindows: %s %d>" % ( self.d["reads"].d["segmented"], len(self) )

    # Iterator and access functions
    def __iter__(self):
        return self.d["clones"].__iter__()

    def __getitem__(self, i):
        ### not efficient !
        for clone in self:
            if clone.d["id"] == i:
                return clone

    def __len__(self):
        return len(self.d["clones"])

    ### print info about each Window stored
    def info(self):
        print self
        for clone in self:
            print clone

    ### check vidjil_json_version
    def check_version(self, filepath):
        if "vidjil_json_version" in self.d:
            if self.d["vidjil_json_version"][0] >= VIDJIL_JSON_VERSION:
                return
        raise IOError ("File '%s' is too old -- please regenerate it with a newer version of Vidjil" % filepath)
        
    ### compute statistics about clones
    def build_stat(self):
        ranges = [1000, 100, 10, 1]
        result = [[0 for col in range(len(self.d['reads'].d["segmented"]))] for row in range(len(ranges))]

        for clone in self:
            for i, s in enumerate(clone.d["reads"]):
                for r in range(len(ranges)):
                    if s >= ranges[r]:
                        break 
                result[r][i] += s
                
        print result
        for r in range(len(ranges)):
            self.d["reads-distribution-"+str(ranges[r])] = result[r]
            
        #TODO V/D/J distrib and more 
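        # Worked example (hypothetical counts): with ranges [1000, 100, 10, 1],
        # a clone whose reads are [1500, 42, 0] contributes 1500 to
        # 'reads-distribution-1000' at point 0, 42 to 'reads-distribution-10'
        # at point 1, and nothing at point 2.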
        
        
    ### save / load to .json
    def save_json(self, output):
        '''save ListWindows in .json format''' 
        print "==>", output
        with open(output, "w") as file:
            json.dump(self, file, indent=2, default=self.toJson)
            
    def load(self, file_path, pipeline):
        '''init listWindows with data file
        Detects and selects the parser according to the file extension.'''

        # name = file_path.split("/")[-1]
        extension = file_path.split('.')[-1]
        
        print "<==", file_path, "\t",
        # try:
        with open(file_path, "r") as file:
            tmp = json.load(file, object_hook=self.toPython)     
            tmp.d['reads'] = tmp.d['reads'][0]
            tmp.d['samples'] = tmp.d['samples'][0]
            self.d=tmp.d
            self.check_version(file_path)
        # except Exception:
        #     self.load_clntab(file_path)
        #     pass
        if pipeline: 
            # renaming, private pipeline
            f = '/'.join(file_path.split('/')[2:-1])
            print "[%s]" % f

        else:
            f = file_path
            print
        

    def getTop(self, top):
        result = []
        
        for clone in self:
            if clone.d["top"] <= top :
                result.append(clone.d["id"])
        return result
        
    def filter(self, f):
        r = []
        
        reverseList = {}
        for i,w in enumerate(self.d["clones"]) :
            reverseList[w.d["id"]] = i
        
        #filteredList = { k: reverseList[k] for k in f }
        filteredList = dict(filter(lambda t: t[0] in f, reverseList.items()))
        
        for i in filteredList:
            r.append(self.d["clones"][filteredList[i]])
        
        self.d["clones"] = r
        
    ### 
    def __add__(self, other): 
        '''Combine two ListWindows into a unique ListWindows'''
        obj = ListWindows()
        l1 = len(self.d["reads"].d['segmented'])
        l2 = len(other.d["reads"].d['segmented'])

        concatenate_with_padding(obj.d, 
                                 self.d, l1,
                                 other.d, l2,
                                 ["clones", "links"])
        
        obj.d["clones"]=self.fuseWindows(self.d["clones"], other.d["clones"], l1, l2)
        obj.d["samples"] = self.d["samples"] + other.d["samples"]
        obj.d["reads"] = self.d["reads"] + other.d["reads"]
        obj.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]

        return obj
        
    ###
    def __mul__(self, other):
        
        for i in range(len(self.d["reads_segmented"])):
            self.d["reads_segmented"][i] += other.d["reads_segmented"][i] 
        
        self.d["clones"] += other.d["clones"]
        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        
        self.d["system_segmented"].update(other.d["system_segmented"])
        
        return self
        
    ### 
    def fuseWindows(self, w1, w2, l1, l2) :
        #store data in dict with "id" as key
        dico1 = {}
        for i in range(len(w1)) :
            dico1[ w1[i].d["id"] ] = w1[i]

        dico2 = {}
        for i in range(len(w2)) :
            dico2[ w2[i].d["id"] ] = w2[i]

        #concat data with the same key ("id")
        #if a clone is missing in one of the lists, we concatenate with an empty Window()
        dico3 = {}
        for key in dico1 :
            if key in dico2 :
                dico3[key] = dico1[key] + dico2[key]
            else :
                w=Window(l2)
                dico3[key] = dico1[key] + w

        for key in dico2 :
            if key not in dico1 :
                w=Window(l1)
                dico3[key] = w + dico2[key]
        
        
        #sort by top
        tab = []
        for key in dico3 :
            tab.append((dico3[key], dico3[key].d["top"]))
        tab = sorted(tab, key=itemgetter(1))
        
        #store back in a List
        result=[]
        for i in range(len(tab)) :
            result.append(tab[i][0])
        
        return result
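    # Illustrative sketch (this mirrors the ListWindows doctest above): fusing a
    # list with clones 'aaa' [5] and 'bbb' [12] with a list holding 'aaa' [8] and
    # 'ccc' [2] gives 'aaa' [5, 8], 'bbb' [12, 0] and 'ccc' [0, 2], sorted by 'top'.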
        
    
    def cut(self, limit, nb_points):
        '''Remove windows that never enter the top 'limit' of the most represented clones at any point (aggregation into 'others' windows is currently disabled).'''

        length = len(self)
        w=[]

        others = OtherWindows(nb_points)

        for clone in self:
            if (int(clone.d["top"]) < limit or limit == 0) :
                w.append(clone)
            #else:
                #others += clone

        self.d["clones"] = w #+ list(others) 

        print "### Cut merged file, keeping window in the top %d for at least one point" % limit
        return self
        
    def load_clntab(self, file_path):
        '''Parser for .clntab file'''

        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        time = os.path.getmtime(file_path)
        self.d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")]
        self.d["samples"][0].d["original_names"] = [file_path]
        
        listw = []
        listc = []
        total_size = 0
        
        fichier = open(file_path,"r")
        for ligne in fichier:
            if "clonotype" in ligne:
                header_map = ligne.replace('\n', '').split('\t')
            else :
                tab = AccessedDict()
                for index, data in enumerate(ligne.split('\t')):
                    tab[header_map[index]] = data
                        
                w=Window(1)
                #w.window=tab["sequence.seq id"] #use sequence id as window for .clntab data
                w.d["id"]=tab["sequence.raw nt seq"] #use sequence as window for .clntab data
                s = int(tab["sequence.size"])
                total_size += s
                w.d["reads"]=[ s ]

                w.d["sequence"] = tab["sequence.raw nt seq"]
                w.d["V"]=tab["sequence.V-GENE and allele"].split('=')
                if (tab["sequence.D-GENE and allele"] != "") :
                    w.d["D"]=tab["sequence.D-GENE and allele"].split('=')
                w.d["J"]=tab["sequence.J-GENE and allele"].split('=')

                # use sequence.JUNCTION to colorize output (this is not exactly V/J !)
                junction = tab.get("sequence.JUNCTION.raw nt seq")
                position = w.d["sequence"].find(junction)
                if position >= 0:
                    w.d["Jstart"] = position + len(junction)
                    w.d["Vend"] = position 
                    w.d["Nlength"] = len(junction)
                else:
                    w.d["Jstart"] = 0
                    w.d["Vend"] = len(w.d["sequence"])
                    w.d["Nlength"] = 0
                w.d["name"]=w.d["V"][0] + " -x/y/-z " + w.d["J"][0]
                w.d["Dend"]=0
                w.d["Dstart"]=0
                    
                listw.append((w , w.d["reads"][0]))

                raw_clonotype = tab.get("clonotype")
                clonotype = raw_clonotype.split(' ')
                if (len(clonotype) > 1) :
                    listc.append((w, raw_clonotype))
                
                #keep data that has not already been stored
                for header in tab.not_accessed_keys():
                    w.d["_"+header] = [tab[header]]

        #sort by sequence_size
        listw = sorted(listw, key=itemgetter(1), reverse=True)
        #sort by clonotype
        listc = sorted(listc, key=itemgetter(1))
        
        #generate data "top"
        for index in range(len(listw)):
            listw[index][0].d["top"]=index+1
            self.d["clones"].append(listw[index][0])
        
        self.d["reads_segmented"] = [total_size]

        
    def toJson(self, obj):
        '''Serializer for json module'''
        if isinstance(obj, ListWindows) or isinstance(obj, Window) or isinstance(obj, Samples) or isinstance(obj, Reads):
            result = {}
            for key in obj.d :
                result[key]= obj.d[key]
                
            return result
        
        else:
            result = {}
            for key in obj :
                result[key]= obj[key]
            
            return result

    def toPython(self, obj_dict):
        '''Reverse serializer for json module'''
        if "samples" in obj_dict:
            obj = ListWindows()
            for key in obj_dict :
                if isinstance(obj_dict[key], list):
                    obj.d[key]=obj_dict[key]
                else :
                    obj.d[key]=[obj_dict[key]]
            return obj

        if "id" in obj_dict:
            obj = Window(1)
            for key in obj_dict :
                obj.d[key]=obj_dict[key]
            return obj
        
        if "total" in obj_dict:
            obj = Reads()
            for key in obj_dict :
                obj.d[key]=obj_dict[key]
            return obj
            
        if "original_names" in obj_dict:
            obj = Samples()
            for key in obj_dict :
                obj.d[key]=obj_dict[key]
            return [obj]
            
        res = {}
        for key in obj_dict :
            res[key]=obj_dict[key]
        return res
        


######
def common_substring(l):
    '''Return the longest common substring among the strings in the list
    >>> common_substring(['abcdfffff', 'ghhhhhhhhhbcd'])
    'bcd'
    >>> common_substring(['abcdfffff', 'ghhhhhhhhh'])
    ''
    >>> common_substring(['b-abc-123', 'tuvwxyz-abc-321', 'tuvwxyz-abc-456', 'd-abc-789'])
    '-abc-'
    '''

    table = []
    for s in l:
        # adds in table all substrings of s - duplicate substrings in s are added only once
        table += set(s[j:k] for j in range(len(s)) for k in range(j+1, len(s)+1))

    # sort substrings by length (descending)
    table = sorted(table, cmp=lambda x,y: cmp(len(y), len(x)))
    # get the position of duplicates and get the first one (longest)
    duplicates=[i for i, x in enumerate(table) if table.count(x) == len(l)]
    if len(duplicates) > 0:
        return table[duplicates[0]]
    else:
        return ""

def interesting_substrings(l, target_length=6, substring_replacement='-'):
    '''Return a list with interesting substrings.
    Now it removes common prefixes and suffixes, and then the longest 
    common substring. 
    But it stops removing once all lengths are at most 'target_length'.

    >>> interesting_substrings(['ec-3--bla', 'ec-512-bla', 'ec-47-bla'], target_length=0)
    ['3-', '512', '47']
    >>> interesting_substrings(['ec-A-seq-1-bla', 'ec-B-seq-512-bla', 'ec-C-seq-21-bla'], target_length=0, substring_replacement='')
    ['A1', 'B512', 'C21']
    >>> interesting_substrings(['ec-A-seq-1-bla', 'ec-B-seq-512-bla', 'ec-C-seq-21-bla'], target_length=0)
    ['A-1', 'B-512', 'C-21']
    >>> interesting_substrings(['ec-A-seq-1-bla', 'ec-B-seq-512-bla', 'ec-C-seq-21-bla'], target_length=9)
    ['A-seq-1', 'B-seq-512', 'C-seq-21']
    '''

    if not l:
        return {}

    if max(map (len, l)) <= target_length:
        return l

    min_length = min(map (len, l))

    ### Remove prefixes

    common_prefix = 0
    for i in range(min_length):
        if all(map(lambda x: x[i] == l[0][i], l)):
            common_prefix = i+1
        else:
            break

    substrings = [x[common_prefix:] for x in l]

    if max(map (len, substrings)) <= target_length:
        return substrings

    ### Remove suffixes

    common_suffix = 0
    for i in range(min_length - common_prefix):
        if all(map(lambda x: x[-(i+1)] == l[0][-(i+1)], l)):
            common_suffix = i
        else:
            break

    substrings = [x[common_prefix:-(common_suffix+1)] for x in l]            

    if max(map (len, substrings)) <= target_length:
        return substrings

    ### Remove the longest common substring
    
    # The removed common substring is replaced by 'substring_replacement' so that the two flanking parts are not glued together

    common = common_substring(substrings)
    if common:
        substrings = [s.replace(common, substring_replacement) for s in substrings]

    return substrings
    
    # ### Build dict
    # substrings = {}
    # for x in l:
    #     substrings[x] = x[common_prefix:-(common_suffix+1)]
    # return substrings

 
class AccessedDict(dict):
    '''Dictionary providing a .not_accessed_keys() method
    Note that accesses with .get(key) are not tracked.

    >>> d = AccessedDict({1:11, 2:22, 3: 33, 4: 44})

    >>> d[1], d[3]
    (11, 33)

    >>> list(d.not_accessed_keys())
    [2, 4]
    '''

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        self.accessed_keys = []

    def __getitem__(self, key):
        self.accessed_keys.append(key)
        return dict.__getitem__(self, key)

    def not_accessed_keys(self):
        for key in self.keys():
            if key in self.accessed_keys:
                continue
            yield key





### some data used for test
w1 = Window(1)
w1.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
w2 = Window(1)
w2.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
w3 = Window(1)
w3.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
w4 = Window(1)
w4.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }


w5 = Window(1)
w5.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
w6 = Window(1)
w6.d ={"id" : "bbb", "reads" : [12], "top" : 2 }

lw1 = ListWindows()
lw1.d["reads"] = json.loads('{"total": [30], "segmented": [25] }', object_hook=lw1.toPython)
lw1.d["clones"].append(w5)
lw1.d["clones"].append(w6)

w7 = Window(1)
w7.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
w8 = Window(1)
w8.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }

lw2 = ListWindows()
lw2.d["reads"] = json.loads('{"total": [40], "segmented": [34] }', object_hook=lw1.toPython)
lw2.d["clones"].append(w7)
lw2.d["clones"].append(w8)

    
    

 
def main():
    print "#", ' '.join(sys.argv)

    DESCRIPTION = 'Vidjil utility to parse and regroup lists of clones from different timepoints or origins'
    
    #### Argument parser (argparse)

    parser = argparse.ArgumentParser(description= DESCRIPTION,
                                    epilog='''Example:
  python2 %(prog)s --germline IGH out/Diag/vidjil.data out/MRD-1/vidjil.data out/MRD-2/vidjil.data''',
                                    formatter_class=argparse.RawTextHelpFormatter)


    group_options = parser.add_argument_group() # title='Options and parameters')

    group_options.add_argument('--test', action='store_true', help='run self-tests')
    group_options.add_argument('--multi', action='store_true', help='merge multiple systems (experimental)')
    
    group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
    group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')

    group_options.add_argument('--output', '-o', type=str, default='fused.data', help='output file (%(default)s)')
    group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)')

    parser.add_argument('file', nargs='+', help='''input files (.vidjil/.clntab)''')
  
    args = parser.parse_args()
    # print args

    if args.test:
        import doctest
        doctest.testmod(verbose = True)
        sys.exit(0)

    jlist_fused = None

    print "### fuse.py -- " + DESCRIPTION
    print

    # filter: collect the ids of the clones that are in the top TOP of at least one input file
    f = []
    for path_name in args.file:
        jlist = ListWindows()
        jlist.load(path_name, args.pipeline)
        f += jlist.getTop(args.top)
    f = sorted(set(f))
    
    if args.multi:
        for path_name in args.file:
            jlist = ListWindows()
            jlist.load(path_name, args.pipeline)
            jlist.build_stat()
            
            print "\t", jlist,

            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused * jlist
                
            print '\t==> merge to', jlist_fused
        jlist_fused.d["system_segmented"] = ordered(jlist_fused.d["system_segmented"], key=lambda sys: ((GERMLINES_ORDER + [sys]).index(sys), sys))
        
    else:
        print "### Read and merge input files"
        for path_name in args.file:
            jlist = ListWindows()
            jlist.load(path_name, args.pipeline)
            jlist.build_stat()
            jlist.filter(f)

            w1 = Window(1)
            w2 = Window(2)
            w3 = w1+w2
            
            print "\t", jlist,
            # Merge lists
            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused + jlist
            
            print '\t==> merge to', jlist_fused

    if args.compress:
        print
        print "### Select point names"
        l = jlist_fused.d["samples"].d["original_names"]
        ll = interesting_substrings(l)
        print "  <==", l
        print "  ==>", ll
        jlist_fused.d["samples"].d["names"] = ll
    
    print
    if not args.multi:
        jlist_fused.cut(args.top, jlist_fused.d["samples"].d["number"])
    print "\t", jlist_fused 
    print

    print "### Save merged file"
    jlist_fused.save_json(args.output)

    
    
if  __name__ =='__main__':
    main()