fuse.py 40.4 KB
Newer Older
Ryan Herbert's avatar
Ryan Herbert committed
1
#!/usr/bin/env python3
2
3
4
5
# -*- coding: utf-8 -*-

### fuse.py - Vidjil utility to parse and regroup list of clones of different timepoints or origins

6
7
#  This file is part of Vidjil <http://www.vidjil.org>,
#  High-throughput Analysis of V(D)J Immune Repertoire.
8
#  Copyright (C) 2011-2017 by Bonsai bioinformatics
9
10
11
12
13
#  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
#  Contributors: 
#      Marc Duez <marc.duez@vidjil.org>
#      Mathieu Giraud <mathieu.giraud@vidjil.org>
#      The Vidjil Team <contact@vidjil.org>
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#
#  "Vidjil" is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  "Vidjil" is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>

Mathieu Giraud's avatar
Mathieu Giraud committed
28
from __future__ import print_function

import check_python_version
import sys
import json
import argparse
import os.path
import os
import datetime
import subprocess
import tempfile
from operator import itemgetter, le
from utils import *
from defs import *
from collections import defaultdict
# 'pipes.quote' was an undocumented alias of shlex.quote; the 'pipes' module
# is deprecated since Python 3.11 (PEP 594) and removed in Python 3.13.
from shlex import quote
44

45
FUSE_VERSION = "vidjil fuse"
46

47
TOOL_SIMILARITY = "../algo/tools/similarity"
48
SIMILARITY_LIMIT = 100
49

50
51
GERMLINES_ORDER = ['TRA', 'TRB', 'TRG', 'TRD', 'DD', 'IGH', 'DHJH', 'IJK', 'IJL'] 

52
53
54
####

class Window:
    # Should be renamed "Clone"
    '''storage class for sequence informations
    with some function to group sequence informations
    
    >>> str(w1)
    '<window : [5] 3 aaa>'
    
    >>> str(w3)
    '<window : [8] 4 aaa>'
    
    >>> str(w1 + w3)
    '<window : [5, 8] 3 aaa>'
    
    
    check if other information are conserved
    
    >>> (w2 + w4).d["test"]
    [0, 'plop']
    
    >>> w1.get_values("name")
    '?'
    
    >>> w1.get_values("top")
    3
  
    >>> w7   = Window(1)
    >>> w7.d = seg_w7
    >>> w7.d["seg"]["5"]["name"]
    'TRAV17*01'
      
    >>> w7.get_values("name")
    'TRAV17*01 0/CCGGGG/5 TRAJ40*01'
    
    >>> w7.get_values("seg5")
    'TRAV17*01'
    '''
    
    ### init Window with the minimum data required
    def __init__(self, size):
        """Create an empty clone whose 'reads' list holds *size* zeros."""
        self.d={}
        self.d["id"] = ""
        self.d["top"] = sys.maxsize
        self.d["reads"] = []
        for i in range(size):
            self.d["reads"].append(0)
        
        
    ### 
    def __iadd__(self, other):
        ### Not used now
        """Add other.reads to self.reads in-place, without extending lists"""
        assert(len(self.d['reads']) == len(other.d['reads']))
        self.d['reads'] = [my + her for (my, her) in zip(self.d['reads'], other.d['reads'])]
        return self
        
    def __add__(self, other):
        """Concat two windows, extending lists such as 'reads'"""
        #data we don't need to duplicate
        myList = [ "seg", "top", "id", "sequence", "name", "id", "stats", "germline"]
        obj = Window(1)
        
        # 'id' and 'top' will be taken from 'topmost' clone
        del obj.d["id"]
        del obj.d["top"]

        # Data of type 'list'
        concatenate_with_padding(obj.d,
                                 self.d, len(self.d["reads"]),
                                 other.d, len(other.d["reads"]),
                                 myList)
                    
        # All other data, including 'top'
        # When there are conflicting keys, keep data from the 'topmost' clone
        order = [other, self] if other.d["top"] < self.d["top"] else [self, other]

        for source in order:
            for key in source.d:
                if key not in obj.d:
                    obj.d[key] = source.d[key]

        return obj
        
    def get_nb_reads(self, cid, point=0):
        # NOTE(review): Window defines no __getitem__, so self[cid] raises
        # TypeError -- this looks like code inherited from ListWindows where
        # self[cid] returns a clone; confirm before use.
        return self[cid]["reads"][point]

    def get_reads(self, t=-1):
        """Reads of this clone: the whole per-point list, or point *t* only."""
        if t== -1:
            return self.d["reads"]
        else:
            return self.d["reads"][t]

    def is_segmented(self):
        """True when this clone carries segmentation data ('seg' key)."""
        if not "seg" in self.d.keys():
            return False
        return True


    def get_axes_values(self, axes, timepoint):
        """Return the values of given axes for this clone at a given timepoint"""
        values = []
        for axis in axes:
            values.append(self.get_values(axis, timepoint))
        return values


    def get_values(self, axis, timepoint=0):
        """Return the value of an axis for this clone at a given timepoint.

        Unknown axes return 'unknow_axis'; any lookup/compute error
        returns '?' (see the broad except below, kept on purpose)."""

        # direct paths into self.d
        axes = {
            "top":      ["top"],
            "germline": ["germline"],
            "name":     ["name"],
            "seg5":     ["seg",  "5", "name"],
            "seg4":     ["seg",  "4", "name"],
            "seg4a":    ["seg", "4a", "name"],
            "seg4b":    ["seg", "4b", "name"],
            "seg3":     ["seg",  "3", "name"],
            "evalue":   ["evalue","val"],
            "seg5_delRight": ["seg","5","delRight"],
            "seg3_delLeft":  ["seg","3","delLeft"],
            "seg4_delRight": ["seg","4","delRight"],
            # was a duplicate "seg3_delLeft" key, silently shadowing the
            # seg-3 mapping above and making "seg4_delLeft" unreachable
            "seg4_delLeft":  ["seg","4","delLeft"],
            ### Positions
            "seg5_stop":  ["seg","5","stop"],
            "seg3_start": ["seg","3","start"],
            "seg4_stop":  ["seg","4","stop"],
            "seg4_start": ["seg","4","start"],
            "cdr3_stop":  ["seg","cdr3","stop"],
            "cdr3_start": ["seg","cdr3","start"],
            "junction_stop":  ["seg","junction","stop"],
            "junction_start": ["seg","junction","start"],
            "productive":     ["seg","junction","productive"],
            "lenSeqAverage" : ["_average_read_length"]
        }

        ### length
        axes_length = {
            "insert_53": ["seg3_start", "seg5_stop", -1],
            "insert_54": ["seg4_start", "seg5_stop", -1],
            "insert_43": ["seg3_start", "seg4_stop", -1],
            "lenCDR3": ["cdr3_stop", "cdr3_start", 0],
            "lenJunction": ["junction_stop", "junction_start", 0]
        }

        try:
            if axis in axes.keys():
                path  = axes[axis]
                depth = 0
                value = self.d
                if path[0] == "seg" and not self.is_segmented():
                    return "?"

                while depth != len(path):
                    value  = value[path[depth]]
                    depth += 1
                if type(value) is list:
                    # In these cases, should be a list of value at different timepoint
                    return value[timepoint]
                return value

            ############################################################
            ### These axes need to make some computation on clone fields
            if axis == "lenSeq":
                return len(self.d["sequence"])

            if axis in axes_length.keys():
                path = axes_length[axis]
                if path[0] == "seg" and not self.is_segmented():
                    return "not_segmented"
                return self.get_values(path[0]) - self.get_values(path[1]) + path[2]


            if axis == "GCContent":
                return self.getGCcontent()

            # FIXME: need to get the rigth position value
            # If the clone have multiple sample, need to select the current.
            # Should not apply if executed from the vidjil server, without previous fused files
            if axis == "lenSeqConsensus":
                return self.get_values("lenSeqAverage")/ self.get_values("lenSeq")
            if axis == "coverage":
                return round(self.d["_coverage"][0]*100, 2)

            if axis == "complete":
                germline = self.get_values("germline")
                if "+" in germline:
                    return False
                elif germline in ["unexpected", "IKZF1", "ERG"]:
                    # maybe more specific later ?
                    return germline
                elif germline in ["IGH","IGK","IGL","TRA","TRB","TRD","TRG"]:
                    # List can be set if new complete germline is defined
                    return True
                else:
                    return "unexpected"
            if axis == "rearrangement":
                return self.get_rearrangement()

            return "unknow_axis"
        except:
            # any missing key/index or type mismatch degrades to '?'
            return "?"

    def getGCcontent(self):
        """ Compute and return the GC ratio of a sequence """
        if len(self.d["sequence"]) == 0:
            return "?"

        gc = 0
        for nt in self.d["sequence"]:
            if nt in "GgCc":
                gc += 1

        GCContent = gc / len(self.d["sequence"])
        # return rounded value for better association of clones
        return round( GCContent, 2)


    def get_rearrangement( self):
        """Get the rearrangement type of this clone"""
        if not self.is_segmented():
            return "?"

        complete = self.get_values("complete")
        seg_vals = {
            "5":  self.get_values("seg5"),
            "4a": self.get_values("seg4a"),
            "4":  self.get_values("seg4"),
            "4b": self.get_values("seg4b"),
            "3":  self.get_values("seg3")
        }

        if complete == True:
            if seg_vals["4a"] == "?" and seg_vals["4"] == "?" and seg_vals["4b"] == "?":
                return "VJ"
            elif seg_vals["4a"] == "?" and seg_vals["4b"] == "?":
                return "VDJ"
            elif seg_vals["4a"] == "?" or seg_vals["4b"] == "?":
                return "VDDJ"
            else:
                return "VDDDJ"

        elif complete == False:
            # cas compliqué, DJ etant un 53, DD aussi et VD aussi
            # (tricky case: DJ, DD and VD all look like a 5'-3' join)
            if "V" in seg_vals["5"] and "D" in seg_vals["3"]:
                return "VD"
            elif "D" in seg_vals["5"] and "J" in seg_vals["3"]:
                return "DJ"
            elif "D" in seg_vals["5"] and "D" in seg_vals["3"]:
                return "DD"

        elif complete in ["unexpected", "IKZF1", "ERG"]:
            return complete
        else:
            return "?"


    def latex(self, point=0, base_germline=10000, base=10000, tag=''):
        """LaTeX table row for this clone at *point*: reads, percentage over
        *base_germline* and over *base*."""
        reads = self.d["reads"][point]
        ratio_germline = float(reads)/base_germline
        ratio = float(reads)/base
        return r"   &   & %7d & %5.2f\%% & of %-4s & %5.2f\%% & %-50s \\ %% %4s %s" % (reads, ratio_germline * 100, self.d['germline'], ratio * 100,
                                                                  self.d["name"] if 'name' in self.d else self.d["id"],
                                                                  tag, self.d["id"])

    ### print essential info about Window
    def __str__(self):
        return "<window : %s %s %s>" % ( self.d["reads"], '*' if self.d["top"] == sys.maxsize else self.d["top"], self.d["id"])
324
        
Marc Duez's avatar
Marc Duez committed
325
class Samples: 

    def __init__(self):
        """One unnamed sample by default."""
        self.d = {"number": 1, "original_names": [""]}

    def __add__(self, other):
        """Concatenate two sample sets; list-valued entries are padded by
        concatenate_with_padding, 'number' is summed."""
        merged = Samples()
        concatenate_with_padding(merged.d,
                                 self.d, self.d['number'],
                                 other.d, other.d['number'],
                                 ['number'])
        merged.d["number"] = int(self.d["number"]) + int(other.d["number"])
        return merged

    def __str__(self):
        return "<Samples: %s>" % self.d

347
348
class Diversity: 

    # diversity indices tracked per sample
    keys = ["index_H_entropy", "index_E_equitability", "index_Ds_diversity"]

    def __init__(self, data=None):
        """Start a one-element list per index, 'na' when absent from *data*."""
        self.d = {}
        for key in self.keys:
            available = data is not None and key in data
            self.d[key] = [data[key]] if available else ["na"]

    def __add__(self, other):
        """Append other's first value for each index.
        NOTE: mutates and returns self (no new object is created)."""
        for key in self.keys:
            self.d[key].append(other.d[key][0])
        return self

    def __str__(self):
        return "<Diversity: %s>" % self.d
        
368
369
370
371
372
373
374
class Reads: 

    def __init__(self):
        """Counters start with a single point and zero reads."""
        self.d = {
            "total": ["0"],
            "segmented": ["0"],
            "germline": {},
            "distribution": {},
        }

    def __add__(self, other):
        """Concatenate the read counters of two sample sets; the per-germline
        and per-distribution maps are padded by concatenate_with_padding."""
        merged = Reads()

        concatenate_with_padding(merged.d['germline'],
                                 self.d['germline'], len(self.d['total']),
                                 other.d['germline'], len(other.d['total']),
                                 ['total'])
        concatenate_with_padding(merged.d['distribution'],
                                 self.d['distribution'], len(self.d['total']),
                                 other.d['distribution'], len(other.d['total']),
                                 ['total'])

        merged.d["total"] = self.d["total"] + other.d["total"]
        merged.d["segmented"] = self.d["segmented"] + other.d["segmented"]
        return merged

    def __str__(self):
        return "<Reads: %s>" % self.d
395
        
396
397
398
399
class OtherWindows:
    
    """Aggregate counts of windows that are discarded (due to too small 'top') for each point into several 'others-' windows."""

    def __init__(self, length, ranges = None):
        """*length*: number of points; *ranges*: bucket thresholds, largest first."""
        self.ranges = [1000, 100, 10, 1] if ranges is None else ranges
        self.length = length
        # one zero-filled per-point counter list per bucket
        self.sizes = {threshold: [0] * self.length for threshold in self.ranges}


    def __iadd__(self, window):
        """Add to self a window. The different points may land into different 'others-' windows."""

        for point, count in enumerate(window.d["reads"]):
            # first threshold not exceeding the count; when none matches,
            # the last (smallest) bucket catches the remainder
            for bucket in self.ranges:
                if count >= bucket:
                    break
            self.sizes[bucket][point] += count
            ## TODO: add seg_stat

        # print window, '-->', self.sizes
        return self

    def __iter__(self):
        """Yield the 'others-' windows"""

        for threshold in self.ranges:
            aggregated = Window(self.length)
            aggregated.d['reads'] = self.sizes[threshold]
            aggregated.d['window'] = 'others-%d' % threshold
            aggregated.d['top'] = 0

            print('  --[others]-->', aggregated)
            yield aggregated
433
        
434
class ListWindows(VidjilJson):
435
436
437
    '''storage class for sequences informations 
    
    >>> lw1.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
438
    <ListWindows: [25] 2>
439
440
441
442
    <window : [5] 3 aaa>
    <window : [12] 2 bbb>
    
    >>> lw2.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
443
    <ListWindows: [34] 2>
444
445
446
    <window : [8] 4 aaa>
    <window : [2] 8 ccc>
    
Mathieu Giraud's avatar
Mathieu Giraud committed
447
    >>> lw3 = lw1 + lw2
448
    >>> lw3.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
449
    <ListWindows: [25, 34] 3>
450
451
452
    <window : [12, 0] 2 bbb>
    <window : [5, 8] 3 aaa>
    <window : [0, 2] 8 ccc>
453
454
455
456
457
458
459
460
461
462
    >>> lw3.build_stat()
    >>> lw3.d['reads'].d['distribution']['0.1']
    [17, 8]
    >>> lw3.d['reads'].d['distribution']['0.01']
    [0, 2]
    >>> lw3.d['reads'].d['distribution']['0.001']
    [0, 0]
    >>> lw3.d['reads'].d['distribution']['0.0001']
    [0, 0]

463
464
465
466
467
    '''
    
    def __init__(self):
        '''init ListWindows with the minimum data required'''
        self.d = {
            "samples":  Samples(),
            "reads":    Reads(),
            "clones":   [],
            "clusters": [],
            "germlines": {},
            "vidjil_json_version": VIDJIL_JSON_VERSION,
            "producer": FUSE_VERSION,
            "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        }
        
478
    def __str__(self):
        """Summary line: per-point segmented read counts and clone count."""
        segmented = self.d["reads"].d["segmented"]
        return "<ListWindows: %s %d>" % (segmented, len(self))

    # Iterator and access functions
    def __iter__(self):
        """Iterate over the stored clones."""
        return iter(self.d["clones"])

    def __getitem__(self, i):
        ### not efficient !
        """Linear search of a clone by its 'id' (None when absent)."""
        return next((clone for clone in self if clone.d["id"] == i), None)

    def __len__(self):
        """Number of stored clones."""
        return len(self.d["clones"])
493
494
495

    ### print info about each Windows stored 
    def info(self):
        """Print a one-line summary, then one line per clone."""
        print(self)
        for clone in self.d["clones"]:
            print(clone)
499
        
500
501
    ### compute statistics about clones
    def build_stat(self):
        """Fill reads.distribution: for each abundance-ratio bucket, the
        per-point number of reads of clones whose ratio of the segmented
        reads reaches that bucket."""
        ranges = [.1, .01, .001, .0001, .00001, .000001, .0000001]
        result = [[0 for col in range(len(self.d['reads'].d["segmented"]))] for row in range(len(ranges))]

        for clone in self:
            for i, s in enumerate(clone.d["reads"]):
                if self.d['reads'].d['segmented'][i] > 0:
                    # find the first (largest) bucket reached by this clone;
                    # note that 'r' deliberately leaks out of the inner loop
                    # (it is the last index when no bucket is reached)
                    for r in range(len(ranges)):
                        if s*1. / self.d['reads'].d['segmented'][i] >= ranges[r]:
                            break
                    result[r][i] += s

        for r in range(len(ranges)):
            # keys such as '0.1', '0.01', ... (trailing zeros stripped)
            ratio_in_string = '{0:.10f}'.format(ranges[r]).rstrip('0')
            self.d['reads'].d['distribution'][ratio_in_string] = result[r]
            
        #TODO V/D/J distrib and more 
        
        
520
521
522
    ### save / load to .json
    def save_json(self, output):
        '''save ListWindows in .json format''' 
        print("==>", output)
        with open(output, "w") as handle:
            json.dump(self, handle, indent=2, default=self.toJson)
526

527
    def load(self, file_path, *args, **kwargs):
        '''Dispatch to the right parser: .clntab files go to load_clntab,
        anything else to the .vidjil JSON parser.'''
        if '.clntab' in file_path:
            self.load_clntab(file_path, *args, **kwargs)
        else:
            self.load_vidjil(file_path, *args, **kwargs)
532
533
534
535
536
537

    def loads(self, string, *args, **kwargs):
        '''Parse a .vidjil JSON string (delegates to loads_vidjil).'''
        self.loads_vidjil(string, *args, **kwargs)

    def init_data(self, data):
        '''Adopt the dict of a freshly-parsed ListWindows and normalize it:
        null clones, missing diversity and missing reads.distribution get
        default values; clone-id length statistics are printed.'''
        self.d=data.d
        # Be robust against 'null' values for clones
        if not self.d["clones"]:
            self.d["clones"] = []

        # wrap (or create) the diversity indices
        if "diversity" in self.d.keys():
            self.d["diversity"] = Diversity(self.d["diversity"])
        else:
            self.d["diversity"] = Diversity()
        
        if 'distribution' not in self.d['reads'].d:
            self.d['reads'].d['distribution'] = {}

        # histogram of clone id lengths, printed for diagnostic purposes
        self.id_lengths = defaultdict(int)

        print("%%")
        for clone in self:
            self.id_lengths[len(clone.d['id'])] += 1
        print ("%% lengths .vidjil -> ", self.id_lengths)
        try:
            print("%% run_v ->", self.d["samples"].d["producer"], self.d["samples"].d["run_timestamp"])
        except KeyError:
            # producer/run_timestamp are optional in older files
            pass

    def load_vidjil(self, file_path, pipeline, verbose=True):
        '''init listWindows with data from a .vidjil (JSON) file,
        then record the file modification time as the sample timestamp.
        When *pipeline* is true, the printed name is shortened to the
        pipeline-internal directory name.
        (The unused local 'extension', and the stale docstring claim about
        extension-based parser selection -- that dispatch lives in load() --
        were removed.)'''

        if verbose:
            print("<==", file_path, "\t", end=' ')

        with open(file_path, "r") as f:
            self.init_data(json.load(f, object_hook=self.toPython))
            self.check_version(file_path)

        if pipeline: 
            # renaming, private pipeline
            f = '/'.join(file_path.split('/')[2:-1])
            if verbose:
                print("[%s]" % f)
        else:
            f = file_path
            if verbose:
                print()

        time = os.path.getmtime(file_path)
        self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")]
586

587
588
589
    def loads_vidjil(self, string, pipeline, verbose=True):
        '''init listWindows with a json string
        (*pipeline* and *verbose* are accepted for symmetry with load_vidjil
        but are not used here)'''
        self.init_data(json.loads(string, object_hook=self.toPython))
590

Marc Duez's avatar
Marc Duez committed
591
592
593
    def getTop(self, top):
        """Return the ids of the clones whose 'top' rank is <= *top*."""
        return [clone.d["id"] for clone in self if clone.d["top"] <= top]
        
    def filter(self, f):
        """Keep (in place) only the clones whose id is listed in *f*,
        preserving their current order."""
        clones = self.d["clones"]

        # map each clone id to its position (last position wins on duplicates,
        # as before)
        position_of = {}
        for index, clone in enumerate(clones):
            position_of[clone.d["id"]] = index

        kept_positions = [pos for cid, pos in position_of.items() if cid in f]
        self.d["clones"] = [clones[pos] for pos in kept_positions]
Marc Duez's avatar
Marc Duez committed
613
        
614
615
616
617
    ### 
    def __add__(self, other): 
        '''Combine two ListWindows into a unique ListWindows.
        Clones are fused by id; sample, read and diversity data are
        concatenated; 'distributions' are merged when both operands carry
        them (and skipped when either lacks them).
        Raises ValueError when the same file name appears in both
        distributions. (Previously this check could never fire: a bare
        string was raised -- itself a TypeError -- and a bare "except: pass"
        swallowed everything.)'''
        obj = ListWindows()

        l1 = len(self.d["reads"].d['segmented'])
        l2 = len(other.d["reads"].d['segmented'])

        concatenate_with_padding(obj.d, 
                                 self.d, l1,
                                 other.d, l2,
                                 ["clones", "links", "germlines",
                                  "vidjil_json_version"])
        
        obj.d["clones"] = self.fuseWindows(self.d["clones"], other.d["clones"], l1, l2)
        obj.d["samples"] = self.d["samples"] + other.d["samples"]
        obj.d["reads"] = self.d["reads"] + other.d["reads"]
        obj.d["diversity"] = self.d["diversity"] + other.d["diversity"]

        # Merge 'distributions' only when both sides provide them.
        try:
            repertoires1 = self.d["distributions"]["repertoires"]
            repertoires2 = other.d["distributions"]["repertoires"]
        except KeyError:
            repertoires1 = None

        if repertoires1 is not None:
            ### Verify that same file is not present twice
            duplicates = set(repertoires1) & set(repertoires2)
            if duplicates:
                raise ValueError("Error, duplicate file name (in distributions): %s"
                                 % ", ".join(sorted(duplicates)))

            ### Append distributions of each files
            obj.d["distributions"] = {
                "repertoires": {},
                "keys":        ["clones", "reads"],
                "filters":     {},
                "categories":  {},
            }
            obj.d["distributions"]["repertoires"].update(repertoires1)
            obj.d["distributions"]["repertoires"].update(repertoires2)

        return obj
        
655
656
657
658
659
660
    ###
    def __mul__(self, other):
        '''In-place merge of two ListWindows sharing the same points:
        read counts are summed pointwise, clone lists concatenated.
        NOTE(review): uses the keys 'reads_segmented' and 'system_segmented',
        which __init__ does not create -- presumably only valid on data in an
        older json layout; confirm before use.'''
        for i in range(len(self.d["reads_segmented"])):
            self.d["reads_segmented"][i] += other.d["reads_segmented"][i] 
        
        self.d["clones"] += other.d["clones"]
        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        
        self.d["system_segmented"].update(other.d["system_segmented"])
        
        return self
        
668
    ### 
    def fuseWindows(self, w1, w2, l1, l2) :
        """Fuse two clone lists on their 'id'.
        A clone present on one side only is completed with an empty Window
        sized for the other side (*l1*/*l2* points); the result is sorted
        by 'top'."""
        # index both lists by clone id
        by_id_left  = {clone.d["id"]: clone for clone in w1}
        by_id_right = {clone.d["id"]: clone for clone in w2}

        # concat clones sharing an id; pad the missing side with an
        # empty Window of the right length
        fused = {}
        for cid, left in by_id_left.items():
            if cid in by_id_right:
                fused[cid] = left + by_id_right[cid]
            else:
                fused[cid] = left + Window(l2)
        for cid, right in by_id_right.items():
            if cid not in by_id_left:
                fused[cid] = Window(l1) + right

        # sort by 'top' (stable, so insertion order breaks ties as before)
        return sorted(fused.values(), key=lambda clone: clone.d["top"])
        
    
709
710
    def cut(self, limit, nb_points):
        '''Remove information from sequence/windows who never enter in the most represented sequences. Put this information in 'other' windows.'''
        kept = [clone for clone in self if int(clone.d["top"]) < limit or limit == 0]

        # aggregation of the discarded clones is currently disabled
        others = OtherWindows(nb_points)
        #for clone in self:
        #    if clone not kept: others += clone

        self.d["clones"] = kept #+ list(others) 

        print("### Cut merged file, keeping window in the top %d for at least one point" % limit)
        return self
        
727
    def load_clntab(self, file_path, *args, **kwargs):
        '''Parser for .clntab file: builds one Window per data row, ranks
        them by size into 'top', and sets the read totals.
        (The input file is now closed via a with-block; it was previously
        left open.)'''

        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        self.d["samples"].d["original_names"] = [file_path]
        self.d["samples"].d["producer"] = ["EC-NGS central pipeline"]
        
        listw = []
        listc = []
        total_size = 0
        
        with open(file_path,"r") as fichier:
            for ligne in fichier:
                if "clonotype" in ligne:
                    # header row: remember the column names
                    header_map = ligne.replace('\n', '').split('\t')
                else :
                    tab = AccessedDict()
                    for index, data in enumerate(ligne.split('\t')):
                        tab[header_map[index]] = data
                            
                    w=Window(1)
                    w.d["seg"] = {}
                    
                    #w.window=tab["sequence.seq id"] #use sequence id as window for .clntab data
                    w.d["id"]=tab["sequence.raw nt seq"] #use sequence as window for .clntab data
                    s = int(tab["sequence.size"])
                    total_size += s
                    w.d["reads"] = [ s ]

                    w.d["germline"] = tab["sequence.V-GENE and allele"][:3] #system ...
                    
                    w.d["sequence"] = tab["sequence.raw nt seq"]
                    w.d["seg"]["5"]=tab["sequence.V-GENE and allele"].split('=')[0]
                    if (tab["sequence.D-GENE and allele"] != "") :
                        w.d["seg"]["4"]=tab["sequence.D-GENE and allele"].split('=')[0]
                    w.d["seg"]["3"]=tab["sequence.J-GENE and allele"].split('=')[0]

                    # use sequence.JUNCTION to colorize output (this is not exactly V/J !)
                    junction = tab.get("sequence.JUNCTION.raw nt seq")
                    position = w.d["sequence"].find(junction)
                    
                    if position >= 0:
                        w.d["seg"]["3start"] = position + len(junction)
                        w.d["seg"]["5end"] = position 
                    else:
                        w.d["seg"]["3start"] = 0
                        w.d["seg"]["5end"] = len(w.d["sequence"])
                    w.d["seg"]["3end"]=0
                    w.d["seg"]["5start"]=0
                    w.d["seg"]["4end"]=0
                    w.d["seg"]["4start"]=0
                    w.d["seg"]["cdr3"] = tab["sequence.JUNCTION.raw nt seq"][3:-3]
                        
                    listw.append((w , w.d["reads"][0]))

                    raw_clonotype = tab.get("clonotype")
                    w.d["name"]=raw_clonotype # w.d["seg"]["5"] + " -x/y/-z " + w.d["seg"]["3"]

                    clonotype = raw_clonotype.split(' ')
                    if (len(clonotype) > 1) :
                        listc.append((w, raw_clonotype))
                    
                    #keep data that has not already been stored
                    for header in tab.not_accessed_keys():
                        w.d["_"+header] = [tab[header]]

        #sort by sequence_size
        listw = sorted(listw, key=itemgetter(1), reverse=True)
        #sort by clonotype
        listc = sorted(listc, key=itemgetter(1))
        
        #generate data "top"
        for index in range(len(listw)):
            listw[index][0].d["top"]=index+1
            self.d["clones"].append(listw[index][0])
        
        self.d["reads"].d["segmented"] = [total_size]
        self.d["reads"].d["total"] = [total_size]
805
806
807
808

        
    def toJson(self, obj):
        '''Serializer for json module: our wrapper classes serialize as a
        shallow copy of their underlying 'd' dict; any other object reaching
        here is copied as a mapping.
        (The unreachable "raise TypeError" statements that followed each
        return in the original were removed.)'''
        if isinstance(obj, (ListWindows, Window, Samples, Reads, Diversity)):
            return dict(obj.d)

        return {key: obj[key] for key in obj}
824
825
826

    def toPython(self, obj_dict):
        '''Reverse serializer for json module.

        Given a decoded JSON dictionary, wrap it into the project object
        identified by its marker key; dictionaries carrying no known
        marker are returned unchanged. The marker order below matters:
        the first match wins.
        '''
        markers = (("samples", ListWindows),
                   ("id", Window),
                   ("total", Reads),
                   ("original_names", Samples))
        for marker, klass in markers:
            if marker in obj_dict:
                # Window's constructor takes an initial size; others do not
                wrapped = klass(1) if klass is Window else klass()
                wrapped.d = obj_dict
                return wrapped
        return obj_dict
    def get_filename_pos(self, filename):
        """Return the sample index (timepoint) of `filename`.

        `filename` is a key of the distributions repertoires; its
        position in the samples' original_names list is the timepoint.
        """
        original_names = self.d["samples"].d["original_names"]
        return original_names.index(filename)


    # ========================= #
    #  Distributions computing  #
    # ========================= #
    def init_distrib(self, list_distrib):
        """Create empty distributions structures.

        For each sample of the jlist, pre-fill one empty entry per
        requested distribution (`list_distrib` is a list of axis lists).
        """
        self.d["distributions"] = {}
        self.d["distributions"]["repertoires"] = defaultdict(list)
        self.d["distributions"]["keys"]        = ["clones", "reads"]
        self.d["distributions"]["filters"]     = []
        # Warning, will fail if there are identical original_names; fixme

        # prefill distribution for each file of jlist
        for filename in self.d["samples"].d["original_names"]:
            for distrib in list_distrib:
                self.d["distributions"]["repertoires"][filename].append(
                    {"axes": distrib, "values": defaultdict(lambda: False)})
        return


    def compute_distribution(self, list_distrib):
        """Fill the distributions structures for every sample.

        `list_distrib` is the list of distributions (axis lists) to
        compute; clones with no read at a given timepoint are skipped.
        """
        for filename in self.d["samples"].d["original_names"]:
            timepoint = self.get_filename_pos(filename)
            repertoire = self.d["distributions"]["repertoires"][filename]
            for clone in self.d["clones"]:
                # Skip clones absent from this sample
                if clone.d["reads"][timepoint] == 0:
                    continue
                for distrib_pos in range(len(repertoire)):
                    values = clone.get_axes_values(list_distrib[distrib_pos], timepoint)
                    nb_reads = clone.get_reads(self.get_filename_pos(filename))
                    self.add_clone_values(filename, distrib_pos, values, nb_reads)
        return


    def add_clone_values(self, filename, distrib_pos, values, nb_reads):
        """Fold one clone's values into one distribution of one sample."""
        repertoire = self.d["distributions"]["repertoires"][filename]
        current = repertoire[distrib_pos]["values"]
        # Descend/allocate along the axis values by recursion
        repertoire[distrib_pos]["values"] = self.recursive_add(current, values, nb_reads)
        return

    def recursive_add( self, obj, values, nb_reads):
        """Recursively merge one clone's counts into `obj`; return `obj`.

        `values` is the list of axis values identifying the leaf; each
        leaf holds a pair [number of clones, number of reads].
        """
        head, tail = values[0], values[1:]
        if tail:
            # intermediate depth: descend one level
            obj[head] = self.recursive_add(obj[head], tail, nb_reads)
            return obj

        # leaf depth: allocate the container and the counter pair on demand
        if not obj:
            obj = defaultdict(lambda: False)
        if not obj[head]:
            obj[head] = [0, 0]
        obj[head][0] += 1
        obj[head][1] += nb_reads
        return obj

    def save_distributions(self, foname):
        """Write the computed distributions to file `foname` as indented JSON."""
        # Compute aggreg just before export
        # Use a context manager so the file is closed even if json.dump raises
        with open(foname, "w") as fo:
            json.dump(self.d["distributions"], fo, indent=2)
        return
### some data used for test
w1 = Window(1)
922
w1.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
923
w2 = Window(1)
924
w2.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
925
w3 = Window(1)
926
w3.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
927
w4 = Window(1)
928
w4.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
929
930
931


w5 = Window(1)
932
w5.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
933
w6 = Window(1)
934
w6.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
935

936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
seg_w7 = {
  "germline": "TRA",
  "id": "TTCTTACTTCTGTGCTACGGACGCCGGGGCTCAGGAACCTACAAATACAT",
  "name": "TRAV17*01 0/CCGGGG/5 TRAJ40*01",
  "reads": [
    16
  ],
  "seg": {
    "3": {
      "delLeft": 5,
      "name": "TRAJ40*01",
      "start": 112
    },
    "5": {
      "delRight": 0,
      "name": "TRAV17*01",
      "stop": 105
    },
    "N": 6,
    "cdr3": {
      "aa": "ATDAG#SGTYKYI",
      "start": 96,
      "stop": 133
    }
  },
  "sequence": "GTGGAAGATTAAGAGTCACGCTTGACACTTCCAAGAAAAGCAGTTCCTTGTTGATCACGGCTTCCCGGGCAGCAGACACTGCTTCTTACTTCTGTGCTACGGACGCCGGGGCTCAGGAACCTACAAATACATCTTTGGAACAG",
  "top": 26
}

965
lw1 = ListWindows()
966
lw1.d["timestamp"] = 'ts'
967
lw1.d["reads"] = json.loads('{"total": [30], "segmented": [25], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
968
969
lw1.d["clones"].append(w5)
lw1.d["clones"].append(w6)
970
971
lw1.d["diversity"] = Diversity()

972
973

w7 = Window(1)
974
w7.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
975
w8 = Window(1)
976
w8.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
977
978

lw2 = ListWindows()
979
lw2.d["timestamp"] = 'ts'
980
lw2.d["reads"] = json.loads('{"total": [40], "segmented": [34], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
981
982
lw2.d["clones"].append(w7)
lw2.d["clones"].append(w8)
983
lw2.d["diversity"] = Diversity()
984
985

    
986
987
988
989
990
def exec_command(command, directory, input_file):
    '''
    Execute the program `command`, located in `directory` (relative to
    this script), on `input_file`. The executable must exist in this
    directory. No path changes are allowed in `command`, so the command
    cannot escape the expected directory.

    Returns the output filename (a .vidjil).

    Raises AssertionError if `command` contains a path separator.
    '''
    assert (not os.path.sep in command), "No {} allowed in the command name".format(os.path.sep)
    # Create the output file now, so its unique name can be handed to the command
    ff = tempfile.NamedTemporaryFile(suffix='.vidjil', delete=False)
    ff.close()  # only the name is needed; the command writes the content

    basedir = os.path.dirname(os.path.abspath(sys.argv[0]))
    command_fullpath = os.path.join(basedir, directory, command)
    argv = [command_fullpath, os.path.abspath(input_file), ff.name]
    print(' '.join(quote(arg) for arg in argv))
    # Run without a shell: arguments are passed verbatim, no quoting/injection issues
    subprocess.call(argv)
    print()
    return ff.name
def main():
Mathieu Giraud's avatar
Mathieu Giraud committed
1008
    print("#", ' '.join(sys.argv))
1009
1010
1011
1012
1013
1014
1015

    DESCRIPTION = 'Vidjil utility to parse and regroup list of clones of different timepoints or origins'
    
    #### Argument parser (argparse)

    parser = argparse.ArgumentParser(description= DESCRIPTION,
                                    epilog='''Example:
Mathieu Giraud's avatar
Mathieu Giraud committed
1016
  python2 %(prog)s --germline IGH out/Diag/vidjil.data out/MRD-1/vidjil.data out/MRD-2/vidjil.data''',
1017
1018
1019
1020
1021
1022
                                    formatter_class=argparse.RawTextHelpFormatter)


    group_options = parser.add_argument_group() # title='Options and parameters')

    group_options.add_argument('--test', action='store_true', help='run self-tests')
1023
    group_options.add_argument('--multi', action='store_true', help='merge different systems from a same timepoint (deprecated, do not use)')
1024
1025
    group_options.add_argument('--distributions', '-d', action='store_true', help='compute distributions')
    group_options.add_argument('--only_disributions', '-D', action='store_true', help='export only distributions')
1026
    
1027
    group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
1028
    group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')
1029
    group_options.add_argument('--ijson', action='store_true', help='use the ijson vidjilparser')
1030

Marc Duez's avatar
Marc Duez committed
1031
    group_options.add_argument('--output', '-o', type=str, default='fused.vidjil', help='output file (%(default)s)')
1032
1033
    group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)')

1034
    group_options.add_argument('--first', '-f', type=int, default=0, help='take only into account the first FIRST files (0 for all) (%(default)s)')
1035

Mathieu Giraud's avatar
Mathieu Giraud committed
1036
    group_options.add_argument('--pre', type=str,help='pre-process program (launched on each input .vidjil file) (needs defs.PRE_PROCESS_DIR)')
1037

1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
    parser.add_argument('file', nargs='+', help='''input files (.vidjil/.cnltab)''')
  
    args = parser.parse_args()
    # print args

    if args.test:
        import doctest
        doctest.testmod(verbose = True)
        sys.exit(0)

    jlist_fused = None

Thonier Florian's avatar
Thonier Florian committed
1050
1051
    LIST_DISTRIBUTIONS   = []
    LIST_AXIS = ["germline",
1052
        "seg5", "seg4", "seg3",
Thonier Florian's avatar
Thonier Florian committed
1053
        "lenSeq", "lenCDR3",  
1054
1055
1056
        "lenSeqConsensus", "lenSeqAverage", "GCContent", "coverage",
        "seg5_delRight", "seg3_delLeft", "seg4_delRight", "seg3_delLeft",
        "insert_53", "insert_54", "insert_43",
Thonier Florian's avatar
Thonier Florian committed
1057
        "productive", 
1058
        "rearrangement", "complete",
Thonier Florian's avatar
Thonier Florian committed
1059
1060
1061
1062
1063
        # "evalue", l'arrondir ?
        # "cdr3_stop", "cdr3_start", 
        #"junction_start", "junction_stop",
        #"seg5_stop", "seg3_start", "seg4_stop", "seg4_start",
        # "top", # "name"
1064
    ]
Thonier Florian's avatar
Thonier Florian committed
1065
1066
1067
1068
1069
1070

    for axis1 in LIST_AXIS:
        LIST_DISTRIBUTIONS.append([axis1])
        for axis2 in LIST_AXIS:
            if axis1 != axis2:
                LIST_DISTRIBUTIONS.append([axis1, axis2])
1071
    
Mathieu Giraud's avatar
Mathieu Giraud committed
1072
1073
    print("### fuse.py -- " + DESCRIPTION)
    print()
1074

1075
1076
1077
1078
1079
1080
    files = args.file
    if args.first:
        if len(files) > args.first:
            print("! %d files were given. We take into account the first %d files." % (len(files), args.first))
        files = files[:args.first]

1081
1082
1083
1084
    if args.pre:
        print("Pre-processing files...")
        pre_processed_files = []
        for f in files:
1085
1086
            out_name = exec_command(args.pre, PRE_PROCESS_DIR, f)
            pre_processed_files.append(out_name)
1087
1088
        files = pre_processed_files

Marc Duez's avatar
Marc Duez committed
1089
1090
    #filtre
    f = []
1091

1092
1093
1094
1095
1096
    if args.ijson:
        from vidjilparser import VidjilParser
        vparser = VidjilParser()
        vparser.addPrefix('clones.item', 'clones.item.top', le, args.top)

1097
    for path_name in files:
1098
1099
1100
        if args.ijson:
            json_clones = vparser.extract(path_name)
            clones = json.loads(json_clones)
1101
1102
            if clones["clones"] is not None:
                f += [c['id'] for c in clones["clones"]]
1103
1104
1105
1106
1107
        else:
            jlist = ListWindows()
            jlist.load(path_name, args.pipeline)
            f += jlist.getTop(args.top)

Marc Duez's avatar
Marc Duez committed
1108
1109
    f = sorted(set(f))
    
1110
1111
1112
1113
1114
    if args.ijson:
        vparser.reset()
        vparser.addPrefix('')
        vparser.addPrefix('clones.item', 'clones.item.id', (lambda x, y: x in y), f)

1115
    if args.multi:
1116
        for path_name in files:
1117
            jlist = ListWindows()
1118
1119
1120
1121
1122
1123
            if args.ijson:
                json_reads = vparser.extract(path_name)
                jlist.loads(json_reads, args.pipeline)
            else:
                jlist.load(path_name, args.pipeline)
                jlist.build_stat()
1124
            
Mathieu Giraud's avatar
Mathieu Giraud committed
1125
            print("\t", jlist, end=' ')
1126
1127
1128
1129
1130

            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused * jlist
1131
                
Mathieu Giraud's avatar
Mathieu Giraud committed
1132
            print('\t==> merge to', jlist_fused)
1133
        jlist_fused.d["system_segmented"] = ordered(jlist_fused.d["system_segmented"], key=lambda sys: ((GERMLINES_ORDER + [sys]).index(sys), sys))
1134
        
1135
    else:
Mathieu Giraud's avatar
Mathieu Giraud committed
1136
        print("### Read and merge input files")
1137
        for path_name in files:
1138
            jlist = ListWindows()
1139
1140
1141
1142
1143
1144
1145
            if args.ijson:
                json_reads = vparser.extract(path_name)
                jlist.loads(json_reads, args.pipeline)
            else:
                jlist.load(path_name, args.pipeline)
                jlist.build_stat()
                jlist.filter(f)
Marc Duez's avatar
Marc Duez committed
1146

1147
            
Mathieu Giraud's avatar
Mathieu Giraud committed
1148
            print("\t", jlist, end=' ')
1149
1150
1151
1152
1153
            # Merge lists
            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused + jlist
1154
            
Mathieu Giraud's avatar
Mathieu Giraud committed
1155
            print('\t==> merge to', jlist_fused)
1156

1157
1158
    if args.distributions or args.only_disributions:
        print("### Compute distributions")