fuse.py 39.5 KB
Newer Older
Ryan Herbert's avatar
Ryan Herbert committed
1
#!/usr/bin/env python3
2
3
4
5
# -*- coding: utf-8 -*-

### fuse.py - Vidjil utility to parse and regroup list of clones of different timepoints or origins

6
7
#  This file is part of Vidjil <http://www.vidjil.org>,
#  High-throughput Analysis of V(D)J Immune Repertoire.
8
#  Copyright (C) 2011-2017 by Bonsai bioinformatics
9
10
11
12
13
#  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
#  Contributors: 
#      Marc Duez <marc.duez@vidjil.org>
#      Mathieu Giraud <mathieu.giraud@vidjil.org>
#      The Vidjil Team <contact@vidjil.org>
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#
#  "Vidjil" is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  "Vidjil" is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>

Mathieu Giraud's avatar
Mathieu Giraud committed
28
from __future__ import print_function
29

30
import check_python_version
31
import sys
32
33
import json
import argparse
34
import os.path
Mikaël Salson's avatar
Mikaël Salson committed
35
import os
Marc Duez's avatar
Marc Duez committed
36
import datetime
37
import subprocess
38
import tempfile
39
from operator import itemgetter, le
40
from utils import *
41
from defs import *
42
from collections import defaultdict
43
from pipes import quote
44

45
FUSE_VERSION = "vidjil fuse"
46

47
TOOL_SIMILARITY = "../algo/tools/similarity"
48
SIMILARITY_LIMIT = 100
49

50
51
GERMLINES_ORDER = ['TRA', 'TRB', 'TRG', 'TRD', 'DD', 'IGH', 'DHJH', 'IJK', 'IJL'] 

52
53
54
####

class Window:
55
    # Should be renamed "Clone"
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
    '''storage class for sequence informations
    with some function to group sequence informations
    
    >>> str(w1)
    '<window : [5] 3 aaa>'
    
    >>> str(w3)
    '<window : [8] 4 aaa>'
    
    >>> str(w1 + w3)
    '<window : [5, 8] 3 aaa>'
    
    
    check if other information are conserved
    
    >>> (w2 + w4).d["test"]
    [0, 'plop']
    
    '''
    
    ### init Window with the minimum data required
    def __init__(self, size):
        self.d={}
79
        self.d["id"] = ""
Mathieu Giraud's avatar
Mathieu Giraud committed
80
        self.d["top"] = sys.maxsize
81
        self.d["reads"] = []
82
        for i in range(size):
83
            self.d["reads"].append(0)
84
85
86
        
        
    ### 
87
88
    def __iadd__(self, other):
        ### Not used now
89
        """Add other.reads to self.reads in-place, without extending lists"""
90

91
92
        assert(len(self.d['reads']) == len(other.d['reads']))
        self.d['reads'] = [my + her for (my, her) in zip(self.d['reads'], other.d['reads'])]
93
94
95

        return self
        
96
    def __add__(self, other):
97
        """Concat two windows, extending lists such as 'reads'"""
98
        #data we don't need to duplicate
99
        myList = [ "seg", "top", "id", "sequence", "name", "id", "stats", "germline"]
100
101
        obj = Window(1)
        
102
103
104
105
106
        # 'id' and 'top' will be taken from 'topmost' clone
        del obj.d["id"]
        del obj.d["top"]

        # Data of type 'list'
107
108
109
110
        concatenate_with_padding(obj.d,
                                 self.d, len(self.d["reads"]),
                                 other.d, len(other.d["reads"]),
                                 myList)
111
                    
112
113
114
115
116
117
118
119
120
        # All other data, including 'top'
        # When there are conflicting keys, keep data from the 'topmost' clone
        order = [other, self] if other.d["top"] < self.d["top"] else [self, other]

        for source in order:
            for key in source.d:
                if key not in obj.d:
                    obj.d[key] = source.d[key]

121
122
        return obj
        
123
124
    def get_nb_reads(self, cid, point=0):
        return self[cid]["reads"][point]
125

126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
    def get_reads(self, t=-1):
        if t== -1:
            return self.d["reads"]
        else:
            return self.d["reads"][t]

    def is_segmented(self):
        if not "seg" in self.d.keys():
            return False
        return True


    def get_axes_values(self, axes, timepoint):
        """Return the values of given axes for this clone at a gvien timepoint"""
        values = []
        for axis in axes:
            values.append(self.get_values(axis, timepoint))
        return values


    def get_values(self, axis, timepoint=0):
        """Return the value of an axis for this clone at a given timepoint"""

        axes = {
            "top":      ["top"],
            "germline": ["germline"],
            "name":     ["name"],
            "seg5":     ["seg",  "5", "name"],
            "seg4":     ["seg",  "4", "name"],
            "seg4a":    ["seg", "4a", "name"],
            "seg4b":    ["seg", "4b", "name"],
            "seg3":     ["seg",  "3", "name"],
            "evalue":   ["evalue","val"],
            "seg5_delRight": ["seg","5","delRight"],
            "seg3_delLeft":  ["seg","3","delLeft"],
            "seg4_delRight": ["seg","4","delRight"],
            "seg3_delLeft":  ["seg","4","delLeft"],
            ### Positions
            "seg5_stop":  ["seg","5","stop"],
            "seg3_start": ["seg","3","start"],
            "seg4_stop":  ["seg","4","stop"],
            "seg4_start": ["seg","4","start"],
            "cdr3_stop":  ["seg","cdr3","stop"],
            "cdr3_start": ["seg","cdr3","start"],
            "junction_stop":  ["seg","junction","stop"],
            "junction_start": ["seg","junction","start"],
            "productive":     ["seg","junction","productive"],
            "lenSeqAverage" : ["_average_read_length"]
        }

        ### length
        axes_length = {
            "insert_53": ["seg3_start", "seg5_stop", -1],
            "insert_54": ["seg4_start", "seg5_stop", -1],
            "insert_43": ["seg3_start", "seg4_stop", -1],
            "lenCDR3": ["cdr3_stop", "cdr3_start", 0],
            "lenJunction": ["junction_stop", "junction_start", 0]
        }

        try:
            if axis in axes.keys():
                path  = axes[axis]
                depth = 0
                value = self.d
                if path[0] == "seg" and not self.is_segmented():
                    return "?"

                while depth != len(path):
                    value  = value[path[depth]]
                    depth += 1
                if type(value) is list:
                    # In these cases, should be a list of value at different timepoint
                    return value[timepoint]
                return value

            ############################################################
            ### These axes need to make some computation on clone fields
            if axis == "lenSeq":
                return len(self.d["sequence"])

            if axis in axes_length.keys():
                path = axes_length[axis]
                if path[0] == "seg" and not self.is_segmented():
                    return "not_segmented"
                return self.get_values(path[0]) - self.get_values(path[1]) + path[2]


            if axis == "GCContent":
                return self.getGCcontent()

            # FIXME: need to get the rigth position value
            # If the clone have multiple sample, need to select the current.
            # Should not apply if executed from the vidjil server, without previous fused files
            if axis == "lenSeqConsensus":
                return self.get_values("lenSeqAverage")/ self.get_values("lenSeq")
            if axis == "coverage":
                return round(self.d["_coverage"][0]*100, 2)

            if axis == "complete":
                germline = self.get_values("germline")
                if "+" in germline:
                    return False
                elif germline in ["unexpected", "IKZF1", "ERG"]:
                    # maybe more specific later ?
                    return germline
                elif germline in ["IGH","IGK","IGL","TRA","TRB","TRD","TRG"]:
                    # List can be set if new complete germline is defined
                    return True
                else:
                    return "unexpected"
            if axis == "rearrangement":
                return self.get_rearrangement()

            return "unknow_axis"
        except:
            return "?"

    def getGCcontent(self):
        """ Compute and retur nthe GC ratio of a sequence """
        if len(self.d["sequence"]) == 0:
            return "?"

        gc = 0
        for nt in self.d["sequence"]:
            if nt in "GgCc":
                gc += 1

        GCContent = gc / len(self.d["sequence"])
        # return rounded value for better association of clones
        return round( GCContent, 2)


    def get_rearrangement( self):
        """Get the rearrangement type of this clone"""
        if not self.is_segmented():
            return "?"

        complete = self.get_values("complete")
        seg_vals = {
            "5":  self.get_values("seg5"),
            "4a": self.get_values("seg4a"),
            "4":  self.get_values("seg4"),
            "4b": self.get_values("seg4b"),
            "3":  self.get_values("seg3")
        }

        if complete == True:
            if seg_vals["4a"] == "?" and seg_vals["4"] == "?" and seg_vals["4b"] == "?":
                return "VJ"
            elif seg_vals["4a"] == "?" and seg_vals["4b"] == "?":
                return "VDJ"
            elif seg_vals["4a"] == "?" or seg_vals["4b"] == "?":
                return "VDDJ"
            else:
                return "VDDDJ"

        elif complete == False:
            # cas compliqué, DJ etant un 53, DD aussi et VD aussi
            if "V" in seg_vals["5"] and "D" in seg_vals["3"]:
                return "VD"
            elif "D" in seg_vals["5"] and "J" in seg_vals["3"]:
                return "DJ"
            elif "D" in seg_vals["5"] and "D" in seg_vals["3"]:
                return "DD"

        elif complete in ["unexpected", "IKZF1", "ERG"]:
            return complete
        else:
            return "?"


297
    def latex(self, point=0, base_germline=10000, base=10000, tag=''):
298
        reads = self.d["reads"][point]
299
        ratio_germline = float(reads)/base_germline
300
        ratio = float(reads)/base
301
        return r"   &   & %7d & %5.2f\%% & of %-4s & %5.2f\%% & %-50s \\ %% %4s %s" % (reads, ratio_germline * 100, self.d['germline'], ratio * 100,
302
303
                                                                  self.d["name"] if 'name' in self.d else self.d["id"],
                                                                  tag, self.d["id"])
304

305
306
    ### print essential info about Window
    def __str__(self):
Mathieu Giraud's avatar
Mathieu Giraud committed
307
        return "<window : %s %s %s>" % ( self.d["reads"], '*' if self.d["top"] == sys.maxsize else self.d["top"], self.d["id"])
308
        
Marc Duez's avatar
Marc Duez committed
309
class Samples: 
310

Marc Duez's avatar
Marc Duez committed
311
312
    def __init__(self):
        self.d={}
313
        self.d["number"] = 1
Marc Duez's avatar
Marc Duez committed
314
315
316
317
        self.d["original_names"] = [""]

    def __add__(self, other):
        obj=Samples()
318

319
320
321
322
323
        concatenate_with_padding(obj.d, 
                                 self.d, self.d['number'], 
                                 other.d, other.d['number'],
                                 ['number'])

324
        obj.d["number"] =  int(self.d["number"]) + int(other.d["number"])
Marc Duez's avatar
Marc Duez committed
325
326
        
        return obj
327

328
329
330
    def __str__(self):
        return "<Samples: %s>" % self.d

331
332
class Diversity: 

333
334
    keys = ["index_H_entropy", "index_E_equitability", "index_Ds_diversity"]

335
336
337
    def __init__(self, data=None):
        self.d={}

338
339
340
341
342
        for k in self.keys:
            if data == None or not (k in data):
                self.d[k] = ["na"]
            else:
                self.d[k]= [data[k]]
343

344
345
346
    def __add__(self, other):
        for k in self.keys:
            self.d[k].append(other.d[k][0])
347
348
349
350
351
        return self

    def __str__(self):
        return "<Diversity: %s>" % self.d
        
352
353
354
355
356
357
358
class Reads: 

    def __init__(self):
        self.d={}
        self.d["total"] = ["0"]
        self.d["segmented"] = ["0"]
        self.d["germline"] = {}
359
        self.d['distribution'] = {}
360
361
362

    def __add__(self, other):
        obj=Reads()
363
364
365
366
367

        concatenate_with_padding(obj.d['germline'], 
                                 self.d['germline'], len(self.d['total']),
                                 other.d['germline'], len(other.d['total']),
                                 ['total'])
368
369
370
371
        concatenate_with_padding(obj.d['distribution'],
                                 self.d['distribution'], len(self.d['total']),
                                 other.d['distribution'], len(other.d['total']),
                                 ['total'])
372
373
374
375
                
        obj.d["total"] = self.d["total"] + other.d["total"]
        obj.d["segmented"] = self.d["segmented"] + other.d["segmented"]
        return obj
376
377
378

    def __str__(self):
        return "<Reads: %s>" % self.d
379
        
380
381
382
383
class OtherWindows:
    
    """Aggregate counts of windows that are discarded (due to too small 'top') for each point into several 'others-' windows."""

384
385
    def __init__(self, length, ranges = None):
        self.ranges = ranges if ranges is not None else [1000, 100, 10, 1]
386
387
388
389
390
391
392
393
394
395
        self.length = length
        self.sizes = {}

        for r in self.ranges:
            self.sizes[r] = [0 for i in range(self.length)]


    def __iadd__(self, window):
        """Add to self a window. The different points may land into different 'others-' windows."""

396
        for i, s in enumerate(window.d["reads"]):
397
398
399
400
401
            for r in self.ranges:
                if s >= r:
                     break
            self.sizes[r][i] += s
            ## TODO: add seg_stat
402
403
            
        # print window, '-->', self.sizes
404
405
406
407
408
409
410
        return self

    def __iter__(self):
        """Yield the 'others-' windows"""

        for r in self.ranges:
            w = Window(self.length)
411
            w.d['reads'] = self.sizes[r]
412
413
414
            w.d['window'] = 'others-%d' % r
            w.d['top'] = 0

Mathieu Giraud's avatar
Mathieu Giraud committed
415
            print('  --[others]-->', w)
416
            yield w
417
        
418
class ListWindows(VidjilJson):
419
420
421
    '''storage class for sequences informations 
    
    >>> lw1.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
422
    <ListWindows: [25] 2>
423
424
425
426
    <window : [5] 3 aaa>
    <window : [12] 2 bbb>
    
    >>> lw2.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
427
    <ListWindows: [34] 2>
428
429
430
    <window : [8] 4 aaa>
    <window : [2] 8 ccc>
    
Mathieu Giraud's avatar
Mathieu Giraud committed
431
    >>> lw3 = lw1 + lw2
432
    >>> lw3.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
433
    <ListWindows: [25, 34] 3>
434
435
436
    <window : [12, 0] 2 bbb>
    <window : [5, 8] 3 aaa>
    <window : [0, 2] 8 ccc>
437
438
439
440
441
442
443
444
445
446
    >>> lw3.build_stat()
    >>> lw3.d['reads'].d['distribution']['0.1']
    [17, 8]
    >>> lw3.d['reads'].d['distribution']['0.01']
    [0, 2]
    >>> lw3.d['reads'].d['distribution']['0.001']
    [0, 0]
    >>> lw3.d['reads'].d['distribution']['0.0001']
    [0, 0]

447
448
449
450
451
    '''
    
    def __init__(self):
        '''init ListWindows with the minimum data required'''
        self.d={}
452
453
        self.d["samples"] = Samples()
        self.d["reads"] = Reads()
454
455
        self.d["clones"] = []
        self.d["clusters"] = []
456
        self.d["germlines"] = {}
457
        
458
459
460
461
        self.d["vidjil_json_version"] = VIDJIL_JSON_VERSION
        self.d["producer"] = FUSE_VERSION
        self.d["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
462
    def __str__(self):
463
464
465
466
467
468
469
470
471
472
473
474
475
476
        return "<ListWindows: %s %d>" % ( self.d["reads"].d["segmented"], len(self) )

    # Iterator and access functions
    def __iter__(self):
        return self.d["clones"].__iter__()

    def __getitem__(self, i):
        ### not efficient !
        for clone in self:
            if clone.d["id"] == i:
                return clone

    def __len__(self):
        return len(self.d["clones"])
477
478
479

    ### print info about each Windows stored 
    def info(self):
Mathieu Giraud's avatar
Mathieu Giraud committed
480
        print(self)
481
        for clone in self:
Mathieu Giraud's avatar
Mathieu Giraud committed
482
            print(clone)
483
        
484
485
    ### compute statistics about clones
    def build_stat(self):
486
        ranges = [.1, .01, .001, .0001, .00001, .000001, .0000001]
487
        result = [[0 for col in range(len(self.d['reads'].d["segmented"]))] for row in range(len(ranges))]
488

489
490
        for clone in self:
            for i, s in enumerate(clone.d["reads"]):
491
492
493
494
495
496
                if self.d['reads'].d['segmented'][i] > 0:
                    for r in range(len(ranges)):
                        if s*1. / self.d['reads'].d['segmented'][i] >= ranges[r]:
                            break
                    result[r][i] += s

497
        for r in range(len(ranges)):
498
499
            ratio_in_string = '{0:.10f}'.format(ranges[r]).rstrip('0')
            self.d['reads'].d['distribution'][ratio_in_string] = result[r]
500
501
502
503
            
        #TODO V/D/J distrib and more 
        
        
504
505
506
    ### save / load to .json
    def save_json(self, output):
        '''save ListWindows in .json format''' 
Mathieu Giraud's avatar
Mathieu Giraud committed
507
        print("==>", output)
508
509
        with open(output, "w") as f:
            json.dump(self, f, indent=2, default=self.toJson)
510

511
    def load(self, file_path, *args, **kwargs):
512
513
514
515
        if not '.clntab' in file_path:
            self.load_vidjil(file_path, *args, **kwargs)
        else:
            self.load_clntab(file_path, *args, **kwargs)
516
517
518
519
520
521

    def loads(self, string, *args, **kwargs):
        self.loads_vidjil(string, *args, **kwargs)

    def init_data(self, data):
        self.d=data.d
522
523
524
        # Be robust against 'null' values for clones
        if not self.d["clones"]:
            self.d["clones"] = []
525

526
527
528
529
        if "diversity" in self.d.keys():
            self.d["diversity"] = Diversity(self.d["diversity"])
        else:
            self.d["diversity"] = Diversity()
530
        
531
532
        if 'distribution' not in self.d['reads'].d:
            self.d['reads'].d['distribution'] = {}
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556

        self.id_lengths = defaultdict(int)

        print("%%")
        for clone in self:
            self.id_lengths[len(clone.d['id'])] += 1
        print ("%% lengths .vidjil -> ", self.id_lengths)
        try:
            print("%% run_v ->", self.d["samples"].d["producer"], self.d["samples"].d["run_timestamp"])
        except KeyError:
            pass

    def load_vidjil(self, file_path, pipeline, verbose=True):
        '''init listWindows with data file
        Detects and selects the parser according to the file extension.'''
        extension = file_path.split('.')[-1]

        if verbose:
            print("<==", file_path, "\t", end=' ')

        with open(file_path, "r") as f:
            self.init_data(json.load(f, object_hook=self.toPython))
            self.check_version(file_path)

557
558
559
        if pipeline: 
            # renaming, private pipeline
            f = '/'.join(file_path.split('/')[2:-1])
560
            if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
561
                print("[%s]" % f)
562
563
564

        else:
            f = file_path
565
            if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
566
                print()
567

Marc Duez's avatar
Marc Duez committed
568
        time = os.path.getmtime(file_path)
569
        self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")]
570

571
572
573
    def loads_vidjil(self, string, pipeline, verbose=True):
        '''init listWindows with a json string'''
        self.init_data(json.loads(string, object_hook=self.toPython))
574

Marc Duez's avatar
Marc Duez committed
575
576
577
    def getTop(self, top):
        result = []
        
578
579
580
        for clone in self:
            if clone.d["top"] <= top :
                result.append(clone.d["id"])
Marc Duez's avatar
Marc Duez committed
581
582
583
584
585
        return result
        
    def filter(self, f):
        r = []
        
Marc Duez's avatar
Marc Duez committed
586
        reverseList = {}
587
588
        for i,w in enumerate(self.d["clones"]) :
            reverseList[w.d["id"]] = i
Marc Duez's avatar
Marc Duez committed
589
590
591
        
        #filteredList = { k: reverseList[k] for k in f }
        filteredList = dict(filter(lambda t: t[0] in f, reverseList.items()))
Marc Duez's avatar
Marc Duez committed
592
        
Marc Duez's avatar
Marc Duez committed
593
        for i in filteredList:
594
            r.append(self.d["clones"][filteredList[i]])
Marc Duez's avatar
Marc Duez committed
595
        
596
        self.d["clones"] = r
Marc Duez's avatar
Marc Duez committed
597
        
598
599
600
601
    ### 
    def __add__(self, other): 
        '''Combine two ListWindows into a unique ListWindows'''
        obj = ListWindows()
602
603
        l1 = len(self.d["reads"].d['segmented'])
        l2 = len(other.d["reads"].d['segmented'])
604

605
606
607
        concatenate_with_padding(obj.d, 
                                 self.d, l1,
                                 other.d, l2,
608
609
                                 ["clones", "links", "germlines",
                                  "vidjil_json_version"])
610
        
611
        obj.d["clones"]=self.fuseWindows(self.d["clones"], other.d["clones"], l1, l2)
612
613
        obj.d["samples"] = self.d["samples"] + other.d["samples"]
        obj.d["reads"] = self.d["reads"] + other.d["reads"]
614
        obj.d["diversity"] = self.d["diversity"] + other.d["diversity"]
615
        
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
        try:
            ### Verify that same file is not present twice
            filename_jlist1 = list(self.d["distributions"]["repertoires"].keys())
            filename_jlist2 = list(other.d["distributions"]["repertoires"].keys())
            for filename in filename_jlist1:
                if filename in filename_jlist2:
                    raise( "Error, duplicate file name (in distributions) ")

            ### Append distributions of each files
            obj.d["distributions"] = {}
            obj.d["distributions"]["repertoires"] = {}
            obj.d["distributions"]["keys"]        = ["clones", "reads"]
            obj.d["distributions"]["filters"]     = {}
            obj.d["distributions"]["categories"]  = {}
            for filename in filename_jlist1:
                obj.d["distributions"]["repertoires"][filename] = self.d["distributions"]["repertoires"][filename]
            for filename in filename_jlist2:
                obj.d["distributions"]["repertoires"][filename] = other.d["distributions"]["repertoires"][filename]
        except:
            pass

637
638
        return obj
        
639
640
641
642
643
644
    ###
    def __mul__(self, other):
        
        for i in range(len(self.d["reads_segmented"])):
            self.d["reads_segmented"][i] += other.d["reads_segmented"][i] 
        
645
        self.d["clones"] += other.d["clones"]
646
647
648
649
650
651
        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        
        self.d["system_segmented"].update(other.d["system_segmented"])
        
        return self
        
652
    ### 
653
    def fuseWindows(self, w1, w2, l1, l2) :
654
        #store data in dict with "id" as key
655
656
        dico1 = {}
        for i in range(len(w1)) :
657
            dico1[ w1[i].d["id"] ] = w1[i]
658
659
660

        dico2 = {}
        for i in range(len(w2)) :
661
            dico2[ w2[i].d["id"] ] = w2[i]
662

663
        #concat data with the same key ("id")
664
665
666
667
668
669
        #if some data are missing we concat with an empty Windows()
        dico3 = {}
        for key in dico1 :
            if key in dico2 :
                dico3[key] = dico1[key] + dico2[key]
            else :
670
                w=Window(l2)
671
672
673
674
                dico3[key] = dico1[key] + w

        for key in dico2 :
            if key not in dico1 :
675
                w=Window(l1)
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
                dico3[key] = w + dico2[key]
        
        
        #sort by top
        tab = []
        for key in dico3 :
            tab.append((dico3[key], dico3[key].d["top"]))
        tab = sorted(tab, key=itemgetter(1))
        
        #store back in a List
        result=[]
        for i in range(len(tab)) :
            result.append(tab[i][0])
        
        return result
        
    
693
694
    def cut(self, limit, nb_points):
        '''Remove information from sequence/windows who never enter in the most represented sequences. Put this information in 'other' windows.'''
695
696

        w=[]
697
698
699

        others = OtherWindows(nb_points)

700
701
702
        for clone in self:
            if (int(clone.d["top"]) < limit or limit == 0) :
                w.append(clone)
703
            #else:
704
                #others += clone
705

706
        self.d["clones"] = w #+ list(others) 
707

Mathieu Giraud's avatar
Mathieu Giraud committed
708
        print("### Cut merged file, keeping window in the top %d for at least one point" % limit)
709
710
        return self
        
711
    def load_clntab(self, file_path, *args, **kwargs):
712
713
714
        '''Parser for .clntab file'''

        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
715
        self.d["samples"].d["original_names"] = [file_path]
716
        self.d["samples"].d["producer"] = ["EC-NGS central pipeline"]
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
        
        listw = []
        listc = []
        total_size = 0
        
        fichier = open(file_path,"r")
        for ligne in fichier:
            if "clonotype" in ligne:
                header_map = ligne.replace('\n', '').split('\t')
            else :
                tab = AccessedDict()
                for index, data in enumerate(ligne.split('\t')):
                    tab[header_map[index]] = data
                        
                w=Window(1)
732
733
                w.d["seg"] = {}
                
734
                #w.window=tab["sequence.seq id"] #use sequence id as window for .clntab data
735
                w.d["id"]=tab["sequence.raw nt seq"] #use sequence as window for .clntab data
736
737
                s = int(tab["sequence.size"])
                total_size += s
738
                w.d["reads"] = [ s ]
739

740
741
                w.d["germline"] = tab["sequence.V-GENE and allele"][:3] #system ...
                
742
                w.d["sequence"] = tab["sequence.raw nt seq"]
743
                w.d["seg"]["5"]=tab["sequence.V-GENE and allele"].split('=')[0]
744
                if (tab["sequence.D-GENE and allele"] != "") :
745
746
                    w.d["seg"]["4"]=tab["sequence.D-GENE and allele"].split('=')[0]
                w.d["seg"]["3"]=tab["sequence.J-GENE and allele"].split('=')[0]
747
748
749
750

                # use sequence.JUNCTION to colorize output (this is not exactly V/J !)
                junction = tab.get("sequence.JUNCTION.raw nt seq")
                position = w.d["sequence"].find(junction)
751
                
752
                if position >= 0:
753
754
                    w.d["seg"]["3start"] = position + len(junction)
                    w.d["seg"]["5end"] = position 
755
                else:
756
757
758
759
760
761
                    w.d["seg"]["3start"] = 0
                    w.d["seg"]["5end"] = len(w.d["sequence"])
                w.d["seg"]["3end"]=0
                w.d["seg"]["5start"]=0
                w.d["seg"]["4end"]=0
                w.d["seg"]["4start"]=0
762
                w.d["seg"]["cdr3"] = tab["sequence.JUNCTION.raw nt seq"][3:-3]
763
                    
764
                listw.append((w , w.d["reads"][0]))
765
766

                raw_clonotype = tab.get("clonotype")
767
768
                w.d["name"]=raw_clonotype # w.d["seg"]["5"] + " -x/y/-z " + w.d["seg"]["3"]

769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
                clonotype = raw_clonotype.split(' ')
                if (len(clonotype) > 1) :
                    listc.append((w, raw_clonotype))
                
                #keep data that has not already been stored
                for header in tab.not_accessed_keys():
                    w.d["_"+header] = [tab[header]]

        #sort by sequence_size
        listw = sorted(listw, key=itemgetter(1), reverse=True)
        #sort by clonotype
        listc = sorted(listc, key=itemgetter(1))
        
        #generate data "top"
        for index in range(len(listw)):
            listw[index][0].d["top"]=index+1
785
            self.d["clones"].append(listw[index][0])
786
        
787
788
        self.d["reads"].d["segmented"] = [total_size]
        self.d["reads"].d["total"] = [total_size]
789
790
791
792

        
    def toJson(self, obj):
        '''Serializer for json module'''
793
        if isinstance(obj, ListWindows) or isinstance(obj, Window) or isinstance(obj, Samples) or isinstance(obj, Reads) or isinstance(obj, Diversity):
794
795
796
            result = {}
            for key in obj.d :
                result[key]= obj.d[key]
Marc Duez's avatar
Marc Duez committed
797
                
798
799
            return result
            raise TypeError(repr(obj) + " fail !") 
800
        
Marc Duez's avatar
Marc Duez committed
801
        else:
802
803
804
805
806
807
            result = {}
            for key in obj :
                result[key]= obj[key]
            
            return result
            raise TypeError(repr(obj) + " fail !") 
808
809
810

    def toPython(self, obj_dict):
        '''Reverse serializer for json module'''
811
        if "samples" in obj_dict:
812
            obj = ListWindows()
813
            obj.d=obj_dict
814
815
            return obj

816
        if "id" in obj_dict:
817
            obj = Window(1)
818
            obj.d=obj_dict
819
            return obj
Marc Duez's avatar
Marc Duez committed
820
        
821
822
        if "total" in obj_dict:
            obj = Reads()
823
            obj.d=obj_dict
824
825
            return obj
            
Marc Duez's avatar
Marc Duez committed
826
827
        if "original_names" in obj_dict:
            obj = Samples()
828
            obj.d=obj_dict
829
            return obj
Marc Duez's avatar
Marc Duez committed
830
            
831
        return obj_dict
832
        
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
    def get_filename_pos(self, filename):
        """ filename is the key of distributions repertoires """
        return self.d["samples"].d["original_names"].index(filename)


    # ========================= #
    #  Distributions computing  #
    # ========================= #
    def init_distrib(self, list_distrib):
        """ Create distributions structures """
        self.d["distributions"] = {}
        self.d["distributions"]["repertoires"] = defaultdict(lambda:[])
        self.d["distributions"]["keys"]        = ["clones", "reads"]
        self.d["distributions"]["filters"]     = []
        # Warning, will fail of there are same original_names; fixme

        nb_sample = self.d["samples"].d["number"]
        # prefill distribution for each file of jlist
        for filename in self.d["samples"].d["original_names"]:
            for distrib in list_distrib:
                self.d["distributions"]["repertoires"][filename].append({"axes":distrib,"values":defaultdict(lambda: False)})
        return


    def compute_distribution(self, list_distrib):
        """ list_distrib is a list of distrib to compute """ 
        for filename in self.d["samples"].d["original_names"]:
            timepoint = self.get_filename_pos(filename)
            for clone in self.d["clones"]:
                if clone.d["reads"][timepoint] != 0:
                    for distrib_pos in range( len(self.d["distributions"]["repertoires"][filename]) ):
                        distrib      = list_distrib[distrib_pos]
                        clone_values = clone.get_axes_values( distrib, timepoint )
                        nb_reads     = clone.get_reads(self.get_filename_pos(filename))
                        self.add_clone_values( filename, distrib_pos, clone_values, nb_reads)
        return


    def add_clone_values(self, filename, distrib_pos, values, nb_reads):
        """ Add value of a clone to a specific distribution """
        obj = self.d["distributions"]["repertoires"][filename][distrib_pos]["values"]
        obj = self.recursive_add(obj, values, nb_reads) ## Add by recursion
        self.d["distributions"]["repertoires"][filename][distrib_pos]["values"] = obj
        return

    def recursive_add( self, obj, values, nb_reads):
        """ Add by recusivity the newer value to existing value; return the obj """
        if len(values) > 1:
            # Should increase for one depth
            nvalues = values[1:]
            obj[values[0]] = self.recursive_add(obj[values[0]], nvalues, nb_reads)
        else:
            # last depth
            if not obj:
                obj = defaultdict(lambda: False)
            if not obj[values[0]]:
                obj[values[0]] = [0, 0]

            obj[values[0]][0] += 1
            obj[values[0]][1] += nb_reads
        return obj

    def save_distributions(self, foname):
        # Compute aggreg just before export
        fo = open(foname, "w")
        json.dump(self.d["distributions"], fo, indent=2)
        fo.close()
        return
901
902
903
904
905



### some data used for test
w1 = Window(1)
906
w1.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
907
w2 = Window(1)
908
w2.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
909
w3 = Window(1)
910
w3.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
911
w4 = Window(1)
912
w4.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
913
914
915


w5 = Window(1)
916
w5.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
917
w6 = Window(1)
918
w6.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
919

920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
seg_w7 = {
  "germline": "TRA",
  "id": "TTCTTACTTCTGTGCTACGGACGCCGGGGCTCAGGAACCTACAAATACAT",
  "name": "TRAV17*01 0/CCGGGG/5 TRAJ40*01",
  "reads": [
    16
  ],
  "seg": {
    "3": {
      "delLeft": 5,
      "name": "TRAJ40*01",
      "start": 112
    },
    "5": {
      "delRight": 0,
      "name": "TRAV17*01",
      "stop": 105
    },
    "N": 6,
    "cdr3": {
      "aa": "ATDAG#SGTYKYI",
      "start": 96,
      "stop": 133
    }
  },
  "sequence": "GTGGAAGATTAAGAGTCACGCTTGACACTTCCAAGAAAAGCAGTTCCTTGTTGATCACGGCTTCCCGGGCAGCAGACACTGCTTCTTACTTCTGTGCTACGGACGCCGGGGCTCAGGAACCTACAAATACATCTTTGGAACAG",
  "top": 26
}

949
lw1 = ListWindows()
950
lw1.d["timestamp"] = 'ts'
951
lw1.d["reads"] = json.loads('{"total": [30], "segmented": [25], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
952
953
lw1.d["clones"].append(w5)
lw1.d["clones"].append(w6)
954
955
lw1.d["diversity"] = Diversity()

956
957

w7 = Window(1)
958
w7.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
959
w8 = Window(1)
960
w8.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
961
962

lw2 = ListWindows()
963
lw2.d["timestamp"] = 'ts'
964
lw2.d["reads"] = json.loads('{"total": [40], "segmented": [34], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
965
966
lw2.d["clones"].append(w7)
lw2.d["clones"].append(w8)
967
lw2.d["diversity"] = Diversity()
968
969

    
970
971
972
973
974
def exec_command(command, directory, input_file):
    '''
    Execute the command `command` from the directory
    `directory`. The executable must exist in
    this directory. No path changes are allowed in `command`.
975
    
976
977
978
979
980
981
982
983
984
985
986
987
988
    Returns the output filename (a .vidjil). 
    '''
    assert (not os.path.sep in command), "No {} allowed in the command name".format(os.path.sep)
    ff = tempfile.NamedTemporaryFile(suffix='.vidjil', delete=False)

    basedir = os.path.dirname(os.path.abspath(sys.argv[0]))
    command_fullpath = basedir+os.path.sep+directory+os.path.sep+command
    com = '%s %s %s' % (quote(command_fullpath), quote(os.path.abspath(input_file)), ff.name)
    print(com)
    os.system(com)
    print()
    return ff.name

989
990
991

 
def main():
Mathieu Giraud's avatar
Mathieu Giraud committed
992
    print("#", ' '.join(sys.argv))
993
994
995
996
997
998
999

    DESCRIPTION = 'Vidjil utility to parse and regroup list of clones of different timepoints or origins'
    
    #### Argument parser (argparse)

    parser = argparse.ArgumentParser(description= DESCRIPTION,
                                    epilog='''Example:
Mathieu Giraud's avatar
Mathieu Giraud committed
1000
  python2 %(prog)s --germline IGH out/Diag/vidjil.data out/MRD-1/vidjil.data out/MRD-2/vidjil.data''',
1001
1002
1003
1004
1005
1006
                                    formatter_class=argparse.RawTextHelpFormatter)


    group_options = parser.add_argument_group() # title='Options and parameters')

    group_options.add_argument('--test', action='store_true', help='run self-tests')
1007
    group_options.add_argument('--multi', action='store_true', help='merge different systems from a same timepoint (deprecated, do not use)')
1008
    
1009
    group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
1010
    group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')
1011
    group_options.add_argument('--ijson', action='store_true', help='use the ijson vidjilparser')
1012

Marc Duez's avatar
Marc Duez committed
1013
    group_options.add_argument('--output', '-o', type=str, default='fused.vidjil', help='output file (%(default)s)')
1014
1015
    group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)')

1016
    group_options.add_argument('--first', '-f', type=int, default=0, help='take only into account the first FIRST files (0 for all) (%(default)s)')
1017

Mathieu Giraud's avatar
Mathieu Giraud committed
1018
    group_options.add_argument('--pre', type=str,help='pre-process program (launched on each input .vidjil file) (needs defs.PRE_PROCESS_DIR)')
1019

1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
    parser.add_argument('file', nargs='+', help='''input files (.vidjil/.cnltab)''')
  
    args = parser.parse_args()
    # print args

    if args.test:
        import doctest
        doctest.testmod(verbose = True)
        sys.exit(0)

    jlist_fused = None

1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
    LISTE_D   = []
    LIST_AXES = ["germline", # "top", # "name"
        "seg5", "seg4", "seg3",
        "lenSeqConsensus", "lenSeqAverage", "GCContent", "coverage",
        "lenSeq", # "evalue", l'arrondir ?
        "seg5_delRight", "seg3_delLeft", "seg4_delRight", "seg3_delLeft",
        "insert_53", "insert_54", "insert_43",
        #"seg5_stop", "seg3_start", "seg4_stop", "seg4_start",
        "lenCDR3",   # "cdr3_stop", "cdr3_start", 
        "productive", #"junction_start", "junction_stop",
        "rearrangement", "complete",
    ]
    for axe1 in LIST_AXES:
        LISTE_D.append([axe1])
        for axe2 in LIST_AXES:
            if axe1 != axe2:
                LISTE_D.append([axe1, axe2])
    
Mathieu Giraud's avatar
Mathieu Giraud committed
1050
1051
    print("### fuse.py -- " + DESCRIPTION)
    print()
1052

1053
1054
1055
1056
1057
1058
    files = args.file
    if args.first:
        if len(files) > args.first:
            print("! %d files were given. We take into account the first %d files." % (len(files), args.first))
        files = files[:args.first]

1059
1060
1061
1062
    if args.pre:
        print("Pre-processing files...")
        pre_processed_files = []
        for f in files:
1063
1064
            out_name = exec_command(args.pre, PRE_PROCESS_DIR, f)
            pre_processed_files.append(out_name)
1065
1066
        files = pre_processed_files

Marc Duez's avatar
Marc Duez committed
1067
1068
    #filtre
    f = []
1069

1070
1071
1072
1073
1074
    if args.ijson:
        from vidjilparser import VidjilParser
        vparser = VidjilParser()
        vparser.addPrefix('clones.item', 'clones.item.top', le, args.top)

1075
    for path_name in files:
1076
1077
1078
        if args.ijson:
            json_clones = vparser.extract(path_name)
            clones = json.loads(json_clones)
1079
1080
            if clones["clones"] is not None:
                f += [c['id'] for c in clones["clones"]]
1081
1082
1083
1084
1085
        else:
            jlist = ListWindows()
            jlist.load(path_name, args.pipeline)
            f += jlist.getTop(args.top)

Marc Duez's avatar
Marc Duez committed
1086
1087
    f = sorted(set(f))
    
1088
1089
1090
1091
1092
    if args.ijson:
        vparser.reset()
        vparser.addPrefix('')
        vparser.addPrefix('clones.item', 'clones.item.id', (lambda x, y: x in y), f)

1093
    if args.multi:
1094
        for path_name in files:
1095
            jlist = ListWindows()
1096
1097
1098
1099
1100
1101
            if args.ijson:
                json_reads = vparser.extract(path_name)
                jlist.loads(json_reads, args.pipeline)
            else:
                jlist.load(path_name, args.pipeline)
                jlist.build_stat()
1102
            
Mathieu Giraud's avatar
Mathieu Giraud committed
1103
            print("\t", jlist, end=' ')
1104
1105
1106
1107
1108

            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused * jlist
1109
                
Mathieu Giraud's avatar
Mathieu Giraud committed
1110
            print('\t==> merge to', jlist_fused)
1111
        jlist_fused.d["system_segmented"] = ordered(jlist_fused.d["system_segmented"], key=lambda sys: ((GERMLINES_ORDER + [sys]).index(sys), sys))
1112
        
1113
    else:
Mathieu Giraud's avatar
Mathieu Giraud committed
1114
        print("### Read and merge input files")
1115
        for path_name in files:
1116
            jlist = ListWindows()
1117
1118
1119
1120
1121
1122
1123
            if args.ijson:
                json_reads = vparser.extract(path_name)
                jlist.loads(json_reads, args.pipeline)
            else:
                jlist.load(path_name, args.pipeline)
                jlist.build_stat()
                jlist.filter(f)
Marc Duez's avatar
Marc Duez committed
1124

1125
            
Mathieu Giraud's avatar
Mathieu Giraud committed
1126
            print("\t", jlist, end=' ')
1127
1128
1129
1130
1131
            # Merge lists
            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused + jlist
1132
            
Mathieu Giraud's avatar
Mathieu Giraud committed
1133
            print('\t==> merge to', jlist_fused)
1134

1135
    if args.compress:
Mathieu Giraud's avatar
Mathieu Giraud committed
1136
1137
        print()
        print("### Select point names")
1138
        l = jlist_fused.d["samples"].d["original_names"]
1139
        ll = interesting_substrings(l)
Mathieu Giraud's avatar
Mathieu Giraud committed
1140
1141
        print("  <==", l)
        print("  ==>", ll)
1142
        jlist_fused.d["samples"].d["names"] = ll
1143
    
Mathieu Giraud's avatar
Mathieu Giraud committed
1144
    print()
1145
    if not args.multi:
1146
        jlist_fused.cut(args.top, jlist_fused.d["samples"].d["number"])
Mathieu Giraud's avatar
Mathieu Giraud committed
1147
1148
    print("\t", jlist_fused) 
    print()
1149

1150
    #compute similarity matrix