fuse.py 40.5 KB
Newer Older
Ryan Herbert's avatar
Ryan Herbert committed
1
#!/usr/bin/env python3
2 3 4 5
# -*- coding: utf-8 -*-

### fuse.py - Vidjil utility to parse and regroup list of clones of different timepoints or origins

6 7
#  This file is part of Vidjil <http://www.vidjil.org>,
#  High-throughput Analysis of V(D)J Immune Repertoire.
8
#  Copyright (C) 2011-2017 by Bonsai bioinformatics
9 10 11 12 13
#  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
#  Contributors: 
#      Marc Duez <marc.duez@vidjil.org>
#      Mathieu Giraud <mathieu.giraud@vidjil.org>
#      The Vidjil Team <contact@vidjil.org>
14 15 16 17 18 19 20 21 22 23 24 25 26 27
#
#  "Vidjil" is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  "Vidjil" is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>

Mathieu Giraud's avatar
Mathieu Giraud committed
28
from __future__ import print_function
29

30
import check_python_version
31
import sys
32 33
import json
import argparse
34
import os.path
Mikaël Salson's avatar
Mikaël Salson committed
35
import os
Marc Duez's avatar
Marc Duez committed
36
import datetime
37
import subprocess
38
import tempfile
39
from operator import itemgetter, le
40
from utils import *
41
from defs import *
42
from collections import defaultdict
43
from pipes import quote
44

45
FUSE_VERSION = "vidjil fuse"
46

47
TOOL_SIMILARITY = "../algo/tools/similarity"
48
SIMILARITY_LIMIT = 100
49

50 51
GERMLINES_ORDER = ['TRA', 'TRB', 'TRG', 'TRD', 'DD', 'IGH', 'DHJH', 'IJK', 'IJL'] 

52 53 54
####

class Window:
    # Should be renamed "Clone"
    '''storage class for sequence informations
    with some function to group sequence informations
    
    >>> str(w1)
    '<window : [5] 3 aaa>'
    
    >>> str(w3)
    '<window : [8] 4 aaa>'
    
    >>> str(w1 + w3)
    '<window : [5, 8] 3 aaa>'
    
    
    check if other information are conserved
    
    >>> (w2 + w4).d["test"]
    [0, 'plop']
    
    >>> w1.get_values("name")
    '?'
    
    >>> w1.get_values("top")
    3
  
    >>> w7   = Window(1)
    >>> w7.d = seg_w7
    >>> w7.d["seg"]["5"]["name"]
    'TRAV17*01'
      
    >>> w7.get_values("name")
    'TRAV17*01 0/CCGGGG/5 TRAJ40*01'
    
    >>> w7.get_values("seg5")
    'TRAV17*01'
    '''
    
    ### init Window with the minimum data required
    def __init__(self, size):
        """Init a clone with an empty id, a maximal 'top' and `size` read counts at 0."""
        self.d = {}
        self.d["id"] = ""
        self.d["top"] = sys.maxsize
        self.d["reads"] = [0 for i in range(size)]
        
        
    ### 
    def __iadd__(self, other):
        ### Not used now
        """Add other.reads to self.reads in-place, without extending lists"""

        assert(len(self.d['reads']) == len(other.d['reads']))
        self.d['reads'] = [my + her for (my, her) in zip(self.d['reads'], other.d['reads'])]

        return self
        
    def __add__(self, other):
        """Concat two windows, extending lists such as 'reads'"""
        # data we don't need to duplicate (note: 'id' was listed twice, deduplicated)
        myList = [ "seg", "top", "id", "sequence", "name", "stats", "germline"]
        obj = Window(1)
        
        # 'id' and 'top' will be taken from 'topmost' clone
        del obj.d["id"]
        del obj.d["top"]

        # Data of type 'list'
        concatenate_with_padding(obj.d,
                                 self.d, len(self.d["reads"]),
                                 other.d, len(other.d["reads"]),
                                 myList)
                    
        # All other data, including 'top'
        # When there are conflicting keys, keep data from the 'topmost' clone
        order = [other, self] if other.d["top"] < self.d["top"] else [self, other]

        for source in order:
            for key in source.d:
                if key not in obj.d:
                    obj.d[key] = source.d[key]

        return obj
        
    def get_nb_reads(self, cid, point=0):
        # NOTE(review): Window defines no __getitem__, so 'self[cid]' raises a
        # TypeError if this is ever called -- looks dead/broken, kept as-is. TODO confirm
        return self[cid]["reads"][point]

    def get_reads(self, t=-1):
        """Return the list of read counts, or the count at point `t` when t != -1."""
        if t == -1:
            return self.d["reads"]
        else:
            return self.d["reads"][t]

    def is_segmented(self):
        """A clone is considered segmented iff it carries a 'seg' field."""
        return "seg" in self.d

    def get_axes_values(self, axes, timepoint):
        """Return the values of given axes for this clone at a given timepoint"""
        return [self.get_values(axis, timepoint) for axis in axes]

    def get_values(self, axis, timepoint=0):
        """Return the value of an axis for this clone at a given timepoint.

        Returns '?' whenever the value cannot be extracted or computed.
        """

        # direct paths into self.d
        axes = {
            "top":      ["top"],
            "germline": ["germline"],
            "name":     ["name"],
            "seg5":     ["seg",  "5", "name"],
            "seg4":     ["seg",  "4", "name"],
            "seg4a":    ["seg", "4a", "name"],
            "seg4b":    ["seg", "4b", "name"],
            "seg3":     ["seg",  "3", "name"],
            "evalue":   ["evalue","val"],
            "seg5_delRight": ["seg","5","delRight"],
            "seg3_delLeft":  ["seg","3","delLeft"],
            "seg4_delRight": ["seg","4","delRight"],
            # BUGFIX: this entry was a duplicated "seg3_delLeft" key, which
            # shadowed the correct seg-3 path above and lost this axis
            "seg4_delLeft":  ["seg","4","delLeft"],
            ### Positions
            "seg5_stop":  ["seg","5","stop"],
            "seg3_start": ["seg","3","start"],
            "seg4_stop":  ["seg","4","stop"],
            "seg4_start": ["seg","4","start"],
            "cdr3_stop":  ["seg","cdr3","stop"],
            "cdr3_start": ["seg","cdr3","start"],
            "junction_stop":  ["seg","junction","stop"],
            "junction_start": ["seg","junction","start"],
            "productive":     ["seg","junction","productive"],
            "lenSeqAverage" : ["_average_read_length"]
        }

        ### length axes: [stop axis, start axis, offset]
        axes_length = {
            "insert_53": ["seg3_start", "seg5_stop", -1],
            "insert_54": ["seg4_start", "seg5_stop", -1],
            "insert_43": ["seg3_start", "seg4_stop", -1],
            "lenCDR3": ["cdr3_stop", "cdr3_start", 0],
            "lenJunction": ["junction_stop", "junction_start", 0]
        }

        try:
            if axis in axes:
                path = axes[axis]
                if path[0] == "seg" and not self.is_segmented():
                    return "?"

                value = self.d
                for step in path:
                    value = value[step]
                if type(value) is list:
                    # In these cases, should be a list of value at different timepoint
                    return value[timepoint]
                return value

            ############################################################
            ### These axes need to make some computation on clone fields
            if axis == "lenSeq":
                return len(self.d["sequence"])

            if axis in axes_length:
                path = axes_length[axis]
                # BUGFIX: was 'path[0] == "seg"', which never matched
                # (path[0] is an axis name such as "seg3_start")
                if not self.is_segmented():
                    return "not_segmented"
                return self.get_values(path[0]) - self.get_values(path[1]) + path[2]


            if axis == "GCContent":
                return self.getGCcontent()

            # FIXME: need to get the rigth position value
            # If the clone have multiple sample, need to select the current.
            # Should not apply if executed from the vidjil server, without previous fused files
            if axis == "lenSeqConsensus":
                return self.get_values("lenSeqAverage") / self.get_values("lenSeq")
            if axis == "coverage":
                return round(self.d["_coverage"][0]*100, 2)

            if axis == "complete":
                germline = self.get_values("germline")
                if "+" in germline:
                    return False
                elif germline in ["unexpected", "IKZF1", "ERG"]:
                    # maybe more specific later ?
                    return germline
                elif germline in ["IGH","IGK","IGL","TRA","TRB","TRD","TRG"]:
                    # List can be set if new complete germline is defined
                    return True
                else:
                    return "unexpected"
            if axis == "rearrangement":
                return self.get_rearrangement()

            return "unknow_axis"
        except Exception:
            # any missing or inconsistent field ends up as an unknown value
            return "?"

    def getGCcontent(self):
        """Compute and return the GC ratio of the sequence ('?' when empty)."""
        if len(self.d["sequence"]) == 0:
            return "?"

        gc = sum(1 for nt in self.d["sequence"] if nt in "GgCc")
        # return rounded value for better association of clones
        return round(gc / len(self.d["sequence"]), 2)


    def get_rearrangement(self):
        """Get the rearrangement type of this clone (VJ/VDJ/... or incomplete types)"""
        if not self.is_segmented():
            return "?"

        complete = self.get_values("complete")
        seg_vals = {
            "5":  self.get_values("seg5"),
            "4a": self.get_values("seg4a"),
            "4":  self.get_values("seg4"),
            "4b": self.get_values("seg4b"),
            "3":  self.get_values("seg3")
        }

        if complete == True:
            # the number of assigned D segments drives the name
            if seg_vals["4a"] == "?" and seg_vals["4"] == "?" and seg_vals["4b"] == "?":
                return "VJ"
            elif seg_vals["4a"] == "?" and seg_vals["4b"] == "?":
                return "VDJ"
            elif seg_vals["4a"] == "?" or seg_vals["4b"] == "?":
                return "VDDJ"
            else:
                return "VDDDJ"

        elif complete == False:
            # cas compliqué, DJ etant un 53, DD aussi et VD aussi
            if "V" in seg_vals["5"] and "D" in seg_vals["3"]:
                return "VD"
            elif "D" in seg_vals["5"] and "J" in seg_vals["3"]:
                return "DJ"
            elif "D" in seg_vals["5"] and "D" in seg_vals["3"]:
                return "DD"

        elif complete in ["unexpected", "IKZF1", "ERG"]:
            return complete
        else:
            return "?"


    def latex(self, point=0, base_germline=10000, base=10000, tag=''):
        """Return a LaTeX table row for this clone at `point`,
        with ratios computed against `base_germline` and `base`."""
        reads = self.d["reads"][point]
        ratio_germline = float(reads)/base_germline
        ratio = float(reads)/base
        return r"   &   & %7d & %5.2f\%% & of %-4s & %5.2f\%% & %-50s \\ %% %4s %s" % (reads, ratio_germline * 100, self.d['germline'], ratio * 100,
                                                                  self.d["name"] if 'name' in self.d else self.d["id"],
                                                                  tag, self.d["id"])

    ### print essential info about Window
    def __str__(self):
        return "<window : %s %s %s>" % ( self.d["reads"], '*' if self.d["top"] == sys.maxsize else self.d["top"], self.d["id"])
324
        
Marc Duez's avatar
Marc Duez committed
325
class Samples: 

    def __init__(self):
        """One sample, with a placeholder original name."""
        self.d = {}
        self.d["number"] = 1
        self.d["original_names"] = [""]

    def __add__(self, other):
        """Merge two sample sets, padding the per-sample lists."""
        merged = Samples()

        concatenate_with_padding(merged.d,
                                 self.d, self.d['number'],
                                 other.d, other.d['number'],
                                 ['number'])

        merged.d["number"] = int(self.d["number"]) + int(other.d["number"])

        return merged

    def __str__(self):
        return "<Samples: %s>" % self.d

347 348
class Diversity: 

    # diversity indices carried from one fused file to the next
    keys = ["index_H_entropy", "index_E_equitability", "index_Ds_diversity"]

    def __init__(self, data=None):
        """Store one value per index, 'na' when absent from `data`."""
        self.d = {}

        for key in self.keys:
            has_value = data is not None and key in data
            self.d[key] = [data[key]] if has_value else ["na"]

    def __add__(self, other):
        # NOTE: merges in place and returns self (one point appended per fuse)
        for key in self.keys:
            self.d[key].append(other.d[key][0])
        return self

    def __str__(self):
        return "<Diversity: %s>" % self.d
        
368 369 370 371 372 373 374
class Reads: 

    def __init__(self):
        """Read counts for one point: totals, segmented, and per-germline details."""
        self.d = {
            "total": ["0"],
            "segmented": ["0"],
            "germline": {},
            "distribution": {},
        }

    def __add__(self, other):
        """Merge two read-count sets, padding the per-germline/distribution lists."""
        merged = Reads()
        n_self = len(self.d['total'])
        n_other = len(other.d['total'])

        concatenate_with_padding(merged.d['germline'],
                                 self.d['germline'], n_self,
                                 other.d['germline'], n_other,
                                 ['total'])
        concatenate_with_padding(merged.d['distribution'],
                                 self.d['distribution'], n_self,
                                 other.d['distribution'], n_other,
                                 ['total'])

        merged.d["total"] = self.d["total"] + other.d["total"]
        merged.d["segmented"] = self.d["segmented"] + other.d["segmented"]
        return merged

    def __str__(self):
        return "<Reads: %s>" % self.d
395
        
396 397 398 399
class OtherWindows:
    
    """Aggregate counts of windows that are discarded (due to too small 'top') for each point into several 'others-' windows."""

    def __init__(self, length, ranges = None):
        """`length` points; `ranges` are the bucket thresholds, largest first."""
        self.ranges = [1000, 100, 10, 1] if ranges is None else ranges
        self.length = length
        self.sizes = {threshold: [0] * self.length for threshold in self.ranges}


    def __iadd__(self, window):
        """Add to self a window. The different points may land into different 'others-' windows."""

        for point, count in enumerate(window.d["reads"]):
            # first (largest) threshold reached by `count`;
            # falls back to the smallest threshold otherwise
            for threshold in self.ranges:
                if count >= threshold:
                     break
            self.sizes[threshold][point] += count
            ## TODO: add seg_stat
            
        # print window, '-->', self.sizes
        return self

    def __iter__(self):
        """Yield the 'others-' windows"""

        for threshold in self.ranges:
            w = Window(self.length)
            w.d['reads'] = self.sizes[threshold]
            w.d['window'] = 'others-%d' % threshold
            w.d['top'] = 0

            print('  --[others]-->', w)
            yield w
433
        
434
class ListWindows(VidjilJson):
435 436 437
    '''storage class for sequences informations 
    
    >>> lw1.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
438
    <ListWindows: [25] 2>
439 440 441 442
    <window : [5] 3 aaa>
    <window : [12] 2 bbb>
    
    >>> lw2.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
443
    <ListWindows: [34] 2>
444 445 446
    <window : [8] 4 aaa>
    <window : [2] 8 ccc>
    
Mathieu Giraud's avatar
Mathieu Giraud committed
447
    >>> lw3 = lw1 + lw2
448
    >>> lw3.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
449
    <ListWindows: [25, 34] 3>
450 451 452
    <window : [12, 0] 2 bbb>
    <window : [5, 8] 3 aaa>
    <window : [0, 2] 8 ccc>
453 454 455 456 457 458 459 460 461 462
    >>> lw3.build_stat()
    >>> lw3.d['reads'].d['distribution']['0.1']
    [17, 8]
    >>> lw3.d['reads'].d['distribution']['0.01']
    [0, 2]
    >>> lw3.d['reads'].d['distribution']['0.001']
    [0, 0]
    >>> lw3.d['reads'].d['distribution']['0.0001']
    [0, 0]

463 464 465 466 467
    '''
    
    def __init__(self):
        '''init ListWindows with the minimum data required'''
        self.d = {
            "samples": Samples(),
            "reads": Reads(),
            "clones": [],
            "clusters": [],
            "germlines": {},
            "vidjil_json_version": VIDJIL_JSON_VERSION,
            "producer": FUSE_VERSION,
            "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        }
478
    def __str__(self):
        """Short summary: segmented read counts and number of clones."""
        segmented = self.d["reads"].d["segmented"]
        return "<ListWindows: %s %d>" % (segmented, len(self))

    # Iterator and access functions
    def __iter__(self):
        """Iterate over the stored clones."""
        return iter(self.d["clones"])

    def __getitem__(self, i):
        ### not efficient !
        # linear scan; yields None when no clone carries this id
        return next((clone for clone in self if clone.d["id"] == i), None)

    def __len__(self):
        '''Number of stored clones.'''
        return len(self.d["clones"])
493 494 495

    ### print info about each Windows stored 
    def info(self):
        """Print a summary line, then one line per stored clone."""
        print(self)
        for clone in self:
            print(clone)
499
        
500 501
    ### compute statistics about clones
    def build_stat(self):
        """Fill reads.distribution: per ratio bucket, the read counts per point."""
        thresholds = [.1, .01, .001, .0001, .00001, .000001, .0000001]
        segmented = self.d['reads'].d["segmented"]
        counts = [[0] * len(segmented) for _ in thresholds]

        for clone in self:
            for point, nb_reads in enumerate(clone.d["reads"]):
                if segmented[point] > 0:
                    # index of the first threshold reached,
                    # or the smallest threshold otherwise
                    for bucket in range(len(thresholds)):
                        if nb_reads * 1. / segmented[point] >= thresholds[bucket]:
                            break
                    counts[bucket][point] += nb_reads

        for bucket in range(len(thresholds)):
            key = '{0:.10f}'.format(thresholds[bucket]).rstrip('0')
            self.d['reads'].d['distribution'][key] = counts[bucket]
            
        #TODO V/D/J distrib and more 
        
        
520 521 522
    ### save / load to .json
    def save_json(self, output):
        '''save ListWindows in .json format''' 
        print("==>", output)
        with open(output, "w") as out:
            json.dump(self, out, indent=2, default=self.toJson)
526

527
    def load(self, file_path, *args, **kwargs):
        """Dispatch to the .clntab or .vidjil parser depending on the file name."""
        if '.clntab' in file_path:
            self.load_clntab(file_path, *args, **kwargs)
        else:
            self.load_vidjil(file_path, *args, **kwargs)
532 533 534 535 536 537

    def loads(self, string, *args, **kwargs):
        '''init listWindows from a json string (delegates to loads_vidjil)'''
        self.loads_vidjil(string, *args, **kwargs)

    def init_data(self, data):
        """Take ownership of freshly parsed data and normalize missing fields."""
        self.d = data.d

        # Be robust against 'null' values for clones
        if not self.d["clones"]:
            self.d["clones"] = []

        # Diversity(None) yields the same 'na' defaults as Diversity()
        self.d["diversity"] = Diversity(self.d.get("diversity"))

        self.d['reads'].d.setdefault('distribution', {})

        # Debug output: distribution of clone id lengths, and producer info
        self.id_lengths = defaultdict(int)

        print("%%")
        for clone in self:
            self.id_lengths[len(clone.d['id'])] += 1
        print ("%% lengths .vidjil -> ", self.id_lengths)
        try:
            samples = self.d["samples"].d
            print("%% run_v ->", samples["producer"], samples["run_timestamp"])
        except KeyError:
            pass

    def load_vidjil(self, file_path, pipeline, verbose=True):
        '''init listWindows with a .vidjil data file

        pipeline -- when True, display a shortened name (private pipeline renaming)
        '''
        # (a stale docstring claimed extension-based dispatch -- that is load()'s
        #  job -- and an unused 'extension' local was removed)

        if verbose:
            print("<==", file_path, "\t", end=' ')

        with open(file_path, "r") as f:
            self.init_data(json.load(f, object_hook=self.toPython))
            self.check_version(file_path)

        if pipeline: 
            # renaming, private pipeline
            displayed_name = '/'.join(file_path.split('/')[2:-1])

            if verbose:
                print("[%s]" % displayed_name)

        else:
            displayed_name = file_path

            if verbose:
                print()

        # remember the modification time of the loaded file as the sample timestamp
        mtime = os.path.getmtime(file_path)
        self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(mtime).strftime("%Y-%m-%d %H:%M:%S")]
586

587 588 589
    def loads_vidjil(self, string, pipeline, verbose=True):
        '''init listWindows with a json string'''
        # pipeline/verbose are accepted for symmetry with load_vidjil but unused here
        self.init_data(json.loads(string, object_hook=self.toPython))
590

Marc Duez's avatar
Marc Duez committed
591 592 593
    def getTop(self, top):
        """Return the ids of the clones whose 'top' is at most `top`."""
        return [clone.d["id"] for clone in self if clone.d["top"] <= top]
        
    def filter(self, f):
        """Keep only the clones whose id belongs to `f`, preserving order."""
        # map each clone id to its index (last occurrence wins, as before)
        index_by_id = {}
        for idx, clone in enumerate(self.d["clones"]):
            index_by_id[clone.d["id"]] = idx

        kept = []
        for cid, idx in index_by_id.items():
            if cid in f:
                kept.append(self.d["clones"][idx])

        self.d["clones"] = kept
Marc Duez's avatar
Marc Duez committed
613
        
614 615 616 617
    ### 
    def __add__(self, other): 
        '''Combine two ListWindows into a unique ListWindows'''
        obj = ListWindows()
        l1 = len(self.d["reads"].d['segmented'])
        l2 = len(other.d["reads"].d['segmented'])

        concatenate_with_padding(obj.d, 
                                 self.d, l1,
                                 other.d, l2,
                                 ["clones", "links", "germlines",
                                  "vidjil_json_version"])
        
        obj.d["clones"] = self.fuseWindows(self.d["clones"], other.d["clones"], l1, l2)
        obj.d["samples"] = self.d["samples"] + other.d["samples"]
        obj.d["reads"] = self.d["reads"] + other.d["reads"]
        obj.d["diversity"] = self.d["diversity"] + other.d["diversity"]

        # Merge 'distributions' when both operands carry some.
        # On any problem (missing key, duplicate file name), the distributions
        # are silently left out of the result, as before.
        try:
            ### Verify that same file is not present twice
            filenames_self  = list(self.d["distributions"]["repertoires"].keys())
            filenames_other = list(other.d["distributions"]["repertoires"].keys())
            for filename in filenames_self:
                if filename in filenames_other:
                    # BUGFIX: was 'raise("...")', which raised a TypeError
                    # (a str is not an exception); still caught below
                    raise ValueError("Error, duplicate file name (in distributions)")

            ### Append distributions of each files
            obj.d["distributions"] = {}
            obj.d["distributions"]["repertoires"] = {}
            obj.d["distributions"]["keys"]        = ["clones", "reads"]
            obj.d["distributions"]["filters"]     = {}
            obj.d["distributions"]["categories"]  = {}
            for filename in filenames_self:
                obj.d["distributions"]["repertoires"][filename] = self.d["distributions"]["repertoires"][filename]
            for filename in filenames_other:
                obj.d["distributions"]["repertoires"][filename] = other.d["distributions"]["repertoires"][filename]
        except (KeyError, ValueError):
            pass

        return obj
        
655 656 657 658 659 660
    ###
    def __mul__(self, other):
        """Merge `other` into self point-wise (legacy multi-system merge); returns self."""
        nb_points = len(self.d["reads_segmented"])
        for point in range(nb_points):
            self.d["reads_segmented"][point] += other.d["reads_segmented"][point]

        self.d["clones"] += other.d["clones"]
        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]

        self.d["system_segmented"].update(other.d["system_segmented"])

        return self
        
668
    ### 
    def fuseWindows(self, w1, w2, l1, l2) :
        """Merge two clone lists: clones sharing an 'id' are added together,
        the others are padded with an empty Window (l1/l2 points).
        The result is sorted by 'top'."""
        # index each list by clone id
        by_id_1 = {}
        for clone in w1:
            by_id_1[clone.d["id"]] = clone
        by_id_2 = {}
        for clone in w2:
            by_id_2[clone.d["id"]] = clone

        # concatenate clones with the same id; pad the others with
        # an empty Window of the right number of points
        fused = {}
        for key in by_id_1:
            if key in by_id_2:
                fused[key] = by_id_1[key] + by_id_2[key]
            else:
                fused[key] = by_id_1[key] + Window(l2)

        for key in by_id_2:
            if key not in by_id_1:
                fused[key] = Window(l1) + by_id_2[key]

        # sort by 'top' (stable, so insertion order breaks ties as before)
        return sorted(fused.values(), key=lambda clone: clone.d["top"])
        
    
709 710
    def cut(self, limit, nb_points):
        '''Remove information from sequence/windows who never enter in the most represented sequences. Put this information in 'other' windows.'''

        others = OtherWindows(nb_points)

        kept = []
        for clone in self:
            if (int(clone.d["top"]) < limit or limit == 0) :
                kept.append(clone)
            #else:
                #others += clone

        self.d["clones"] = kept #+ list(others) 

        print("### Cut merged file, keeping window in the top %d for at least one point" % limit)
        return self
        
727
    def load_clntab(self, file_path, *args, **kwargs):
        '''Parser for .clntab file

        Builds one Window per data line, sorts them by size to assign 'top',
        and fills the read totals.
        '''

        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        self.d["samples"].d["original_names"] = [file_path]
        self.d["samples"].d["producer"] = ["EC-NGS central pipeline"]

        windows_by_size = []
        windows_by_clonotype = []
        total_size = 0

        # BUGFIX: the file handle was never closed; 'with' guarantees it
        with open(file_path, "r") as handle:
            for line in handle:
                if "clonotype" in line:
                    # header line: remember the column names
                    header_map = line.replace('\n', '').split('\t')
                    continue

                tab = AccessedDict()
                for index, data in enumerate(line.split('\t')):
                    tab[header_map[index]] = data

                w = Window(1)
                w.d["seg"] = {}

                #w.window=tab["sequence.seq id"] #use sequence id as window for .clntab data
                w.d["id"] = tab["sequence.raw nt seq"] #use sequence as window for .clntab data
                size = int(tab["sequence.size"])
                total_size += size
                w.d["reads"] = [ size ]

                w.d["germline"] = tab["sequence.V-GENE and allele"][:3] #system ...

                w.d["sequence"] = tab["sequence.raw nt seq"]
                w.d["seg"]["5"] = tab["sequence.V-GENE and allele"].split('=')[0]
                if (tab["sequence.D-GENE and allele"] != "") :
                    w.d["seg"]["4"] = tab["sequence.D-GENE and allele"].split('=')[0]
                w.d["seg"]["3"] = tab["sequence.J-GENE and allele"].split('=')[0]

                # use sequence.JUNCTION to colorize output (this is not exactly V/J !)
                junction = tab.get("sequence.JUNCTION.raw nt seq")
                position = w.d["sequence"].find(junction)

                if position >= 0:
                    w.d["seg"]["3start"] = position + len(junction)
                    w.d["seg"]["5end"] = position
                else:
                    w.d["seg"]["3start"] = 0
                    w.d["seg"]["5end"] = len(w.d["sequence"])
                w.d["seg"]["3end"] = 0
                w.d["seg"]["5start"] = 0
                w.d["seg"]["4end"] = 0
                w.d["seg"]["4start"] = 0
                w.d["seg"]["cdr3"] = tab["sequence.JUNCTION.raw nt seq"][3:-3]

                windows_by_size.append((w, w.d["reads"][0]))

                raw_clonotype = tab.get("clonotype")
                w.d["name"] = raw_clonotype # w.d["seg"]["5"] + " -x/y/-z " + w.d["seg"]["3"]

                clonotype = raw_clonotype.split(' ')
                if (len(clonotype) > 1) :
                    windows_by_clonotype.append((w, raw_clonotype))

                # keep data that has not already been stored
                for header in tab.not_accessed_keys():
                    w.d["_" + header] = [tab[header]]

        # sort by sequence_size
        windows_by_size = sorted(windows_by_size, key=itemgetter(1), reverse=True)
        # sort by clonotype (NOTE(review): computed but unused afterwards, kept as before)
        windows_by_clonotype = sorted(windows_by_clonotype, key=itemgetter(1))

        # generate data "top"
        for rank, (w, _) in enumerate(windows_by_size):
            w.d["top"] = rank + 1
            self.d["clones"].append(w)

        self.d["reads"].d["segmented"] = [total_size]
        self.d["reads"].d["total"] = [total_size]
805 806 807 808

        
    def toJson(self, obj):
        '''Serializer for json module: dump our wrapper classes as plain dicts'''
        # all wrapper classes store their payload in .d
        if isinstance(obj, (ListWindows, Window, Samples, Reads, Diversity)):
            return {key: value for key, value in obj.d.items()}

        # any other mapping-like object: shallow-copy it
        # (the original had an unreachable 'raise TypeError' after each return)
        return {key: obj[key] for key in obj}
824 825 826

    def toPython(self, obj_dict):
        '''Reverse serializer for json module.

        Rebuilds the matching project wrapper from a decoded JSON dict,
        keyed on a field characteristic of each class; dicts that match
        no known shape are returned unchanged.
        '''
        if "samples" in obj_dict:
            wrapper = ListWindows()
        elif "id" in obj_dict:
            wrapper = Window(1)
        elif "total" in obj_dict:
            wrapper = Reads()
        elif "original_names" in obj_dict:
            wrapper = Samples()
        else:
            # Not a known wrapper shape: keep it as a plain dict.
            return obj_dict

        wrapper.d = obj_dict
        return wrapper
    def get_filename_pos(self, filename):
        """Return the index of `filename` among the samples' original names.

        `filename` is the key of the distributions repertoires.
        """
        original_names = self.d["samples"].d["original_names"]
        return original_names.index(filename)


    # ========================= #
    #  Distributions computing  #
    # ========================= #
    def init_distrib(self, list_distrib):
        """Create the distributions structures.

        For each sample file, pre-fills one empty repertoire entry per
        requested distribution (`list_distrib` is a list of axis lists).
        """
        self.d["distributions"] = {}
        self.d["distributions"]["repertoires"] = defaultdict(lambda:[])
        self.d["distributions"]["keys"]        = ["clones", "reads"]
        self.d["distributions"]["filters"]     = []
        # Warning, will fail if there are duplicated original_names; fixme
        # (removed an unused `nb_sample` local that read samples "number")

        # prefill distribution for each file of jlist
        for filename in self.d["samples"].d["original_names"]:
            for distrib in list_distrib:
                self.d["distributions"]["repertoires"][filename].append({"axes":distrib,"values":defaultdict(lambda: False)})
        return


    def compute_distribution(self, list_distrib):
        """Accumulate each clone into the distributions.

        `list_distrib` is the list of distributions (axis lists) to
        compute; only clones with reads at the given timepoint count.
        """
        for filename in self.d["samples"].d["original_names"]:
            timepoint = self.get_filename_pos(filename)
            repertoires = self.d["distributions"]["repertoires"][filename]
            for clone in self.d["clones"]:
                if clone.d["reads"][timepoint] != 0:
                    # reads count does not depend on the distribution: hoist it
                    # (original recomputed get_filename_pos() per distribution)
                    nb_reads = clone.get_reads(timepoint)
                    for distrib_pos in range(len(repertoires)):
                        clone_values = clone.get_axes_values(list_distrib[distrib_pos], timepoint)
                        self.add_clone_values(filename, distrib_pos, clone_values, nb_reads)
        return


    def add_clone_values(self, filename, distrib_pos, values, nb_reads):
        """Add the values of one clone to a specific distribution."""
        entry = self.d["distributions"]["repertoires"][filename][distrib_pos]
        # recursive_add walks the axis values and returns the updated mapping
        entry["values"] = self.recursive_add(entry["values"], values, nb_reads)
        return

    def recursive_add( self, obj, values, nb_reads):
        """Fold one clone's axis values into `obj` by recursion; return `obj`.

        Leaves are `[nb_clones, nb_reads]` counter pairs, one per
        terminal axis value.
        """
        key = values[0]
        if len(values) > 1:
            # inner level: descend one depth, replacing the child node
            obj[key] = self.recursive_add(obj[key], values[1:], nb_reads)
            return obj

        # leaf level: lazily create the container and the counter pair
        if not obj:
            obj = defaultdict(lambda: False)
        if not obj[key]:
            obj[key] = [0, 0]
        counters = obj[key]
        counters[0] += 1
        counters[1] += nb_reads
        return obj

    def save_distributions(self, foname):
        """Dump the computed distributions to `foname` as sorted, indented JSON."""
        # Compute aggreg just before export
        # `with` guarantees the file is closed even if json.dump raises
        with open(foname, "w") as fo:
            json.dump(self.d["distributions"], fo, indent=2, sort_keys=True)
        return



### some data used for test
w1 = Window(1)
w1.d = {"id" : "aaa", "reads" : [5], "top" : 3 }
w2 = Window(1)
w2.d = {"id" : "bbb", "reads" : [12], "top" : 2 }
w3 = Window(1)
w3.d = {"id" : "aaa", "reads" : [8], "top" : 4 }
w4 = Window(1)
w4.d = {"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }


w5 = Window(1)
w5.d = {"id" : "aaa", "reads" : [5], "top" : 3 }
w6 = Window(1)
w6.d = {"id" : "bbb", "reads" : [12], "top" : 2 }

# a fully segmented TRA clone, with V/J designation and CDR3, as the algo emits it
seg_w7 = {
  "germline": "TRA",
  "id": "TTCTTACTTCTGTGCTACGGACGCCGGGGCTCAGGAACCTACAAATACAT",
  "name": "TRAV17*01 0/CCGGGG/5 TRAJ40*01",
  "reads": [
    16
  ],
  "seg": {
    "3": {
      "delLeft": 5,
      "name": "TRAJ40*01",
      "start": 112
    },
    "5": {
      "delRight": 0,
      "name": "TRAV17*01",
      "stop": 105
    },
    "N": 6,
    "cdr3": {
      "aa": "ATDAG#SGTYKYI",
      "start": 96,
      "stop": 133
    }
  },
  "sequence": "GTGGAAGATTAAGAGTCACGCTTGACACTTCCAAGAAAAGCAGTTCCTTGTTGATCACGGCTTCCCGGGCAGCAGACACTGCTTCTTACTTCTGTGCTACGGACGCCGGGGCTCAGGAACCTACAAATACATCTTTGGAACAG",
  "top": 26
}

lw1 = ListWindows()
lw1.d["timestamp"] = 'ts'
lw1.d["reads"] = json.loads('{"total": [30], "segmented": [25], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
lw1.d["clones"].append(w5)
lw1.d["clones"].append(w6)
lw1.d["diversity"] = Diversity()


w7 = Window(1)
w7.d = {"id" : "aaa", "reads" : [8], "top" : 4 }
w8 = Window(1)
w8.d = {"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }

lw2 = ListWindows()
lw2.d["timestamp"] = 'ts'
# was `object_hook=lw1.toPython` (copy-paste); toPython ignores self, so
# using lw2's own bound method is equivalent but consistent
lw2.d["reads"] = json.loads('{"total": [40], "segmented": [34], "germline": {}, "distribution": {}}', object_hook=lw2.toPython)
lw2.d["clones"].append(w7)
lw2.d["clones"].append(w8)
lw2.d["diversity"] = Diversity()

986 987 988 989 990
def exec_command(command, directory, input_file):
    '''
    Execute the command `command` from the directory
    `directory`. The executable must exist in
    this directory. No path changes are allowed in `command`.

    Returns the output filename (a .vidjil).

    Raises AssertionError if `command` contains a path separator.
    '''
    assert (not os.path.sep in command), "No {} allowed in the command name".format(os.path.sep)
    ff = tempfile.NamedTemporaryFile(suffix='.vidjil', delete=False)
    # only the name is needed; close our handle so the tool can (re)write the file
    ff.close()

    basedir = os.path.dirname(os.path.abspath(sys.argv[0]))
    command_fullpath = basedir + os.path.sep + directory + os.path.sep + command
    args = [command_fullpath, os.path.abspath(input_file), ff.name]
    # log the equivalent shell command line
    print(' '.join(quote(a) for a in args))
    # argument list + no shell: same behavior as the previous quoted
    # os.system() call, without shell-injection/quoting pitfalls
    subprocess.call(args)
    print()
    return ff.name

1005 1006 1007

 
def main():
Mathieu Giraud's avatar
Mathieu Giraud committed
1008
    print("#", ' '.join(sys.argv))
1009 1010 1011 1012 1013 1014 1015

    DESCRIPTION = 'Vidjil utility to parse and regroup list of clones of different timepoints or origins'
    
    #### Argument parser (argparse)

    parser = argparse.ArgumentParser(description= DESCRIPTION,
                                    epilog='''Example:
Mathieu Giraud's avatar
Mathieu Giraud committed
1016
  python2 %(prog)s --germline IGH out/Diag/vidjil.data out/MRD-1/vidjil.data out/MRD-2/vidjil.data''',
1017 1018 1019 1020 1021 1022
                                    formatter_class=argparse.RawTextHelpFormatter)


    group_options = parser.add_argument_group() # title='Options and parameters')

    group_options.add_argument('--test', action='store_true', help='run self-tests')
1023
    group_options.add_argument('--multi', action='store_true', help='merge different systems from a same timepoint (deprecated, do not use)')
1024 1025
    group_options.add_argument('--distributions', '-d', action='store_true', help='compute distributions')
    group_options.add_argument('--only_disributions', '-D', action='store_true', help='export only distributions')
1026
    
1027
    group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
1028
    group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')