Mise à jour terminée. Pour connaître les apports de la version 13.8.4 par rapport à notre ancienne version vous pouvez lire les "Release Notes" suivantes :
https://about.gitlab.com/releases/2021/02/11/security-release-gitlab-13-8-4-released/
https://about.gitlab.com/releases/2021/02/05/gitlab-13-8-3-released/

fuse.py 57.5 KB
Newer Older
Ryan Herbert's avatar
Ryan Herbert committed
1
#!/usr/bin/env python3
2 3 4 5
# -*- coding: utf-8 -*-

### fuse.py - Vidjil utility to parse and regroup list of clones of different timepoints or origins

6 7
#  This file is part of Vidjil <http://www.vidjil.org>,
#  High-throughput Analysis of V(D)J Immune Repertoire.
8
#  Copyright (C) 2011-2017 by Bonsai bioinformatics
9 10 11 12 13
#  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
#  Contributors: 
#      Marc Duez <marc.duez@vidjil.org>
#      Mathieu Giraud <mathieu.giraud@vidjil.org>
#      The Vidjil Team <contact@vidjil.org>
14 15 16 17 18 19 20 21 22 23 24 25 26 27
#
#  "Vidjil" is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  "Vidjil" is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>

Mathieu Giraud's avatar
Mathieu Giraud committed
28
from __future__ import print_function
29
from __future__ import division
30

31
import check_python_version
32
import sys
33 34
import json
import argparse
35
import os.path
Mikaël Salson's avatar
Mikaël Salson committed
36
import os
Marc Duez's avatar
Marc Duez committed
37
import datetime
38
import subprocess
39
import tempfile
40
import math
Mathieu Giraud's avatar
Mathieu Giraud committed
41
import gzip
42
from operator import itemgetter, le
43
from utils import *
44
from defs import *
45
from collections import defaultdict
46
from pipes import quote
47

48
FUSE_VERSION = "vidjil fuse"
49

50
TOOL_SIMILARITY = "../algo/tools/similarity"
51
SIMILARITY_LIMIT = 100
52

53 54
GERMLINES_ORDER = ['TRA', 'TRB', 'TRG', 'TRD', 'DD', 'IGH', 'DHJH', 'IJK', 'IJL'] 

55 56 57 58 59 60 61 62 63 64
AVAILABLE_AXES = [
    "top", "germline", "name",
    "seg5", "seg4", "seg4a", "seg4b", "seg3",
    "evalue", "lenSeqAverage", "lenCDR3", "lenJunction",
    "seg5_delRight", "seg3_delLeft", "seg4_delRight", "seg4_delLeft",
    "seg5_stop", "seg3_start", "seg4_stop", "seg4_start", "cdr3_stop", "cdr3_start",
    "junction_stop", "junction_start", "productive",
    "insert_53", "insert_54", "insert_43",
    "evalue", "evalue_left", "evalue_right",
]
65 66
GET_UNKNOW_AXIS = "unknow_axis"
UNKNOWN_VALUE   = ""
67

Mathieu Giraud's avatar
Mathieu Giraud committed
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
def generic_open(path, mode='r', verbose=False):
    '''
    Open a file.
    If 'r', possibly open .gz compressed files.
    '''
    if verbose:
        print(" <==", path, "\t", end=' ')

    if 'r' in mode:
        try:
            f = gzip.open(path, mode)
            f.read(1)
            f.seek(0)
            return f
        except IOError:
            pass

    return open(path, mode)
86 87

class Window:
88
    # Should be renamed "Clone"
89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
    '''storage class for sequence informations
    with some function to group sequence informations
    
    >>> str(w1)
    '<window : [5] 3 aaa>'
    
    >>> str(w3)
    '<window : [8] 4 aaa>'
    
    >>> str(w1 + w3)
    '<window : [5, 8] 3 aaa>'
    
    
    check if other information are conserved
    
    >>> (w2 + w4).d["test"]
    [0, 'plop']
    
Thonier Florian's avatar
Thonier Florian committed
107 108
    >>> w1.get_values("name")
    '?'
109
    
Thonier Florian's avatar
Thonier Florian committed
110 111 112 113 114 115 116
    >>> w1.get_values("top")
    3
  
    >>> w7   = Window(1)
    >>> w7.d = seg_w7
    >>> w7.d["seg"]["5"]["name"]
    'TRAV17*01'
117 118 119 120 121 122 123 124

    >>> w7.get_values("reads")
    16

    >>> w7.get_values("top")
    26
    >>> w7.get_values("germline")
    'TRA'
Thonier Florian's avatar
Thonier Florian committed
125 126 127 128
    >>> w7.get_values("name")
    'TRAV17*01 0/CCGGGG/5 TRAJ40*01'
    >>> w7.get_values("seg5")
    'TRAV17*01'
129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
    >>> w7.get_values("seg4a")
    '?'
    >>> w7.get_values("seg4")
    '?'
    >>> w7.get_values("seg4b")
    '?'
    >>> w7.get_values("seg3")
    'TRAJ40*01'
    >>> w7.get_values("evalue")
    '1.180765e-43'
    >>> w7.get_values("evalue_left")
    '1.296443e-47'
    >>> w7.get_values("evalue_right")
    '1.180635e-43'
    >>> w7.get_values("seg5_delRight")
    0
    >>> w7.get_values("seg3_delLeft")
    5
    >>> w7.get_values("seg4_delRight")
    '?'
    >>> w7.get_values("seg4_delLeft")
    '?'
    >>> w7.get_values("seg5_stop")
    105
    >>> w7.get_values("seg3_start")
    112
    >>> w7.get_values("seg4_stop")
    '?'
    >>> w7.get_values("seg4_start")
    '?'
    >>> w7.get_values("cdr3_start")
    96
    >>> w7.get_values("cdr3_stop")
    133
    >>> w7.get_values("junction_start")
    93
    >>> w7.get_values("junction_stop")
    136
    >>> w7.get_values("productive")
    True
    >>> w7.get_values("unavailable_axis")
    'unknow_axis'
171 172 173 174 175 176
    >>> w7.get_values("lenSeqAverage")
    168.0
    >>> w7.get_values("lenSeqAverage", 0)
    168.0
    >>> w7.get_values("lenSeqAverage", 1)
    169.0
177
    '''
178 179


180 181 182
    ### init Window with the minimum data required
    def __init__(self, size):
        self.d={}
183
        self.d["id"] = ""
Mathieu Giraud's avatar
Mathieu Giraud committed
184
        self.d["top"] = sys.maxsize
185
        self.d["reads"] = []
186
        for i in range(size):
187
            self.d["reads"].append(0)
188 189 190
        
        
    ### 
191 192
    def __iadd__(self, other):
        ### Not used now
193
        """Add other.reads to self.reads in-place, without extending lists"""
194

195 196
        assert(len(self.d['reads']) == len(other.d['reads']))
        self.d['reads'] = [my + her for (my, her) in zip(self.d['reads'], other.d['reads'])]
197 198 199

        return self
        
200
    def __add__(self, other):
201
        """Concat two windows, extending lists such as 'reads'"""
202
        #data we don't need to duplicate
203
        myList = [ "seg", "top", "id", "sequence", "name", "id", "stats", "germline"]
204 205
        obj = Window(1)
        
206 207 208 209 210
        # 'id' and 'top' will be taken from 'topmost' clone
        del obj.d["id"]
        del obj.d["top"]

        # Data of type 'list'
211 212 213 214
        concatenate_with_padding(obj.d,
                                 self.d, len(self.d["reads"]),
                                 other.d, len(other.d["reads"]),
                                 myList)
215
                    
216 217 218 219 220 221 222 223 224
        # All other data, including 'top'
        # When there are conflicting keys, keep data from the 'topmost' clone
        order = [other, self] if other.d["top"] < self.d["top"] else [self, other]

        for source in order:
            for key in source.d:
                if key not in obj.d:
                    obj.d[key] = source.d[key]

225 226 227 228 229 230
        # !! No name field if fuse with an empty clone
        if "name" in self.d and "name" in other.d and self.d["name"] != other.d["name"]:
            if obj.d["name"] == self.d["name"]:
                name = other.d["name"]
            else:
                name = self.d["name"]
flothoni's avatar
flothoni committed
231 232 233

            msg = "Clone have different names between samples: %s" % name
            obj.addWarning(code="W81", msg=msg, level="warn")
234 235
        return obj
        
flothoni's avatar
flothoni committed
236 237 238 239 240 241 242
    def addWarning(self, code, msg, level):
        # init warn field if not already present
        if not "warn" in self.d:
            self.d["warn"] = []
        self.d["warn"].append( {"code":code, "msg":msg, "level": level})
        return

243 244
    def get_nb_reads(self, cid, point=0):
        return self[cid]["reads"][point]
245

246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269
    def get_reads(self, t=-1):
        if t== -1:
            return self.d["reads"]
        else:
            return self.d["reads"][t]

    def is_segmented(self):
        if not "seg" in self.d.keys():
            return False
        return True


    def get_axes_values(self, axes, timepoint):
        """Return the values of given axes for this clone at a gvien timepoint"""
        values = []
        for axis in axes:
            values.append(self.get_values(axis, timepoint))
        return values


    def get_values(self, axis, timepoint=0):
        """Return the value of an axis for this clone at a given timepoint"""

        axes = {
270
            "id":       ["id"],
271 272 273
            "top":      ["top"],
            "germline": ["germline"],
            "name":     ["name"],
274
            "reads":    ["reads"],
275 276 277 278 279
            "seg5":     ["seg",  "5", "name"],
            "seg4":     ["seg",  "4", "name"],
            "seg4a":    ["seg", "4a", "name"],
            "seg4b":    ["seg", "4b", "name"],
            "seg3":     ["seg",  "3", "name"],
280 281 282
            "evalue":   ["seg", "evalue","val"],
            "evalue_left":   ["seg", "evalue_left","val"],
            "evalue_right":  ["seg", "evalue_right","val"],
283 284 285
            "seg5_delRight": ["seg","5","delRight"],
            "seg3_delLeft":  ["seg","3","delLeft"],
            "seg4_delRight": ["seg","4","delRight"],
286
            "seg4_delLeft":  ["seg","4","delLeft"],
287 288 289 290 291 292 293 294 295 296
            ### Positions
            "seg5_stop":  ["seg","5","stop"],
            "seg3_start": ["seg","3","start"],
            "seg4_stop":  ["seg","4","stop"],
            "seg4_start": ["seg","4","start"],
            "cdr3_stop":  ["seg","cdr3","stop"],
            "cdr3_start": ["seg","cdr3","start"],
            "junction_stop":  ["seg","junction","stop"],
            "junction_start": ["seg","junction","start"],
            "productive":     ["seg","junction","productive"],
297 298 299
            "lenSeqAverage" : ["_average_read_length"],
            "warnings": ["warn"],
            "sequence": ["sequence"]
300 301 302 303 304 305 306 307 308 309 310
        }

        ### length
        axes_length = {
            "insert_53": ["seg3_start", "seg5_stop", -1],
            "insert_54": ["seg4_start", "seg5_stop", -1],
            "insert_43": ["seg3_start", "seg4_stop", -1],
            "lenCDR3": ["cdr3_stop", "cdr3_start", 0],
            "lenJunction": ["junction_stop", "junction_start", 0]
        }

311 312 313 314
        axes_rounded = {
            "lenSeqAverage" : 1.0
        }

315 316 317 318 319 320 321 322 323 324 325
        try:
            if axis in axes.keys():
                path  = axes[axis]
                depth = 0
                value = self.d
                if path[0] == "seg" and not self.is_segmented():
                    return "?"

                while depth != len(path):
                    value  = value[path[depth]]
                    depth += 1
326 327 328 329
                if axis in axes_rounded.keys():
                    value = self.round_axis(value, axes_rounded[axis])


330 331 332
                if type(value) is list:
                    # In these cases, should be a list of value at different timepoint
                    return value[timepoint]
333

334 335
                return value

336
            return GET_UNKNOW_AXIS
337 338 339
        except:
            return "?"

340 341 342 343 344 345 346 347 348
    def round_axis(self, value, round_set):
        if isinstance(value, list):
            for i in range(len(value)):
                value[i] = self.round_axis(value[i], round_set)
        else:
            value = round(float(value) / round_set) * round_set
        return value


349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376
    def toCSV(self, cols= [], time=0, jlist = False):
        """ Return a list of values of the clone added from a list of waited columns """
        list_values = []


        airr_to_axes = {
            "locus":               "germline",
            "v_call":              "seg5",
            "d_call":              "seg4",
            "j_call":              "seg3",
            "v_alignment_end":     "seg5_stop",
            "d_alignment_start":   "seg4_start",
            "d_alignment_end":     "seg4_stop",
            "j_alignment_start":   "seg3_start",
            "productive":          "productive",
            "cdr3_start":          "cdr3_start",
            "cdr3_end":            "cdr3_stop",
            "sequence_id":         "id",
            "sequence":            "sequence",
            ## Not implemented in get_values
            # For x_cigar, datananot available in clone
            "junction":  "?",
            "cdr3_aa":   "?",
            "rev_comp":  "?",
            "v_cigar":   "?",
            "d_cigar":   "?",
            "j_cigar":   "?",
            "sequence_alignment": "?",
377 378
            "germline_alignment": "?",
            "duplicate_count":    "reads"
379 380
        }

381
        airr_computed =  ["ratio_segmented", "ratio_locus", "filename", "warnings"]
382 383 384 385

        for col in cols:    

            if col in airr_computed:
386
                if col == "ratio_locus":
387
                    germline = self.get_values("germline")
388
                    value = float(self.get_values("reads", timepoint=time) ) / jlist.d["reads"].d["germline"][germline][time]
389 390
                elif col == "ratio_segmented":
                    value = float(self.get_values("reads", timepoint=time) ) / jlist.d["reads"].d["segmented"][time]
391 392 393
                elif col == "filename":
                    value = jlist.d["samples"].d["original_names"][time]
                elif col == "warnings":
394 395
                    if "warn" not in self.d.keys():
                        value = ""
396 397 398 399 400 401
                    else:
                        warns = self.d["warn"]
                        values = []
                        for warn in warns:
                            values.append( warn["code"] )
                        value = ",".join(values)
402 403 404

                else:
                    value = GET_UNKNOW_AXIS
405 406

            elif col in airr_to_axes.keys():
407
                value = self.get_values(airr_to_axes[col], timepoint=time)
408
            else:
409
                value = GET_UNKNOW_AXIS
410

411
            if value == "?" or value == GET_UNKNOW_AXIS:
412
                # for some axis, there is no computable data; for the moment don't show anything
413
                value = UNKNOWN_VALUE
414 415 416 417
            list_values.append( str(value) )

        return list_values

418

419
    def latex(self, point=0, base_germline=10000, base=10000, tag=''):
420
        reads = self.d["reads"][point]
421
        ratio_germline = float(reads)/base_germline
422
        ratio = float(reads)/base
423
        return r"   &   & %7d & %5.2f\%% & of %-4s & %5.2f\%% & %-50s \\ %% %4s %s" % (reads, ratio_germline * 100, self.d['germline'], ratio * 100,
424 425
                                                                  self.d["name"] if 'name' in self.d else self.d["id"],
                                                                  tag, self.d["id"])
426

427 428
    ### print essential info about Window
    def __str__(self):
Mathieu Giraud's avatar
Mathieu Giraud committed
429
        return "<window : %s %s %s>" % ( self.d["reads"], '*' if self.d["top"] == sys.maxsize else self.d["top"], self.d["id"])
430
        
Marc Duez's avatar
Marc Duez committed
431
class Samples: 
432

Marc Duez's avatar
Marc Duez committed
433 434
    def __init__(self):
        self.d={}
435
        self.d["number"] = 1
Marc Duez's avatar
Marc Duez committed
436 437 438 439
        self.d["original_names"] = [""]

    def __add__(self, other):
        obj=Samples()
440

441 442 443 444 445
        concatenate_with_padding(obj.d, 
                                 self.d, self.d['number'], 
                                 other.d, other.d['number'],
                                 ['number'])

446
        obj.d["number"] =  int(self.d["number"]) + int(other.d["number"])
Marc Duez's avatar
Marc Duez committed
447 448
        
        return obj
449

450 451 452
    def __str__(self):
        return "<Samples: %s>" % self.d

453 454
class Diversity: 

455 456
    keys = ["index_H_entropy", "index_E_equitability", "index_Ds_diversity"]

457 458 459
    def __init__(self, data=None):
        self.d={}

460 461 462 463 464
        for k in self.keys:
            if data == None or not (k in data):
                self.d[k] = ["na"]
            else:
                self.d[k]= [data[k]]
465

466 467 468
    def __add__(self, other):
        for k in self.keys:
            self.d[k].append(other.d[k][0])
469 470 471 472 473
        return self

    def __str__(self):
        return "<Diversity: %s>" % self.d
        
474 475 476 477
class Reads: 

    def __init__(self):
        self.d={}
478 479
        self.d["total"] = [0]
        self.d["segmented"] = [0]
480
        self.d["germline"] = {}
481
        self.d['distribution'] = {}
482 483 484

    def __add__(self, other):
        obj=Reads()
485 486 487 488 489

        concatenate_with_padding(obj.d['germline'], 
                                 self.d['germline'], len(self.d['total']),
                                 other.d['germline'], len(other.d['total']),
                                 ['total'])
490 491 492 493
        concatenate_with_padding(obj.d['distribution'],
                                 self.d['distribution'], len(self.d['total']),
                                 other.d['distribution'], len(other.d['total']),
                                 ['total'])
494 495 496 497
                
        obj.d["total"] = self.d["total"] + other.d["total"]
        obj.d["segmented"] = self.d["segmented"] + other.d["segmented"]
        return obj
498

499 500 501 502 503 504 505 506 507 508 509 510
    def addAIRRClone(self, clone):
        """
        Allow to add a clone create by AIRR import. 
        Add reads values to germline, segmented and total section
        """
        if clone.d["germline"] not in self.d["germline"].keys():
            self.d["germline"][clone.d["germline"]] = [0]
        self.d["germline"][clone.d["germline"]][0]  += clone.d["reads"][0]
        self.d["total"][0]     += clone.d["reads"][0]
        self.d["segmented"][0] += clone.d["reads"][0]
        return

511 512
    def __str__(self):
        return "<Reads: %s>" % self.d
513
        
514 515 516 517
class OtherWindows:
    
    """Aggregate counts of windows that are discarded (due to too small 'top') for each point into several 'others-' windows."""

518 519
    def __init__(self, length, ranges = None):
        self.ranges = ranges if ranges is not None else [1000, 100, 10, 1]
520 521 522 523 524 525 526 527 528 529
        self.length = length
        self.sizes = {}

        for r in self.ranges:
            self.sizes[r] = [0 for i in range(self.length)]


    def __iadd__(self, window):
        """Add to self a window. The different points may land into different 'others-' windows."""

530
        for i, s in enumerate(window.d["reads"]):
531 532 533 534 535
            for r in self.ranges:
                if s >= r:
                     break
            self.sizes[r][i] += s
            ## TODO: add seg_stat
536 537
            
        # print window, '-->', self.sizes
538 539 540 541 542 543 544
        return self

    def __iter__(self):
        """Yield the 'others-' windows"""

        for r in self.ranges:
            w = Window(self.length)
545
            w.d['reads'] = self.sizes[r]
546 547 548
            w.d['window'] = 'others-%d' % r
            w.d['top'] = 0

Mathieu Giraud's avatar
Mathieu Giraud committed
549
            print('  --[others]-->', w)
550
            yield w
551
        
552
class ListWindows(VidjilJson):
553 554 555
    '''storage class for sequences informations 
    
    >>> lw1.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
556
    <ListWindows: [25] 2>
557 558 559 560
    <window : [5] 3 aaa>
    <window : [12] 2 bbb>
    
    >>> lw2.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
561
    <ListWindows: [34] 2>
562 563 564
    <window : [8] 4 aaa>
    <window : [2] 8 ccc>
    
Mathieu Giraud's avatar
Mathieu Giraud committed
565
    >>> lw3 = lw1 + lw2
566
    >>> lw3.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
567
    <ListWindows: [25, 34] 3>
568 569 570
    <window : [12, 0] 2 bbb>
    <window : [5, 8] 3 aaa>
    <window : [0, 2] 8 ccc>
571 572 573 574 575 576 577 578 579 580
    >>> lw3.build_stat()
    >>> lw3.d['reads'].d['distribution']['0.1']
    [17, 8]
    >>> lw3.d['reads'].d['distribution']['0.01']
    [0, 2]
    >>> lw3.d['reads'].d['distribution']['0.001']
    [0, 0]
    >>> lw3.d['reads'].d['distribution']['0.0001']
    [0, 0]

581 582 583 584 585
    '''
    
    def __init__(self):
        '''init ListWindows with the minimum data required'''
        self.d={}
586 587
        self.d["samples"] = Samples()
        self.d["reads"] = Reads()
588 589
        self.d["clones"] = []
        self.d["clusters"] = []
590
        self.d["germlines"] = {}
591
        
592 593 594 595
        self.d["vidjil_json_version"] = VIDJIL_JSON_VERSION
        self.d["producer"] = FUSE_VERSION
        self.d["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
596
    def __str__(self):
597 598 599 600 601 602 603 604 605 606 607 608 609 610
        return "<ListWindows: %s %d>" % ( self.d["reads"].d["segmented"], len(self) )

    # Iterator and access functions
    def __iter__(self):
        return self.d["clones"].__iter__()

    def __getitem__(self, i):
        ### not efficient !
        for clone in self:
            if clone.d["id"] == i:
                return clone

    def __len__(self):
        return len(self.d["clones"])
611 612 613

    ### print info about each Windows stored 
    def info(self):
Mathieu Giraud's avatar
Mathieu Giraud committed
614
        print(self)
615
        for clone in self:
Mathieu Giraud's avatar
Mathieu Giraud committed
616
            print(clone)
617
        
618 619
    ### compute statistics about clones
    def build_stat(self):
620
        ranges = [.1, .01, .001, .0001, .00001, .000001, .0000001]
621
        result = [[0 for col in range(len(self.d['reads'].d["segmented"]))] for row in range(len(ranges))]
622

623 624
        for clone in self:
            for i, s in enumerate(clone.d["reads"]):
625 626 627 628 629 630
                if self.d['reads'].d['segmented'][i] > 0:
                    for r in range(len(ranges)):
                        if s*1. / self.d['reads'].d['segmented'][i] >= ranges[r]:
                            break
                    result[r][i] += s

631
        for r in range(len(ranges)):
632 633
            ratio_in_string = '{0:.10f}'.format(ranges[r]).rstrip('0')
            self.d['reads'].d['distribution'][ratio_in_string] = result[r]
634 635 636 637
            
        #TODO V/D/J distrib and more 
        
        
638 639 640
    ### save / load to .json
    def save_json(self, output):
        '''save ListWindows in .json format''' 
Mathieu Giraud's avatar
Mathieu Giraud committed
641
        print("==>", output)
642 643
        with open(output, "w") as f:
            json.dump(self, f, indent=2, default=self.toJson)
644

645
    def load(self, file_path, *args, **kwargs):
646
        if '.clntab' in file_path:
647
            self.load_clntab(file_path, *args, **kwargs)
648 649 650 651
        elif ".tsv" in file_path or ".AIRR" in file_path or ".airr" in file_path:
            self.load_airr(file_path, *args, **kwargs)
        else:
            self.load_vidjil(file_path, *args, **kwargs)
652 653 654 655 656 657

    def loads(self, string, *args, **kwargs):
        self.loads_vidjil(string, *args, **kwargs)

    def init_data(self, data):
        self.d=data.d
658 659 660
        # Be robust against 'null' values for clones
        if not self.d["clones"]:
            self.d["clones"] = []
661

662 663 664 665
        if "diversity" in self.d.keys():
            self.d["diversity"] = Diversity(self.d["diversity"])
        else:
            self.d["diversity"] = Diversity()
666
        
667 668
        if 'distribution' not in self.d['reads'].d:
            self.d['reads'].d['distribution'] = {}
669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685

        self.id_lengths = defaultdict(int)

        print("%%")
        for clone in self:
            self.id_lengths[len(clone.d['id'])] += 1
        print ("%% lengths .vidjil -> ", self.id_lengths)
        try:
            print("%% run_v ->", self.d["samples"].d["producer"], self.d["samples"].d["run_timestamp"])
        except KeyError:
            pass

    def load_vidjil(self, file_path, pipeline, verbose=True):
        '''init listWindows with data file
        Detects and selects the parser according to the file extension.'''
        extension = file_path.split('.')[-1]

Mathieu Giraud's avatar
Mathieu Giraud committed
686
        with generic_open(file_path, "r", verbose) as f:
687 688 689
            self.init_data(json.load(f, object_hook=self.toPython))
            self.check_version(file_path)

690 691 692
        if pipeline: 
            # renaming, private pipeline
            f = '/'.join(file_path.split('/')[2:-1])
693
            if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
694
                print("[%s]" % f)
695 696 697

        else:
            f = file_path
698
            if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
699
                print()
700

Marc Duez's avatar
Marc Duez committed
701
        time = os.path.getmtime(file_path)
702
        self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")]
703

704 705 706
    def loads_vidjil(self, string, pipeline, verbose=True):
        '''init listWindows with a json string'''
        self.init_data(json.loads(string, object_hook=self.toPython))
707

Marc Duez's avatar
Marc Duez committed
708 709 710
    def getTop(self, top):
        result = []
        
711 712 713
        for clone in self:
            if clone.d["top"] <= top :
                result.append(clone.d["id"])
Marc Duez's avatar
Marc Duez committed
714 715 716 717 718
        return result
        
    def filter(self, f):
        r = []
        
Marc Duez's avatar
Marc Duez committed
719
        reverseList = {}
720 721
        for i,w in enumerate(self.d["clones"]) :
            reverseList[w.d["id"]] = i
Marc Duez's avatar
Marc Duez committed
722 723 724
        
        #filteredList = { k: reverseList[k] for k in f }
        filteredList = dict(filter(lambda t: t[0] in f, reverseList.items()))
Marc Duez's avatar
Marc Duez committed
725
        
Marc Duez's avatar
Marc Duez committed
726
        for i in filteredList:
727
            r.append(self.d["clones"][filteredList[i]])
Marc Duez's avatar
Marc Duez committed
728
        
729
        self.d["clones"] = r
Marc Duez's avatar
Marc Duez committed
730
        
731 732 733 734
    ### 
    def __add__(self, other): 
        '''Combine two ListWindows into a unique ListWindows'''
        obj = ListWindows()
735 736
        l1 = len(self.d["reads"].d['segmented'])
        l2 = len(other.d["reads"].d['segmented'])
737

738 739 740
        concatenate_with_padding(obj.d, 
                                 self.d, l1,
                                 other.d, l2,
741 742
                                 ["clones", "links", "germlines",
                                  "vidjil_json_version"])
743
        
744
        obj.d["clones"]=self.fuseWindows(self.d["clones"], other.d["clones"], l1, l2)
745 746
        obj.d["samples"] = self.d["samples"] + other.d["samples"]
        obj.d["reads"] = self.d["reads"] + other.d["reads"]
747
        obj.d["diversity"] = self.d["diversity"] + other.d["diversity"]
748
        
749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769
        try:
            ### Verify that same file is not present twice
            filename_jlist1 = list(self.d["distributions"]["repertoires"].keys())
            filename_jlist2 = list(other.d["distributions"]["repertoires"].keys())
            for filename in filename_jlist1:
                if filename in filename_jlist2:
                    raise( "Error, duplicate file name (in distributions) ")

            ### Append distributions of each files
            obj.d["distributions"] = {}
            obj.d["distributions"]["repertoires"] = {}
            obj.d["distributions"]["keys"]        = ["clones", "reads"]
            obj.d["distributions"]["filters"]     = {}
            obj.d["distributions"]["categories"]  = {}
            for filename in filename_jlist1:
                obj.d["distributions"]["repertoires"][filename] = self.d["distributions"]["repertoires"][filename]
            for filename in filename_jlist2:
                obj.d["distributions"]["repertoires"][filename] = other.d["distributions"]["repertoires"][filename]
        except:
            pass

770 771
        return obj
        
772 773 774 775 776 777
    ###
    def __mul__(self, other):
        
        for i in range(len(self.d["reads_segmented"])):
            self.d["reads_segmented"][i] += other.d["reads_segmented"][i] 
        
778
        self.d["clones"] += other.d["clones"]
779 780 781 782 783 784
        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        
        self.d["system_segmented"].update(other.d["system_segmented"])
        
        return self
        
785
    ### 
786
    def fuseWindows(self, w1, w2, l1, l2) :
787
        #store data in dict with "id" as key
788 789
        dico1 = {}
        for i in range(len(w1)) :
790
            dico1[ w1[i].d["id"] ] = w1[i]
791 792 793

        dico2 = {}
        for i in range(len(w2)) :
794
            dico2[ w2[i].d["id"] ] = w2[i]
795

796
        #concat data with the same key ("id")
797 798 799 800 801 802
        #if some data are missing we concat with an empty Windows()
        dico3 = {}
        for key in dico1 :
            if key in dico2 :
                dico3[key] = dico1[key] + dico2[key]
            else :
803
                w=Window(l2)
804 805 806 807
                dico3[key] = dico1[key] + w

        for key in dico2 :
            if key not in dico1 :
808
                w=Window(l1)
809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825
                dico3[key] = w + dico2[key]
        
        
        #sort by top
        tab = []
        for key in dico3 :
            tab.append((dico3[key], dico3[key].d["top"]))
        tab = sorted(tab, key=itemgetter(1))
        
        #store back in a List
        result=[]
        for i in range(len(tab)) :
            result.append(tab[i][0])
        
        return result
        
    
826 827
    def cut(self, limit, nb_points):
        '''Remove information from sequence/windows who never enter in the most represented sequences. Put this information in 'other' windows.'''
828 829

        w=[]
830 831 832

        others = OtherWindows(nb_points)

833
        for clone in self:
Thonier Florian's avatar
Thonier Florian committed
834
            if (int(clone.d["top"]) <= limit or limit == 0) :
835
                w.append(clone)
836
            #else:
837
                #others += clone
838

839
        self.d["clones"] = w #+ list(others) 
840

Mathieu Giraud's avatar
Mathieu Giraud committed
841
        print("### Cut merged file, keeping window in the top %d for at least one point" % limit)
842 843
        return self
        
844
    def load_clntab(self, file_path, *args, **kwargs):
845 846 847
        '''Parser for .clntab file'''

        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
848
        self.d["samples"].d["original_names"] = [file_path]
849
        self.d["samples"].d["producer"] = ["EC-NGS central pipeline"]
850 851 852 853 854 855 856 857 858 859 860 861 862 863 864
        
        listw = []
        listc = []
        total_size = 0
        
        fichier = open(file_path,"r")
        for ligne in fichier:
            if "clonotype" in ligne:
                header_map = ligne.replace('\n', '').split('\t')
            else :
                tab = AccessedDict()
                for index, data in enumerate(ligne.split('\t')):
                    tab[header_map[index]] = data
                        
                w=Window(1)
865 866
                w.d["seg"] = {}
                
867
                #w.window=tab["sequence.seq id"] #use sequence id as window for .clntab data
868
                w.d["id"]=tab["sequence.raw nt seq"] #use sequence as window for .clntab data
869 870
                s = int(tab["sequence.size"])
                total_size += s
871
                w.d["reads"] = [ s ]
872

873 874
                w.d["germline"] = tab["sequence.V-GENE and allele"][:3] #system ...
                
875
                w.d["sequence"] = tab["sequence.raw nt seq"]
876
                w.d["seg"]["5"]=tab["sequence.V-GENE and allele"].split('=')[0]
877
                if (tab["sequence.D-GENE and allele"] != "") :
878 879
                    w.d["seg"]["4"]=tab["sequence.D-GENE and allele"].split('=')[0]
                w.d["seg"]["3"]=tab["sequence.J-GENE and allele"].split('=')[0]
880 881 882 883

                # use sequence.JUNCTION to colorize output (this is not exactly V/J !)
                junction = tab.get("sequence.JUNCTION.raw nt seq")
                position = w.d["sequence"].find(junction)
884
                
885
                if position >= 0:
886 887
                    w.d["seg"]["3start"] = position + len(junction)
                    w.d["seg"]["5end"] = position 
888
                else:
889 890 891 892 893 894
                    w.d["seg"]["3start"] = 0
                    w.d["seg"]["5end"] = len(w.d["sequence"])
                w.d["seg"]["3end"]=0
                w.d["seg"]["5start"]=0
                w.d["seg"]["4end"]=0
                w.d["seg"]["4start"]=0
895
                w.d["seg"]["cdr3"] = tab["sequence.JUNCTION.raw nt seq"][3:-3]
896
                    
897
                listw.append((w , w.d["reads"][0]))
898 899

                raw_clonotype = tab.get("clonotype")
900 901
                w.d["name"]=raw_clonotype # w.d["seg"]["5"] + " -x/y/-z " + w.d["seg"]["3"]

902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917
                clonotype = raw_clonotype.split(' ')
                if (len(clonotype) > 1) :
                    listc.append((w, raw_clonotype))
                
                #keep data that has not already been stored
                for header in tab.not_accessed_keys():
                    w.d["_"+header] = [tab[header]]

        #sort by sequence_size
        listw = sorted(listw, key=itemgetter(1), reverse=True)
        #sort by clonotype
        listc = sorted(listc, key=itemgetter(1))
        
        #generate data "top"
        for index in range(len(listw)):
            listw[index][0].d["top"]=index+1
918
            self.d["clones"].append(listw[index][0])
919
        
920 921
        self.d["reads"].d["segmented"] = [total_size]
        self.d["reads"].d["total"] = [total_size]
922
        
Mathieu Giraud's avatar
Mathieu Giraud committed
923
    def load_airr(self, file_path, pipeline, verbose=True):
flothoni's avatar
flothoni committed
924 925 926 927
        '''
        Parser for AIRR files
        format: https://buildmedia.readthedocs.org/media/pdf/airr-standards/stable/airr-standards.pdf
        '''
928 929 930 931 932 933 934 935

        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        self.d["samples"].d["original_names"] = [file_path]
        self.d["samples"].d["producer"]       = ["unknown (AIRR format)"]
        self.d["samples"].d["log"]            = ["Created from an AIRR format. No other information available"]
        self.d["reads"].d["germline"] = defaultdict(lambda: [0])
        self.d["diversity"] = Diversity()

flothoni's avatar
flothoni committed
936 937 938
        ### Store created clone id
        clone_ids = defaultdict(lambda: False)

939 940 941 942 943
        listw = []
        listc = []
        total_size = 0
        i= 0
        import csv
Mathieu Giraud's avatar
Mathieu Giraud committed
944
        with generic_open(file_path, 'r', verbose) as tsvfile:
945
          reader = csv.DictReader(tsvfile, dialect='excel-tab')
946

947
          for row in reader:
948
            row = self.airr_rename_cols(row)
949 950 951 952 953 954 955 956 957 958
            i += 1

            ### bypass igBlast format
            if "Total queries" in row["sequence_id"]:
                print( "here")
                break



            w=Window(1)
959
            w.d["seg"] = { "junction":{}, "cdr3":{} }
960
            
961 962 963 964 965 966 967 968 969
            w.d["id"] = row["sequence_id"]
            # controle that no other clone is presetn with this id
            p = 1
            while clone_ids[w.d["id"]]:
                w.d["id"] = row["sequence_id"] + "_%s" % p
                p += 1
            clone_ids[w.d["id"]] = True
            
            
970 971 972 973
            w.d["sequence"] = row["sequence"]
            if "duplicate_count" not in row.keys() or row["duplicate_count"] == "":
                w.d["reads"] = [1]
            else :
974 975 976
                w.d["reads"] = [ int(float(row["duplicate_count"])) ]


Mathieu Giraud's avatar
Mathieu Giraud committed
977 978
            ## 'Regular' columns, translated straight from AIRR into .vidjil

979 980 981 982 983 984 985 986 987 988 989 990
            axes = {"locus":  ["germline"],
                    "v_call": ["seg", "5", "name"],
                    "d_call": ["seg", "4", "name"],
                    "j_call": ["seg", "3", "name"],
                    "v_alignment_start":   ["seg","5","start"],
                    "v_alignment_end":     ["seg","5","stop"],
                    "d_alignment_start":   ["seg","4","start"],
                    "d_alignment_end":     ["seg","4","stop"],
                    "j_alignment_start":   ["seg","3","start"],
                    "j_alignment_end":     ["seg","3","stop"],
                    "junction_aa":         ["seg","junction","aa"],
                    "productive":          ["seg","junction","productive"],
991 992 993 994
                    "cdr1_start":          ["seg","cdr1", "start"],
                    "cdr1_end":            ["seg","cdr1", "stop"],
                    "cdr3_start":          ["seg","cdr3", "start"],
                    "cdr3_end":            ["seg","cdr3", "stop"],
995
                    "5prime_trimmed_n_nb": ["seg","5","delLeft"],
996 997
                    "3prime_trimmed_n_nb": ["seg","3","delLeft"],
                    "warnings":            ["warn"]}
998 999
            

Mathieu Giraud's avatar
Mathieu Giraud committed
1000
            ## Fill .vidjil values with a recursive call
1001
            for axe in axes.keys():
1002
                if axe in row.keys() and row[axe] != "":
1003 1004 1005
                    path   = axes[axe]
                    depth  = 0
                    value  = w.d
1006
                    to_put = self.airr_clean_content(axe, row[axe])
1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017

                    while depth != len(path):
                        cat = path[depth]
                        if cat not in value.keys():
                            value[cat] = {}
                        depth += 1
                        
                        if depth != len(path):
                            value  = value[cat]

                    value[cat] = to_put
1018 1019 1020


            listw.append((w , w.d["reads"][0]))
1021

Mathieu Giraud's avatar
Mathieu Giraud committed
1022
            ##### Special values in .vidjil, recomputed from AIRR data
1023

1024
            ### NAME (simple, vidjil like)
flothoni's avatar
flothoni committed
1025 1026 1027 1028 1029 1030