fuse.py 36.5 KB
Newer Older
Ryan Herbert's avatar
Ryan Herbert committed
1
#!/usr/bin/env python3
2 3 4 5
# -*- coding: utf-8 -*-

### fuse.py - Vidjil utility to parse and regroup list of clones of different timepoints or origins

6 7
#  This file is part of Vidjil <http://www.vidjil.org>,
#  High-throughput Analysis of V(D)J Immune Repertoire.
8
#  Copyright (C) 2011-2017 by Bonsai bioinformatics
9 10 11 12 13
#  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
#  Contributors: 
#      Marc Duez <marc.duez@vidjil.org>
#      Mathieu Giraud <mathieu.giraud@vidjil.org>
#      The Vidjil Team <contact@vidjil.org>
14 15 16 17 18 19 20 21 22 23 24 25 26 27
#
#  "Vidjil" is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  "Vidjil" is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>

Mathieu Giraud's avatar
Mathieu Giraud committed
28
from __future__ import print_function
29

30
import check_python_version
31
import sys
32 33
import json
import argparse
34
import os.path
Mikaël Salson's avatar
Mikaël Salson committed
35
import os
Marc Duez's avatar
Marc Duez committed
36
import datetime
37
import subprocess
38
import tempfile
39
from operator import itemgetter, le
40
from utils import *
41
from defs import *
42
from collections import defaultdict
43
from pipes import quote
44

45
FUSE_VERSION = "vidjil fuse"
46

47
TOOL_SIMILARITY = "../algo/tools/similarity"
48
SIMILARITY_LIMIT = 100
49

50 51
GERMLINES_ORDER = ['TRA', 'TRB', 'TRG', 'TRD', 'DD', 'IGH', 'DHJH', 'IJK', 'IJL'] 

52 53 54
####

class Window:
55
    # Should be renamed "Clone"
56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
    '''storage class for sequence informations
    with some function to group sequence informations
    
    >>> str(w1)
    '<window : [5] 3 aaa>'
    
    >>> str(w3)
    '<window : [8] 4 aaa>'
    
    >>> str(w1 + w3)
    '<window : [5, 8] 3 aaa>'
    
    
    check if other information are conserved
    
    >>> (w2 + w4).d["test"]
    [0, 'plop']
    
Thonier Florian's avatar
Thonier Florian committed
74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89
    >>> w1.get_values("name")
    '?'
    
    >>> w1.get_values("top")
    3
  
    >>> w7   = Window(1)
    >>> w7.d = seg_w7
    >>> w7.d["seg"]["5"]["name"]
    'TRAV17*01'
      
    >>> w7.get_values("name")
    'TRAV17*01 0/CCGGGG/5 TRAJ40*01'
    
    >>> w7.get_values("seg5")
    'TRAV17*01'
90 91 92 93 94
    '''
    
    ### init Window with the minimum data required
    def __init__(self, size):
        self.d={}
95
        self.d["id"] = ""
Mathieu Giraud's avatar
Mathieu Giraud committed
96
        self.d["top"] = sys.maxsize
97
        self.d["reads"] = []
98
        for i in range(size):
99
            self.d["reads"].append(0)
100 101 102
        
        
    ### 
103 104
    def __iadd__(self, other):
        ### Not used now
105
        """Add other.reads to self.reads in-place, without extending lists"""
106

107 108
        assert(len(self.d['reads']) == len(other.d['reads']))
        self.d['reads'] = [my + her for (my, her) in zip(self.d['reads'], other.d['reads'])]
109 110 111

        return self
        
112
    def __add__(self, other):
113
        """Concat two windows, extending lists such as 'reads'"""
114
        #data we don't need to duplicate
115
        myList = [ "seg", "top", "id", "sequence", "name", "id", "stats", "germline"]
116 117
        obj = Window(1)
        
118 119 120 121 122
        # 'id' and 'top' will be taken from 'topmost' clone
        del obj.d["id"]
        del obj.d["top"]

        # Data of type 'list'
123 124 125 126
        concatenate_with_padding(obj.d,
                                 self.d, len(self.d["reads"]),
                                 other.d, len(other.d["reads"]),
                                 myList)
127
                    
128 129 130 131 132 133 134 135 136
        # All other data, including 'top'
        # When there are conflicting keys, keep data from the 'topmost' clone
        order = [other, self] if other.d["top"] < self.d["top"] else [self, other]

        for source in order:
            for key in source.d:
                if key not in obj.d:
                    obj.d[key] = source.d[key]

137 138
        return obj
        
139 140
    def get_nb_reads(self, cid, point=0):
        return self[cid]["reads"][point]
141

142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
    def get_reads(self, t=-1):
        if t== -1:
            return self.d["reads"]
        else:
            return self.d["reads"][t]

    def is_segmented(self):
        if not "seg" in self.d.keys():
            return False
        return True


    def get_axes_values(self, axes, timepoint):
        """Return the values of given axes for this clone at a gvien timepoint"""
        values = []
        for axis in axes:
            values.append(self.get_values(axis, timepoint))
        return values


    def get_values(self, axis, timepoint=0):
        """Return the value of an axis for this clone at a given timepoint"""

        axes = {
            "top":      ["top"],
            "germline": ["germline"],
            "name":     ["name"],
            "seg5":     ["seg",  "5", "name"],
            "seg4":     ["seg",  "4", "name"],
            "seg4a":    ["seg", "4a", "name"],
            "seg4b":    ["seg", "4b", "name"],
            "seg3":     ["seg",  "3", "name"],
            "evalue":   ["evalue","val"],
            "seg5_delRight": ["seg","5","delRight"],
            "seg3_delLeft":  ["seg","3","delLeft"],
            "seg4_delRight": ["seg","4","delRight"],
            "seg3_delLeft":  ["seg","4","delLeft"],
            ### Positions
            "seg5_stop":  ["seg","5","stop"],
            "seg3_start": ["seg","3","start"],
            "seg4_stop":  ["seg","4","stop"],
            "seg4_start": ["seg","4","start"],
            "cdr3_stop":  ["seg","cdr3","stop"],
            "cdr3_start": ["seg","cdr3","start"],
            "junction_stop":  ["seg","junction","stop"],
            "junction_start": ["seg","junction","start"],
            "productive":     ["seg","junction","productive"],
            "lenSeqAverage" : ["_average_read_length"]
        }

        ### length
        axes_length = {
            "insert_53": ["seg3_start", "seg5_stop", -1],
            "insert_54": ["seg4_start", "seg5_stop", -1],
            "insert_43": ["seg3_start", "seg4_stop", -1],
            "lenCDR3": ["cdr3_stop", "cdr3_start", 0],
            "lenJunction": ["junction_stop", "junction_start", 0]
        }

        try:
            if axis in axes.keys():
                path  = axes[axis]
                depth = 0
                value = self.d
                if path[0] == "seg" and not self.is_segmented():
                    return "?"

                while depth != len(path):
                    value  = value[path[depth]]
                    depth += 1
                if type(value) is list:
                    # In these cases, should be a list of value at different timepoint
                    return value[timepoint]
                return value

            return "unknow_axis"
        except:
            return "?"

221
    def latex(self, point=0, base_germline=10000, base=10000, tag=''):
222
        reads = self.d["reads"][point]
223
        ratio_germline = float(reads)/base_germline
224
        ratio = float(reads)/base
225
        return r"   &   & %7d & %5.2f\%% & of %-4s & %5.2f\%% & %-50s \\ %% %4s %s" % (reads, ratio_germline * 100, self.d['germline'], ratio * 100,
226 227
                                                                  self.d["name"] if 'name' in self.d else self.d["id"],
                                                                  tag, self.d["id"])
228

229 230
    ### print essential info about Window
    def __str__(self):
Mathieu Giraud's avatar
Mathieu Giraud committed
231
        return "<window : %s %s %s>" % ( self.d["reads"], '*' if self.d["top"] == sys.maxsize else self.d["top"], self.d["id"])
232
        
Marc Duez's avatar
Marc Duez committed
233
class Samples: 
234

Marc Duez's avatar
Marc Duez committed
235 236
    def __init__(self):
        self.d={}
237
        self.d["number"] = 1
Marc Duez's avatar
Marc Duez committed
238 239 240 241
        self.d["original_names"] = [""]

    def __add__(self, other):
        obj=Samples()
242

243 244 245 246 247
        concatenate_with_padding(obj.d, 
                                 self.d, self.d['number'], 
                                 other.d, other.d['number'],
                                 ['number'])

248
        obj.d["number"] =  int(self.d["number"]) + int(other.d["number"])
Marc Duez's avatar
Marc Duez committed
249 250
        
        return obj
251

252 253 254
    def __str__(self):
        return "<Samples: %s>" % self.d

255 256
class Diversity: 

257 258
    keys = ["index_H_entropy", "index_E_equitability", "index_Ds_diversity"]

259 260 261
    def __init__(self, data=None):
        self.d={}

262 263 264 265 266
        for k in self.keys:
            if data == None or not (k in data):
                self.d[k] = ["na"]
            else:
                self.d[k]= [data[k]]
267

268 269 270
    def __add__(self, other):
        for k in self.keys:
            self.d[k].append(other.d[k][0])
271 272 273 274 275
        return self

    def __str__(self):
        return "<Diversity: %s>" % self.d
        
276 277 278 279 280 281 282
class Reads: 

    def __init__(self):
        self.d={}
        self.d["total"] = ["0"]
        self.d["segmented"] = ["0"]
        self.d["germline"] = {}
283
        self.d['distribution'] = {}
284 285 286

    def __add__(self, other):
        obj=Reads()
287 288 289 290 291

        concatenate_with_padding(obj.d['germline'], 
                                 self.d['germline'], len(self.d['total']),
                                 other.d['germline'], len(other.d['total']),
                                 ['total'])
292 293 294 295
        concatenate_with_padding(obj.d['distribution'],
                                 self.d['distribution'], len(self.d['total']),
                                 other.d['distribution'], len(other.d['total']),
                                 ['total'])
296 297 298 299
                
        obj.d["total"] = self.d["total"] + other.d["total"]
        obj.d["segmented"] = self.d["segmented"] + other.d["segmented"]
        return obj
300 301 302

    def __str__(self):
        return "<Reads: %s>" % self.d
303
        
304 305 306 307
class OtherWindows:
    
    """Aggregate counts of windows that are discarded (due to too small 'top') for each point into several 'others-' windows."""

308 309
    def __init__(self, length, ranges = None):
        self.ranges = ranges if ranges is not None else [1000, 100, 10, 1]
310 311 312 313 314 315 316 317 318 319
        self.length = length
        self.sizes = {}

        for r in self.ranges:
            self.sizes[r] = [0 for i in range(self.length)]


    def __iadd__(self, window):
        """Add to self a window. The different points may land into different 'others-' windows."""

320
        for i, s in enumerate(window.d["reads"]):
321 322 323 324 325
            for r in self.ranges:
                if s >= r:
                     break
            self.sizes[r][i] += s
            ## TODO: add seg_stat
326 327
            
        # print window, '-->', self.sizes
328 329 330 331 332 333 334
        return self

    def __iter__(self):
        """Yield the 'others-' windows"""

        for r in self.ranges:
            w = Window(self.length)
335
            w.d['reads'] = self.sizes[r]
336 337 338
            w.d['window'] = 'others-%d' % r
            w.d['top'] = 0

Mathieu Giraud's avatar
Mathieu Giraud committed
339
            print('  --[others]-->', w)
340
            yield w
341
        
342
class ListWindows(VidjilJson):
343 344 345
    '''storage class for sequences informations 
    
    >>> lw1.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
346
    <ListWindows: [25] 2>
347 348 349 350
    <window : [5] 3 aaa>
    <window : [12] 2 bbb>
    
    >>> lw2.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
351
    <ListWindows: [34] 2>
352 353 354
    <window : [8] 4 aaa>
    <window : [2] 8 ccc>
    
Mathieu Giraud's avatar
Mathieu Giraud committed
355
    >>> lw3 = lw1 + lw2
356
    >>> lw3.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
357
    <ListWindows: [25, 34] 3>
358 359 360
    <window : [12, 0] 2 bbb>
    <window : [5, 8] 3 aaa>
    <window : [0, 2] 8 ccc>
361 362 363 364 365 366 367 368 369 370
    >>> lw3.build_stat()
    >>> lw3.d['reads'].d['distribution']['0.1']
    [17, 8]
    >>> lw3.d['reads'].d['distribution']['0.01']
    [0, 2]
    >>> lw3.d['reads'].d['distribution']['0.001']
    [0, 0]
    >>> lw3.d['reads'].d['distribution']['0.0001']
    [0, 0]

371 372 373 374 375
    '''
    
    def __init__(self):
        '''init ListWindows with the minimum data required'''
        self.d={}
376 377
        self.d["samples"] = Samples()
        self.d["reads"] = Reads()
378 379
        self.d["clones"] = []
        self.d["clusters"] = []
380
        self.d["germlines"] = {}
381
        
382 383 384 385
        self.d["vidjil_json_version"] = VIDJIL_JSON_VERSION
        self.d["producer"] = FUSE_VERSION
        self.d["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
386
    def __str__(self):
387 388 389 390 391 392 393 394 395 396 397 398 399 400
        return "<ListWindows: %s %d>" % ( self.d["reads"].d["segmented"], len(self) )

    # Iterator and access functions
    def __iter__(self):
        return self.d["clones"].__iter__()

    def __getitem__(self, i):
        ### not efficient !
        for clone in self:
            if clone.d["id"] == i:
                return clone

    def __len__(self):
        return len(self.d["clones"])
401 402 403

    ### print info about each Windows stored 
    def info(self):
Mathieu Giraud's avatar
Mathieu Giraud committed
404
        print(self)
405
        for clone in self:
Mathieu Giraud's avatar
Mathieu Giraud committed
406
            print(clone)
407
        
408 409
    ### compute statistics about clones
    def build_stat(self):
410
        ranges = [.1, .01, .001, .0001, .00001, .000001, .0000001]
411
        result = [[0 for col in range(len(self.d['reads'].d["segmented"]))] for row in range(len(ranges))]
412

413 414
        for clone in self:
            for i, s in enumerate(clone.d["reads"]):
415 416 417 418 419 420
                if self.d['reads'].d['segmented'][i] > 0:
                    for r in range(len(ranges)):
                        if s*1. / self.d['reads'].d['segmented'][i] >= ranges[r]:
                            break
                    result[r][i] += s

421
        for r in range(len(ranges)):
422 423
            ratio_in_string = '{0:.10f}'.format(ranges[r]).rstrip('0')
            self.d['reads'].d['distribution'][ratio_in_string] = result[r]
424 425 426 427
            
        #TODO V/D/J distrib and more 
        
        
428 429 430
    ### save / load to .json
    def save_json(self, output):
        '''save ListWindows in .json format''' 
Mathieu Giraud's avatar
Mathieu Giraud committed
431
        print("==>", output)
432 433
        with open(output, "w") as f:
            json.dump(self, f, indent=2, default=self.toJson)
434

435
    def load(self, file_path, *args, **kwargs):
436 437 438 439
        if not '.clntab' in file_path:
            self.load_vidjil(file_path, *args, **kwargs)
        else:
            self.load_clntab(file_path, *args, **kwargs)
440 441 442 443 444 445

    def loads(self, string, *args, **kwargs):
        self.loads_vidjil(string, *args, **kwargs)

    def init_data(self, data):
        self.d=data.d
446 447 448
        # Be robust against 'null' values for clones
        if not self.d["clones"]:
            self.d["clones"] = []
449

450 451 452 453
        if "diversity" in self.d.keys():
            self.d["diversity"] = Diversity(self.d["diversity"])
        else:
            self.d["diversity"] = Diversity()
454
        
455 456
        if 'distribution' not in self.d['reads'].d:
            self.d['reads'].d['distribution'] = {}
457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480

        self.id_lengths = defaultdict(int)

        print("%%")
        for clone in self:
            self.id_lengths[len(clone.d['id'])] += 1
        print ("%% lengths .vidjil -> ", self.id_lengths)
        try:
            print("%% run_v ->", self.d["samples"].d["producer"], self.d["samples"].d["run_timestamp"])
        except KeyError:
            pass

    def load_vidjil(self, file_path, pipeline, verbose=True):
        '''init listWindows with data file
        Detects and selects the parser according to the file extension.'''
        extension = file_path.split('.')[-1]

        if verbose:
            print("<==", file_path, "\t", end=' ')

        with open(file_path, "r") as f:
            self.init_data(json.load(f, object_hook=self.toPython))
            self.check_version(file_path)

481 482 483
        if pipeline: 
            # renaming, private pipeline
            f = '/'.join(file_path.split('/')[2:-1])
484
            if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
485
                print("[%s]" % f)
486 487 488

        else:
            f = file_path
489
            if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
490
                print()
491

Marc Duez's avatar
Marc Duez committed
492
        time = os.path.getmtime(file_path)
493
        self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")]
494

495 496 497
    def loads_vidjil(self, string, pipeline, verbose=True):
        '''init listWindows with a json string'''
        self.init_data(json.loads(string, object_hook=self.toPython))
498

Marc Duez's avatar
Marc Duez committed
499 500 501
    def getTop(self, top):
        result = []
        
502 503 504
        for clone in self:
            if clone.d["top"] <= top :
                result.append(clone.d["id"])
Marc Duez's avatar
Marc Duez committed
505 506 507 508 509
        return result
        
    def filter(self, f):
        r = []
        
Marc Duez's avatar
Marc Duez committed
510
        reverseList = {}
511 512
        for i,w in enumerate(self.d["clones"]) :
            reverseList[w.d["id"]] = i
Marc Duez's avatar
Marc Duez committed
513 514 515
        
        #filteredList = { k: reverseList[k] for k in f }
        filteredList = dict(filter(lambda t: t[0] in f, reverseList.items()))
Marc Duez's avatar
Marc Duez committed
516
        
Marc Duez's avatar
Marc Duez committed
517
        for i in filteredList:
518
            r.append(self.d["clones"][filteredList[i]])
Marc Duez's avatar
Marc Duez committed
519
        
520
        self.d["clones"] = r
Marc Duez's avatar
Marc Duez committed
521
        
522 523 524 525
    ### 
    def __add__(self, other): 
        '''Combine two ListWindows into a unique ListWindows'''
        obj = ListWindows()
526 527
        l1 = len(self.d["reads"].d['segmented'])
        l2 = len(other.d["reads"].d['segmented'])
528

529 530 531
        concatenate_with_padding(obj.d, 
                                 self.d, l1,
                                 other.d, l2,
532 533
                                 ["clones", "links", "germlines",
                                  "vidjil_json_version"])
534
        
535
        obj.d["clones"]=self.fuseWindows(self.d["clones"], other.d["clones"], l1, l2)
536 537
        obj.d["samples"] = self.d["samples"] + other.d["samples"]
        obj.d["reads"] = self.d["reads"] + other.d["reads"]
538
        obj.d["diversity"] = self.d["diversity"] + other.d["diversity"]
539
        
540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560
        try:
            ### Verify that same file is not present twice
            filename_jlist1 = list(self.d["distributions"]["repertoires"].keys())
            filename_jlist2 = list(other.d["distributions"]["repertoires"].keys())
            for filename in filename_jlist1:
                if filename in filename_jlist2:
                    raise( "Error, duplicate file name (in distributions) ")

            ### Append distributions of each files
            obj.d["distributions"] = {}
            obj.d["distributions"]["repertoires"] = {}
            obj.d["distributions"]["keys"]        = ["clones", "reads"]
            obj.d["distributions"]["filters"]     = {}
            obj.d["distributions"]["categories"]  = {}
            for filename in filename_jlist1:
                obj.d["distributions"]["repertoires"][filename] = self.d["distributions"]["repertoires"][filename]
            for filename in filename_jlist2:
                obj.d["distributions"]["repertoires"][filename] = other.d["distributions"]["repertoires"][filename]
        except:
            pass

561 562
        return obj
        
563 564 565 566 567 568
    ###
    def __mul__(self, other):
        
        for i in range(len(self.d["reads_segmented"])):
            self.d["reads_segmented"][i] += other.d["reads_segmented"][i] 
        
569
        self.d["clones"] += other.d["clones"]
570 571 572 573 574 575
        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        
        self.d["system_segmented"].update(other.d["system_segmented"])
        
        return self
        
576
    ### 
577
    def fuseWindows(self, w1, w2, l1, l2) :
578
        #store data in dict with "id" as key
579 580
        dico1 = {}
        for i in range(len(w1)) :
581
            dico1[ w1[i].d["id"] ] = w1[i]
582 583 584

        dico2 = {}
        for i in range(len(w2)) :
585
            dico2[ w2[i].d["id"] ] = w2[i]
586

587
        #concat data with the same key ("id")
588 589 590 591 592 593
        #if some data are missing we concat with an empty Windows()
        dico3 = {}
        for key in dico1 :
            if key in dico2 :
                dico3[key] = dico1[key] + dico2[key]
            else :
594
                w=Window(l2)
595 596 597 598
                dico3[key] = dico1[key] + w

        for key in dico2 :
            if key not in dico1 :
599
                w=Window(l1)
600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616
                dico3[key] = w + dico2[key]
        
        
        #sort by top
        tab = []
        for key in dico3 :
            tab.append((dico3[key], dico3[key].d["top"]))
        tab = sorted(tab, key=itemgetter(1))
        
        #store back in a List
        result=[]
        for i in range(len(tab)) :
            result.append(tab[i][0])
        
        return result
        
    
617 618
    def cut(self, limit, nb_points):
        '''Remove information from sequence/windows who never enter in the most represented sequences. Put this information in 'other' windows.'''
619 620

        w=[]
621 622 623

        others = OtherWindows(nb_points)

624 625 626
        for clone in self:
            if (int(clone.d["top"]) < limit or limit == 0) :
                w.append(clone)
627
            #else:
628
                #others += clone
629

630
        self.d["clones"] = w #+ list(others) 
631

Mathieu Giraud's avatar
Mathieu Giraud committed
632
        print("### Cut merged file, keeping window in the top %d for at least one point" % limit)
633 634
        return self
        
635
    def load_clntab(self, file_path, *args, **kwargs):
636 637 638
        '''Parser for .clntab file'''

        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
639
        self.d["samples"].d["original_names"] = [file_path]
640
        self.d["samples"].d["producer"] = ["EC-NGS central pipeline"]
641 642 643 644 645 646 647 648 649 650 651 652 653 654 655
        
        listw = []
        listc = []
        total_size = 0
        
        fichier = open(file_path,"r")
        for ligne in fichier:
            if "clonotype" in ligne:
                header_map = ligne.replace('\n', '').split('\t')
            else :
                tab = AccessedDict()
                for index, data in enumerate(ligne.split('\t')):
                    tab[header_map[index]] = data
                        
                w=Window(1)
656 657
                w.d["seg"] = {}
                
658
                #w.window=tab["sequence.seq id"] #use sequence id as window for .clntab data
659
                w.d["id"]=tab["sequence.raw nt seq"] #use sequence as window for .clntab data
660 661
                s = int(tab["sequence.size"])
                total_size += s
662
                w.d["reads"] = [ s ]
663

664 665
                w.d["germline"] = tab["sequence.V-GENE and allele"][:3] #system ...
                
666
                w.d["sequence"] = tab["sequence.raw nt seq"]
667
                w.d["seg"]["5"]=tab["sequence.V-GENE and allele"].split('=')[0]
668
                if (tab["sequence.D-GENE and allele"] != "") :
669 670
                    w.d["seg"]["4"]=tab["sequence.D-GENE and allele"].split('=')[0]
                w.d["seg"]["3"]=tab["sequence.J-GENE and allele"].split('=')[0]
671 672 673 674

                # use sequence.JUNCTION to colorize output (this is not exactly V/J !)
                junction = tab.get("sequence.JUNCTION.raw nt seq")
                position = w.d["sequence"].find(junction)
675
                
676
                if position >= 0:
677 678
                    w.d["seg"]["3start"] = position + len(junction)
                    w.d["seg"]["5end"] = position 
679
                else:
680 681 682 683 684 685
                    w.d["seg"]["3start"] = 0
                    w.d["seg"]["5end"] = len(w.d["sequence"])
                w.d["seg"]["3end"]=0
                w.d["seg"]["5start"]=0
                w.d["seg"]["4end"]=0
                w.d["seg"]["4start"]=0
686
                w.d["seg"]["cdr3"] = tab["sequence.JUNCTION.raw nt seq"][3:-3]
687
                    
688
                listw.append((w , w.d["reads"][0]))
689 690

                raw_clonotype = tab.get("clonotype")
691 692
                w.d["name"]=raw_clonotype # w.d["seg"]["5"] + " -x/y/-z " + w.d["seg"]["3"]

693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708
                clonotype = raw_clonotype.split(' ')
                if (len(clonotype) > 1) :
                    listc.append((w, raw_clonotype))
                
                #keep data that has not already been stored
                for header in tab.not_accessed_keys():
                    w.d["_"+header] = [tab[header]]

        #sort by sequence_size
        listw = sorted(listw, key=itemgetter(1), reverse=True)
        #sort by clonotype
        listc = sorted(listc, key=itemgetter(1))
        
        #generate data "top"
        for index in range(len(listw)):
            listw[index][0].d["top"]=index+1
709
            self.d["clones"].append(listw[index][0])
710
        
711 712
        self.d["reads"].d["segmented"] = [total_size]
        self.d["reads"].d["total"] = [total_size]
713 714 715 716

        
    def toJson(self, obj):
        '''Serializer for json module'''
717
        if isinstance(obj, ListWindows) or isinstance(obj, Window) or isinstance(obj, Samples) or isinstance(obj, Reads) or isinstance(obj, Diversity):
718 719 720
            result = {}
            for key in obj.d :
                result[key]= obj.d[key]
Marc Duez's avatar
Marc Duez committed
721
                
722 723
            return result
            raise TypeError(repr(obj) + " fail !") 
724
        
Marc Duez's avatar
Marc Duez committed
725
        else:
726 727 728 729 730 731
            result = {}
            for key in obj :
                result[key]= obj[key]
            
            return result
            raise TypeError(repr(obj) + " fail !") 
732 733 734

    def toPython(self, obj_dict):
        '''Reverse serializer for json module'''
735
        if "samples" in obj_dict:
736
            obj = ListWindows()
737
            obj.d=obj_dict
738 739
            return obj

740
        if "id" in obj_dict:
741
            obj = Window(1)
742
            obj.d=obj_dict
743
            return obj
Marc Duez's avatar
Marc Duez committed
744
        
745 746
        if "total" in obj_dict:
            obj = Reads()
747
            obj.d=obj_dict
748 749
            return obj
            
Marc Duez's avatar
Marc Duez committed
750 751
        if "original_names" in obj_dict:
            obj = Samples()
752
            obj.d=obj_dict
753
            return obj
Marc Duez's avatar
Marc Duez committed
754
            
755
        return obj_dict
756
        
757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818
    def get_filename_pos(self, filename):
        """ filename is the key of distributions repertoires """
        return self.d["samples"].d["original_names"].index(filename)


    # ========================= #
    #  Distributions computing  #
    # ========================= #
    def init_distrib(self, list_distrib):
        """ Create distributions structures """
        self.d["distributions"] = {}
        self.d["distributions"]["repertoires"] = defaultdict(lambda:[])
        self.d["distributions"]["keys"]        = ["clones", "reads"]
        self.d["distributions"]["filters"]     = []
        # Warning, will fail of there are same original_names; fixme

        nb_sample = self.d["samples"].d["number"]
        # prefill distribution for each file of jlist
        for filename in self.d["samples"].d["original_names"]:
            for distrib in list_distrib:
                self.d["distributions"]["repertoires"][filename].append({"axes":distrib,"values":defaultdict(lambda: False)})
        return


    def compute_distribution(self, list_distrib):
        """ list_distrib is a list of distrib to compute """ 
        for filename in self.d["samples"].d["original_names"]:
            timepoint = self.get_filename_pos(filename)
            for clone in self.d["clones"]:
                if clone.d["reads"][timepoint] != 0:
                    for distrib_pos in range( len(self.d["distributions"]["repertoires"][filename]) ):
                        distrib      = list_distrib[distrib_pos]
                        clone_values = clone.get_axes_values( distrib, timepoint )
                        nb_reads     = clone.get_reads(self.get_filename_pos(filename))
                        self.add_clone_values( filename, distrib_pos, clone_values, nb_reads)
        return


    def add_clone_values(self, filename, distrib_pos, values, nb_reads):
        """ Add value of a clone to a specific distribution """
        obj = self.d["distributions"]["repertoires"][filename][distrib_pos]["values"]
        obj = self.recursive_add(obj, values, nb_reads) ## Add by recursion
        self.d["distributions"]["repertoires"][filename][distrib_pos]["values"] = obj
        return

    def recursive_add( self, obj, values, nb_reads):
        """ Add by recusivity the newer value to existing value; return the obj """
        if len(values) > 1:
            # Should increase for one depth
            nvalues = values[1:]
            obj[values[0]] = self.recursive_add(obj[values[0]], nvalues, nb_reads)
        else:
            # last depth
            if not obj:
                obj = defaultdict(lambda: False)
            if not obj[values[0]]:
                obj[values[0]] = [0, 0]

            obj[values[0]][0] += 1
            obj[values[0]][1] += nb_reads
        return obj

819 820 821 822


### some data used for test
w1 = Window(1)
823
w1.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
824
w2 = Window(1)
825
w2.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
826
w3 = Window(1)
827
w3.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
828
w4 = Window(1)
829
w4.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
830 831 832


w5 = Window(1)
833
w5.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
834
w6 = Window(1)
835
w6.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
836

837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865
seg_w7 = {
  "germline": "TRA",
  "id": "TTCTTACTTCTGTGCTACGGACGCCGGGGCTCAGGAACCTACAAATACAT",
  "name": "TRAV17*01 0/CCGGGG/5 TRAJ40*01",
  "reads": [
    16
  ],
  "seg": {
    "3": {
      "delLeft": 5,
      "name": "TRAJ40*01",
      "start": 112
    },
    "5": {
      "delRight": 0,
      "name": "TRAV17*01",
      "stop": 105
    },
    "N": 6,
    "cdr3": {
      "aa": "ATDAG#SGTYKYI",
      "start": 96,
      "stop": 133
    }
  },
  "sequence": "GTGGAAGATTAAGAGTCACGCTTGACACTTCCAAGAAAAGCAGTTCCTTGTTGATCACGGCTTCCCGGGCAGCAGACACTGCTTCTTACTTCTGTGCTACGGACGCCGGGGCTCAGGAACCTACAAATACATCTTTGGAACAG",
  "top": 26
}

866
lw1 = ListWindows()
867
lw1.d["timestamp"] = 'ts'
868
lw1.d["reads"] = json.loads('{"total": [30], "segmented": [25], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
869 870
lw1.d["clones"].append(w5)
lw1.d["clones"].append(w6)
871 872
lw1.d["diversity"] = Diversity()

873 874

w7 = Window(1)
875
w7.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
876
w8 = Window(1)
877
w8.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
878 879

lw2 = ListWindows()
880
lw2.d["timestamp"] = 'ts'
881
lw2.d["reads"] = json.loads('{"total": [40], "segmented": [34], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
882 883
lw2.d["clones"].append(w7)
lw2.d["clones"].append(w8)
884
lw2.d["diversity"] = Diversity()
885 886

    
887 888 889 890 891
def exec_command(command, directory, input_file):
    '''
    Execute the command `command` from the directory
    `directory`. The executable must exist in
    this directory. No path changes are allowed in `command`.
892
    
893 894 895 896 897 898 899 900 901 902 903 904 905
    Returns the output filename (a .vidjil). 
    '''
    assert (not os.path.sep in command), "No {} allowed in the command name".format(os.path.sep)
    ff = tempfile.NamedTemporaryFile(suffix='.vidjil', delete=False)

    basedir = os.path.dirname(os.path.abspath(sys.argv[0]))
    command_fullpath = basedir+os.path.sep+directory+os.path.sep+command
    com = '%s %s %s' % (quote(command_fullpath), quote(os.path.abspath(input_file)), ff.name)
    print(com)
    os.system(com)
    print()
    return ff.name

906 907 908

 
def main():
Mathieu Giraud's avatar
Mathieu Giraud committed
909
    print("#", ' '.join(sys.argv))
910 911 912 913 914 915 916

    DESCRIPTION = 'Vidjil utility to parse and regroup list of clones of different timepoints or origins'
    
    #### Argument parser (argparse)

    parser = argparse.ArgumentParser(description= DESCRIPTION,
                                    epilog='''Example:
Mathieu Giraud's avatar
Mathieu Giraud committed
917
  python2 %(prog)s --germline IGH out/Diag/vidjil.data out/MRD-1/vidjil.data out/MRD-2/vidjil.data''',
918 919 920 921 922 923
                                    formatter_class=argparse.RawTextHelpFormatter)


    group_options = parser.add_argument_group() # title='Options and parameters')

    group_options.add_argument('--test', action='store_true', help='run self-tests')
924
    group_options.add_argument('--multi', action='store_true', help='merge different systems from a same timepoint (deprecated, do not use)')
925
    group_options.add_argument('--distributions', '-d', action='store_true', help='compute distributions')
Mathieu Giraud's avatar
Mathieu Giraud committed
926
    group_options.add_argument('--no-clones', action='store_true', help='do not output individual clones')
927
    
928
    group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
929
    group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')
930
    group_options.add_argument('--ijson', action='store_true', help='use the ijson vidjilparser')
931

Marc Duez's avatar
Marc Duez committed
932
    group_options.add_argument('--output', '-o', type=str, default='fused.vidjil', help='output file (%(default)s)')
933 934
    group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)')

935
    group_options.add_argument('--first', '-f', type=int, default=0, help='take only into account the first FIRST files (0 for all) (%(default)s)')
936

Mathieu Giraud's avatar
Mathieu Giraud committed
937
    group_options.add_argument('--pre', type=str,help='pre-process program (launched on each input .vidjil file) (needs defs.PRE_PROCESS_DIR)')
938

939 940 941 942 943 944 945 946 947 948 949 950
    parser.add_argument('file', nargs='+', help='''input files (.vidjil/.cnltab)''')
  
    args = parser.parse_args()
    # print args

    if args.test:
        import doctest
        doctest.testmod(verbose = True)
        sys.exit(0)

    jlist_fused = None

Thonier Florian's avatar
Thonier Florian committed
951 952
    LIST_DISTRIBUTIONS   = []
    LIST_AXIS = ["germline",
953
        "seg5", "seg4", "seg3",
Mathieu Giraud's avatar
Mathieu Giraud committed
954 955
        "lenCDR3",
        "lenSeqConsensus", "lenSeqAverage",
956 957
        "seg5_delRight", "seg3_delLeft", "seg4_delRight", "seg3_delLeft",
        "insert_53", "insert_54", "insert_43",
Thonier Florian's avatar
Thonier Florian committed
958
        "productive", 
959
    ]
Thonier Florian's avatar
Thonier Florian committed
960 961 962 963 964 965

    for axis1 in LIST_AXIS:
        LIST_DISTRIBUTIONS.append([axis1])
        for axis2 in LIST_AXIS:
            if axis1 != axis2:
                LIST_DISTRIBUTIONS.append([axis1, axis2])
966
    
Mathieu Giraud's avatar
Mathieu Giraud committed
967 968
    print("### fuse.py -- " + DESCRIPTION)
    print()
969

970 971 972 973 974 975
    files = args.file
    if args.first:
        if len(files) > args.first:
            print("! %d files were given. We take into account the first %d files." % (len<