fuse.py 23.3 KB
Newer Older
1 2 3 4 5
#!/usr/bin/env python
# -*- coding: utf-8 -*-

### fuse.py - Vidjil utility to parse and regroup list of clones of different timepoints or origins

6 7 8 9 10 11 12 13
#  This file is part of Vidjil <http://www.vidjil.org>,
#  High-throughput Analysis of V(D)J Immune Repertoire.
#  Copyright (C) 2011, 2012, 2013, 2014, 2015 by Bonsai bioinformatics 
#  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
#  Contributors: 
#      Marc Duez <marc.duez@vidjil.org>
#      Mathieu Giraud <mathieu.giraud@vidjil.org>
#      The Vidjil Team <contact@vidjil.org>
14 15 16 17 18 19 20 21 22 23 24 25 26 27
#
#  "Vidjil" is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  "Vidjil" is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>

Mathieu Giraud's avatar
Mathieu Giraud committed
28
from __future__ import print_function
29 30 31 32 33 34 35 36

import sys

PY_REQUIRED = (2, 7)
if sys.version_info < PY_REQUIRED:
    print("This script requires Python >= %d.%d." % (PY_REQUIRED))
    sys.exit(1)

37 38 39 40
import json
import argparse
import time
import copy
41
import os.path
Marc Duez's avatar
Marc Duez committed
42
import datetime
43
from operator import itemgetter
44
from utils import *
45

46
VIDJIL_JSON_VERSION = "2014.10"
47
FUSE_VERSION = "vidjil fuse"
48

49 50
GERMLINES_ORDER = ['TRA', 'TRB', 'TRG', 'TRD', 'DD', 'IGH', 'DHJH', 'IJK', 'IJL'] 

51 52 53
####

class Window:
54
    # Should be renamed "Clone"
55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
    '''storage class for sequence informations
    with some function to group sequence informations
    
    >>> str(w1)
    '<window : [5] 3 aaa>'
    
    >>> str(w3)
    '<window : [8] 4 aaa>'
    
    >>> str(w1 + w3)
    '<window : [5, 8] 3 aaa>'
    
    
    check if other information are conserved
    
    >>> (w2 + w4).d["test"]
    [0, 'plop']
    
    '''
    
    ### init Window with the minimum data required
    def __init__(self, size):
        self.d={}
78
        self.d["id"] = ""
Mathieu Giraud's avatar
Mathieu Giraud committed
79
        self.d["top"] = sys.maxsize
80
        self.d["reads"] = []
81
        for i in range(size):
82
            self.d["reads"].append(0)
83 84 85
        
        
    ### 
86 87
    def __iadd__(self, other):
        ### Not used now
88
        """Add other.reads to self.reads in-place, without extending lists"""
89

90 91
        assert(len(self.d['reads']) == len(other.d['reads']))
        self.d['reads'] = [my + her for (my, her) in zip(self.d['reads'], other.d['reads'])]
92 93 94

        return self
        
95
    def __add__(self, other):
96
        """Concat two windows, extending lists such as 'reads'"""
97
        #data we don't need to duplicate
98
        myList = [ "seg", "top", "id", "sequence", "name", "id", "stats", "germline"]
99 100
        obj = Window(1)
        
101 102 103 104
        concatenate_with_padding(obj.d,
                                 self.d, len(self.d["reads"]),
                                 other.d, len(other.d["reads"]),
                                 myList)
105 106 107 108 109 110 111 112 113 114 115 116 117
                    
        #keep other data who don't need to be concat
        if other.d["top"] < self.d["top"] :
            for key in other.d :
                if key in myList :
                    obj.d[key] = other.d[key]
        else :
            for key in self.d :
                if key in myList :
                    obj.d[key] = self.d[key]
        
        return obj
        
118 119 120 121 122
    def latex(self, point=0, base=10000):
        reads = self.d["reads"][point]
        ratio = float(reads)/base
        return r"   &   & %7d & %5.2f%% & %-50s \\ %% %s" % (reads, ratio * 100, 
                                                           self.d["name"] if 'name' in self.d else self.d["id"], self.d["id"])
123

124 125
    ### print essential info about Window
    def __str__(self):
Mathieu Giraud's avatar
Mathieu Giraud committed
126
        return "<window : %s %s %s>" % ( self.d["reads"], '*' if self.d["top"] == sys.maxsize else self.d["top"], self.d["id"])
127
        
Marc Duez's avatar
Marc Duez committed
128
class Samples: 
129

Marc Duez's avatar
Marc Duez committed
130 131
    def __init__(self):
        self.d={}
132
        self.d["number"] = 1
Marc Duez's avatar
Marc Duez committed
133 134 135 136
        self.d["original_names"] = [""]

    def __add__(self, other):
        obj=Samples()
137

138 139 140 141 142
        concatenate_with_padding(obj.d, 
                                 self.d, self.d['number'], 
                                 other.d, other.d['number'],
                                 ['number'])

143
        obj.d["number"] =  int(self.d["number"]) + int(other.d["number"])
Marc Duez's avatar
Marc Duez committed
144 145
        
        return obj
146

147 148 149
    def __str__(self):
        return "<Samples: %s>" % self.d

150 151 152 153 154 155 156
class Reads: 

    def __init__(self):
        self.d={}
        self.d["total"] = ["0"]
        self.d["segmented"] = ["0"]
        self.d["germline"] = {}
157
        self.d['distribution'] = {}
158 159 160

    def __add__(self, other):
        obj=Reads()
161 162 163 164 165

        concatenate_with_padding(obj.d['germline'], 
                                 self.d['germline'], len(self.d['total']),
                                 other.d['germline'], len(other.d['total']),
                                 ['total'])
166 167 168 169
        concatenate_with_padding(obj.d['distribution'],
                                 self.d['distribution'], len(self.d['total']),
                                 other.d['distribution'], len(other.d['total']),
                                 ['total'])
170 171 172 173
                
        obj.d["total"] = self.d["total"] + other.d["total"]
        obj.d["segmented"] = self.d["segmented"] + other.d["segmented"]
        return obj
174 175 176

    def __str__(self):
        return "<Reads: %s>" % self.d
177
        
178 179 180 181
class OtherWindows:
    
    """Aggregate counts of windows that are discarded (due to too small 'top') for each point into several 'others-' windows."""

182 183
    def __init__(self, length, ranges = None):
        self.ranges = ranges if ranges is not None else [1000, 100, 10, 1]
184 185 186 187 188 189 190 191 192 193
        self.length = length
        self.sizes = {}

        for r in self.ranges:
            self.sizes[r] = [0 for i in range(self.length)]


    def __iadd__(self, window):
        """Add to self a window. The different points may land into different 'others-' windows."""

194
        for i, s in enumerate(window.d["reads"]):
195 196 197 198 199
            for r in self.ranges:
                if s >= r:
                     break
            self.sizes[r][i] += s
            ## TODO: add seg_stat
200 201
            
        # print window, '-->', self.sizes
202 203 204 205 206 207 208
        return self

    def __iter__(self):
        """Yield the 'others-' windows"""

        for r in self.ranges:
            w = Window(self.length)
209
            w.d['reads'] = self.sizes[r]
210 211 212
            w.d['window'] = 'others-%d' % r
            w.d['top'] = 0

Mathieu Giraud's avatar
Mathieu Giraud committed
213
            print('  --[others]-->', w)
214
            yield w
215 216 217 218 219
        
class ListWindows:
    '''storage class for sequences informations 
    
    >>> lw1.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
220
    <ListWindows: [25] 2>
221 222 223 224
    <window : [5] 3 aaa>
    <window : [12] 2 bbb>
    
    >>> lw2.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
225
    <ListWindows: [34] 2>
226 227 228
    <window : [8] 4 aaa>
    <window : [2] 8 ccc>
    
Mathieu Giraud's avatar
Mathieu Giraud committed
229
    >>> lw3 = lw1 + lw2
230
    >>> lw3.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
231
    <ListWindows: [25, 34] 3>
232 233 234
    <window : [12, 0] 2 bbb>
    <window : [5, 8] 3 aaa>
    <window : [0, 2] 8 ccc>
235 236 237 238 239 240 241 242 243 244
    >>> lw3.build_stat()
    >>> lw3.d['reads'].d['distribution']['0.1']
    [17, 8]
    >>> lw3.d['reads'].d['distribution']['0.01']
    [0, 2]
    >>> lw3.d['reads'].d['distribution']['0.001']
    [0, 0]
    >>> lw3.d['reads'].d['distribution']['0.0001']
    [0, 0]

245 246 247 248 249
    '''
    
    def __init__(self):
        '''init ListWindows with the minimum data required'''
        self.d={}
250 251
        self.d["samples"] = Samples()
        self.d["reads"] = Reads()
252 253
        self.d["clones"] = []
        self.d["clusters"] = []
254
        self.d["germlines"] = {}
255
        
256 257 258 259
        self.d["vidjil_json_version"] = VIDJIL_JSON_VERSION
        self.d["producer"] = FUSE_VERSION
        self.d["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
260
    def __str__(self):
261 262 263 264 265 266 267 268 269 270 271 272 273 274
        return "<ListWindows: %s %d>" % ( self.d["reads"].d["segmented"], len(self) )

    # Iterator and access functions
    def __iter__(self):
        return self.d["clones"].__iter__()

    def __getitem__(self, i):
        ### not efficient !
        for clone in self:
            if clone.d["id"] == i:
                return clone

    def __len__(self):
        return len(self.d["clones"])
275 276 277

    ### print info about each Windows stored 
    def info(self):
Mathieu Giraud's avatar
Mathieu Giraud committed
278
        print(self)
279
        for clone in self:
Mathieu Giraud's avatar
Mathieu Giraud committed
280
            print(clone)
281 282 283 284

    ### check vidjil_json_version
    def check_version(self, filepath):
        if "vidjil_json_version" in self.d:
285
            if self.d["vidjil_json_version"] <= VIDJIL_JSON_VERSION:
286 287 288
                return
        raise IOError ("File '%s' is too old -- please regenerate it with a newer version of Vidjil" % filepath)
        
289 290
    ### compute statistics about clones
    def build_stat(self):
291
        ranges = [.1, .01, .001, .0001, .00001, .000001, .0000001]
292
        result = [[0 for col in range(len(self.d['reads'].d["segmented"]))] for row in range(len(ranges))]
293

294 295
        for clone in self:
            for i, s in enumerate(clone.d["reads"]):
296
                for r in range(len(ranges)):
297
                    if s*1. / self.d['reads'].d['segmented'][i] >= ranges[r]:
298 299 300 301
                        break 
                result[r][i] += s
                
        for r in range(len(ranges)):
302 303
            ratio_in_string = '{0:.10f}'.format(ranges[r]).rstrip('0')
            self.d['reads'].d['distribution'][ratio_in_string] = result[r]
304 305 306 307
            
        #TODO V/D/J distrib and more 
        
        
308 309 310
    ### save / load to .json
    def save_json(self, output):
        '''save ListWindows in .json format''' 
Mathieu Giraud's avatar
Mathieu Giraud committed
311
        print("==>", output)
312 313
        with open(output, "w") as file:
            json.dump(self, file, indent=2, default=self.toJson)
314 315 316 317 318 319 320 321

    def load(self, file_path, *args, **kwargs):
        if not '.clntab' in file_path:
            self.load_vidjil(file_path, *args, **kwargs)
        else:
            self.load_clntab(file_path, *args, **kwargs)
        
    def load_vidjil(self, file_path, pipeline, verbose=True):
322 323 324 325 326
        '''init listWindows with data file
        Detects and selects the parser according to the file extension.'''

        # name = file_path.split("/")[-1]
        extension = file_path.split('.')[-1]
327 328

        if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
329
            print("<==", file_path, "\t", end=' ')
330
        
331
        with open(file_path, "r") as file:
332 333 334 335
                tmp = json.load(file, object_hook=self.toPython)     
                self.d=tmp.d
                self.check_version(file_path)
        
336 337
        if 'distribution' not in self.d['reads'].d:
            self.d['reads'].d['distribution'] = {}
338 339 340
        if pipeline: 
            # renaming, private pipeline
            f = '/'.join(file_path.split('/')[2:-1])
341
            if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
342
                print("[%s]" % f)
343 344 345

        else:
            f = file_path
346
            if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
347
                print()
348
        
Marc Duez's avatar
Marc Duez committed
349
        time = os.path.getmtime(file_path)
350
        self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")]
351

Marc Duez's avatar
Marc Duez committed
352 353 354
    def getTop(self, top):
        result = []
        
355 356 357
        for clone in self:
            if clone.d["top"] <= top :
                result.append(clone.d["id"])
Marc Duez's avatar
Marc Duez committed
358 359 360 361 362
        return result
        
    def filter(self, f):
        r = []
        
Marc Duez's avatar
Marc Duez committed
363
        reverseList = {}
364 365
        for i,w in enumerate(self.d["clones"]) :
            reverseList[w.d["id"]] = i
Marc Duez's avatar
Marc Duez committed
366 367 368
        
        #filteredList = { k: reverseList[k] for k in f }
        filteredList = dict(filter(lambda t: t[0] in f, reverseList.items()))
Marc Duez's avatar
Marc Duez committed
369
        
Marc Duez's avatar
Marc Duez committed
370
        for i in filteredList:
371
            r.append(self.d["clones"][filteredList[i]])
Marc Duez's avatar
Marc Duez committed
372
        
373
        self.d["clones"] = r
Marc Duez's avatar
Marc Duez committed
374
        
375 376 377 378
    ### 
    def __add__(self, other): 
        '''Combine two ListWindows into a unique ListWindows'''
        obj = ListWindows()
379 380
        l1 = len(self.d["reads"].d['segmented'])
        l2 = len(other.d["reads"].d['segmented'])
381

382 383 384
        concatenate_with_padding(obj.d, 
                                 self.d, l1,
                                 other.d, l2,
385 386
                                 ["clones", "links", "germlines",
                                  "vidjil_json_version"])
387
        
388
        obj.d["clones"]=self.fuseWindows(self.d["clones"], other.d["clones"], l1, l2)
389 390
        obj.d["samples"] = self.d["samples"] + other.d["samples"]
        obj.d["reads"] = self.d["reads"] + other.d["reads"]
391
        
392 393
        return obj
        
394 395 396 397 398 399
    ###
    def __mul__(self, other):
        
        for i in range(len(self.d["reads_segmented"])):
            self.d["reads_segmented"][i] += other.d["reads_segmented"][i] 
        
400
        self.d["clones"] += other.d["clones"]
401 402 403 404 405 406
        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        
        self.d["system_segmented"].update(other.d["system_segmented"])
        
        return self
        
407
    ### 
408
    def fuseWindows(self, w1, w2, l1, l2) :
409
        #store data in dict with "id" as key
410 411
        dico1 = {}
        for i in range(len(w1)) :
412
            dico1[ w1[i].d["id"] ] = w1[i]
413 414 415

        dico2 = {}
        for i in range(len(w2)) :
416
            dico2[ w2[i].d["id"] ] = w2[i]
417

418
        #concat data with the same key ("id")
419 420 421 422 423 424
        #if some data are missing we concat with an empty Windows()
        dico3 = {}
        for key in dico1 :
            if key in dico2 :
                dico3[key] = dico1[key] + dico2[key]
            else :
425
                w=Window(l2)
426 427 428 429
                dico3[key] = dico1[key] + w

        for key in dico2 :
            if key not in dico1 :
430
                w=Window(l1)
431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447
                dico3[key] = w + dico2[key]
        
        
        #sort by top
        tab = []
        for key in dico3 :
            tab.append((dico3[key], dico3[key].d["top"]))
        tab = sorted(tab, key=itemgetter(1))
        
        #store back in a List
        result=[]
        for i in range(len(tab)) :
            result.append(tab[i][0])
        
        return result
        
    
448 449
    def cut(self, limit, nb_points):
        '''Remove information from sequence/windows who never enter in the most represented sequences. Put this information in 'other' windows.'''
450

451
        length = len(self)
452
        w=[]
453 454 455

        others = OtherWindows(nb_points)

456 457 458
        for clone in self:
            if (int(clone.d["top"]) < limit or limit == 0) :
                w.append(clone)
459
            #else:
460
                #others += clone
461

462
        self.d["clones"] = w #+ list(others) 
463

Mathieu Giraud's avatar
Mathieu Giraud committed
464
        print("### Cut merged file, keeping window in the top %d for at least one point" % limit)
465 466
        return self
        
467
    def load_clntab(self, file_path, *args, **kwargs):
468 469 470
        '''Parser for .clntab file'''

        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
471
        self.d["samples"].d["original_names"] = [file_path]
472
        self.d["samples"].d["producer"] = ["EC-NGS central pipeline"]
473 474 475 476 477 478 479 480 481 482 483 484 485 486 487
        
        listw = []
        listc = []
        total_size = 0
        
        fichier = open(file_path,"r")
        for ligne in fichier:
            if "clonotype" in ligne:
                header_map = ligne.replace('\n', '').split('\t')
            else :
                tab = AccessedDict()
                for index, data in enumerate(ligne.split('\t')):
                    tab[header_map[index]] = data
                        
                w=Window(1)
488 489
                w.d["seg"] = {}
                
490
                #w.window=tab["sequence.seq id"] #use sequence id as window for .clntab data
491
                w.d["id"]=tab["sequence.raw nt seq"] #use sequence as window for .clntab data
492 493
                s = int(tab["sequence.size"])
                total_size += s
494
                w.d["reads"] = [ s ]
495

496 497
                w.d["germline"] = tab["sequence.V-GENE and allele"][:3] #system ...
                
498
                w.d["sequence"] = tab["sequence.raw nt seq"]
499
                w.d["seg"]["5"]=tab["sequence.V-GENE and allele"].split('=')[0]
500
                if (tab["sequence.D-GENE and allele"] != "") :
501 502
                    w.d["seg"]["4"]=tab["sequence.D-GENE and allele"].split('=')[0]
                w.d["seg"]["3"]=tab["sequence.J-GENE and allele"].split('=')[0]
503 504 505 506

                # use sequence.JUNCTION to colorize output (this is not exactly V/J !)
                junction = tab.get("sequence.JUNCTION.raw nt seq")
                position = w.d["sequence"].find(junction)
507
                
508
                if position >= 0:
509 510
                    w.d["seg"]["3start"] = position + len(junction)
                    w.d["seg"]["5end"] = position 
511
                else:
512 513 514 515 516 517
                    w.d["seg"]["3start"] = 0
                    w.d["seg"]["5end"] = len(w.d["sequence"])
                w.d["seg"]["3end"]=0
                w.d["seg"]["5start"]=0
                w.d["seg"]["4end"]=0
                w.d["seg"]["4start"]=0
518
                w.d["seg"]["cdr3"] = tab["sequence.JUNCTION.raw nt seq"][3:-3]
519
                    
520
                listw.append((w , w.d["reads"][0]))
521 522

                raw_clonotype = tab.get("clonotype")
523 524
                w.d["name"]=raw_clonotype # w.d["seg"]["5"] + " -x/y/-z " + w.d["seg"]["3"]

525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540
                clonotype = raw_clonotype.split(' ')
                if (len(clonotype) > 1) :
                    listc.append((w, raw_clonotype))
                
                #keep data that has not already been stored
                for header in tab.not_accessed_keys():
                    w.d["_"+header] = [tab[header]]

        #sort by sequence_size
        listw = sorted(listw, key=itemgetter(1), reverse=True)
        #sort by clonotype
        listc = sorted(listc, key=itemgetter(1))
        
        #generate data "top"
        for index in range(len(listw)):
            listw[index][0].d["top"]=index+1
541
            self.d["clones"].append(listw[index][0])
542
        
543 544
        self.d["reads"].d["segmented"] = [total_size]
        self.d["reads"].d["total"] = [total_size]
545 546 547 548

        
    def toJson(self, obj):
        '''Serializer for json module'''
549
        if isinstance(obj, ListWindows) or isinstance(obj, Window) or isinstance(obj, Samples) or isinstance(obj, Reads):
550 551 552
            result = {}
            for key in obj.d :
                result[key]= obj.d[key]
Marc Duez's avatar
Marc Duez committed
553
                
554 555
            return result
            raise TypeError(repr(obj) + " fail !") 
556
        
Marc Duez's avatar
Marc Duez committed
557
        else:
558 559 560 561 562 563
            result = {}
            for key in obj :
                result[key]= obj[key]
            
            return result
            raise TypeError(repr(obj) + " fail !") 
564 565 566

    def toPython(self, obj_dict):
        '''Reverse serializer for json module'''
567
        if "samples" in obj_dict:
568
            obj = ListWindows()
569
            obj.d=obj_dict
570 571
            return obj

572
        if "id" in obj_dict:
573
            obj = Window(1)
574
            obj.d=obj_dict
575
            return obj
Marc Duez's avatar
Marc Duez committed
576
        
577 578
        if "total" in obj_dict:
            obj = Reads()
579
            obj.d=obj_dict
580 581
            return obj
            
Marc Duez's avatar
Marc Duez committed
582 583
        if "original_names" in obj_dict:
            obj = Samples()
584
            obj.d=obj_dict
585
            return obj
Marc Duez's avatar
Marc Duez committed
586
            
587
        return obj_dict
588 589 590 591 592 593
        



### some data used for test
w1 = Window(1)
594
w1.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
595
w2 = Window(1)
596
w2.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
597
w3 = Window(1)
598
w3.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
599
w4 = Window(1)
600
w4.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
601 602 603


w5 = Window(1)
604
w5.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
605
w6 = Window(1)
606
w6.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
607 608

lw1 = ListWindows()
609
lw1.d["timestamp"] = 'ts'
610
lw1.d["reads"] = json.loads('{"total": [30], "segmented": [25], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
611 612
lw1.d["clones"].append(w5)
lw1.d["clones"].append(w6)
613 614

w7 = Window(1)
615
w7.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
616
w8 = Window(1)
617
w8.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
618 619

lw2 = ListWindows()
620
lw2.d["timestamp"] = 'ts'
621
lw2.d["reads"] = json.loads('{"total": [40], "segmented": [34], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
622 623
lw2.d["clones"].append(w7)
lw2.d["clones"].append(w8)
624 625 626 627 628 629

    
    

 
def main():
Mathieu Giraud's avatar
Mathieu Giraud committed
630
    print("#", ' '.join(sys.argv))
631 632 633 634 635 636 637

    DESCRIPTION = 'Vidjil utility to parse and regroup list of clones of different timepoints or origins'
    
    #### Argument parser (argparse)

    parser = argparse.ArgumentParser(description= DESCRIPTION,
                                    epilog='''Example:
Mathieu Giraud's avatar
Mathieu Giraud committed
638
  python2 %(prog)s --germline IGH out/Diag/vidjil.data out/MRD-1/vidjil.data out/MRD-2/vidjil.data''',
639 640 641 642 643 644
                                    formatter_class=argparse.RawTextHelpFormatter)


    group_options = parser.add_argument_group() # title='Options and parameters')

    group_options.add_argument('--test', action='store_true', help='run self-tests')
645
    group_options.add_argument('--multi', action='store_true', help='merge different systems from a same timepoint (deprecated, do not use)')
646
    
647
    group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
648
    group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')
649

Marc Duez's avatar
Marc Duez committed
650
    group_options.add_argument('--output', '-o', type=str, default='fused.vidjil', help='output file (%(default)s)')
651 652 653 654 655 656 657 658 659 660 661 662 663 664
    group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)')

    parser.add_argument('file', nargs='+', help='''input files (.vidjil/.cnltab)''')
  
    args = parser.parse_args()
    # print args

    if args.test:
        import doctest
        doctest.testmod(verbose = True)
        sys.exit(0)

    jlist_fused = None

Mathieu Giraud's avatar
Mathieu Giraud committed
665 666
    print("### fuse.py -- " + DESCRIPTION)
    print()
667

Marc Duez's avatar
Marc Duez committed
668 669 670 671 672 673 674 675
    #filtre
    f = []
    for path_name in args.file:
        jlist = ListWindows()
        jlist.load(path_name, args.pipeline)
        f += jlist.getTop(args.top)
    f = sorted(set(f))
    
676
    if args.multi:
677 678
        for path_name in args.file:
            jlist = ListWindows()
679
            jlist.load(path_name, args.pipeline)
Marc Duez's avatar
Marc Duez committed
680
            jlist.build_stat()
681
            
Mathieu Giraud's avatar
Mathieu Giraud committed
682
            print("\t", jlist, end=' ')
683 684 685 686 687

            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused * jlist
688
                
Mathieu Giraud's avatar
Mathieu Giraud committed
689
            print('\t==> merge to', jlist_fused)
690
        jlist_fused.d["system_segmented"] = ordered(jlist_fused.d["system_segmented"], key=lambda sys: ((GERMLINES_ORDER + [sys]).index(sys), sys))
691
        
692
    else:
Mathieu Giraud's avatar
Mathieu Giraud committed
693
        print("### Read and merge input files")
694 695
        for path_name in args.file:
            jlist = ListWindows()
696
            jlist.load(path_name, args.pipeline)
Marc Duez's avatar
Marc Duez committed
697 698
            jlist.build_stat()
            jlist.filter(f)
Marc Duez's avatar
Marc Duez committed
699

700 701 702 703
            w1 = Window(1)
            w2 = Window(2)
            w3 = w1+w2
            
Mathieu Giraud's avatar
Mathieu Giraud committed
704
            print("\t", jlist, end=' ')
705 706 707 708 709
            # Merge lists
            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused + jlist
710
            
Mathieu Giraud's avatar
Mathieu Giraud committed
711
            print('\t==> merge to', jlist_fused)
712

713
    if args.compress:
Mathieu Giraud's avatar
Mathieu Giraud committed
714 715
        print()
        print("### Select point names")
716
        l = jlist_fused.d["samples"].d["original_names"]
717
        ll = interesting_substrings(l)
Mathieu Giraud's avatar
Mathieu Giraud committed
718 719
        print("  <==", l)
        print("  ==>", ll)
720
        jlist_fused.d["samples"].d["names"] = ll
721
    
Mathieu Giraud's avatar
Mathieu Giraud committed
722
    print()
723
    if not args.multi:
724
        jlist_fused.cut(args.top, jlist_fused.d["samples"].d["number"])
Mathieu Giraud's avatar
Mathieu Giraud committed
725 726
    print("\t", jlist_fused) 
    print()
727

Mathieu Giraud's avatar
Mathieu Giraud committed
728
    print("### Save merged file")
729 730 731 732 733 734
    jlist_fused.save_json(args.output)

    
    
if  __name__ =='__main__':
    main()