fuse.py 23.7 KB
Newer Older
1 2 3 4 5
#!/usr/bin/env python
# -*- coding: utf-8 -*-

### fuse.py - Vidjil utility to parse and regroup list of clones of different timepoints or origins

6 7 8 9 10 11 12 13
#  This file is part of Vidjil <http://www.vidjil.org>,
#  High-throughput Analysis of V(D)J Immune Repertoire.
#  Copyright (C) 2011, 2012, 2013, 2014, 2015 by Bonsai bioinformatics 
#  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
#  Contributors: 
#      Marc Duez <marc.duez@vidjil.org>
#      Mathieu Giraud <mathieu.giraud@vidjil.org>
#      The Vidjil Team <contact@vidjil.org>
14 15 16 17 18 19 20 21 22 23 24 25 26 27
#
#  "Vidjil" is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  "Vidjil" is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>

Mathieu Giraud's avatar
Mathieu Giraud committed
28
from __future__ import print_function
29

30
import check_python_version
31
import sys
32 33 34 35
import json
import argparse
import time
import copy
36
import os.path
37
import os
Marc Duez's avatar
Marc Duez committed
38
import datetime
39
import subprocess
40
import tempfile
41
from operator import itemgetter
42
from utils import *
43
from defs import *
44

45
FUSE_VERSION = "vidjil fuse"
46

47 48
TOOL_SIMILARITY = "../algo/tools/similarity"

49 50
GERMLINES_ORDER = ['TRA', 'TRB', 'TRG', 'TRD', 'DD', 'IGH', 'DHJH', 'IJK', 'IJL'] 

51 52 53
####

class Window:
54
    # Should be renamed "Clone"
55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
    '''storage class for sequence informations
    with some function to group sequence informations
    
    >>> str(w1)
    '<window : [5] 3 aaa>'
    
    >>> str(w3)
    '<window : [8] 4 aaa>'
    
    >>> str(w1 + w3)
    '<window : [5, 8] 3 aaa>'
    
    
    check if other information are conserved
    
    >>> (w2 + w4).d["test"]
    [0, 'plop']
    
    '''
    
    ### init Window with the minimum data required
    def __init__(self, size):
        self.d={}
78
        self.d["id"] = ""
Mathieu Giraud's avatar
Mathieu Giraud committed
79
        self.d["top"] = sys.maxsize
80
        self.d["reads"] = []
81
        for i in range(size):
82
            self.d["reads"].append(0)
83 84 85
        
        
    ### 
86 87
    def __iadd__(self, other):
        ### Not used now
88
        """Add other.reads to self.reads in-place, without extending lists"""
89

90 91
        assert(len(self.d['reads']) == len(other.d['reads']))
        self.d['reads'] = [my + her for (my, her) in zip(self.d['reads'], other.d['reads'])]
92 93 94

        return self
        
95
    def __add__(self, other):
96
        """Concat two windows, extending lists such as 'reads'"""
97
        #data we don't need to duplicate
98
        myList = [ "seg", "top", "id", "sequence", "name", "id", "stats", "germline"]
99 100
        obj = Window(1)
        
101 102 103 104
        concatenate_with_padding(obj.d,
                                 self.d, len(self.d["reads"]),
                                 other.d, len(other.d["reads"]),
                                 myList)
105 106 107 108 109 110 111 112 113 114 115 116 117
                    
        #keep other data who don't need to be concat
        if other.d["top"] < self.d["top"] :
            for key in other.d :
                if key in myList :
                    obj.d[key] = other.d[key]
        else :
            for key in self.d :
                if key in myList :
                    obj.d[key] = self.d[key]
        
        return obj
        
118
    def latex(self, point=0, base_germline=10000, base=10000, tag=''):
119
        reads = self.d["reads"][point]
120
        ratio_germline = float(reads)/base_germline
121
        ratio = float(reads)/base
122
        return r"   &   & %7d & %5.2f\%% & of %-4s & %5.2f\%% & %-50s \\ %% %4s %s" % (reads, ratio_germline * 100, self.d['germline'], ratio * 100,
123 124
                                                                  self.d["name"] if 'name' in self.d else self.d["id"],
                                                                  tag, self.d["id"])
125

126 127
    ### print essential info about Window
    def __str__(self):
Mathieu Giraud's avatar
Mathieu Giraud committed
128
        return "<window : %s %s %s>" % ( self.d["reads"], '*' if self.d["top"] == sys.maxsize else self.d["top"], self.d["id"])
129
        
Marc Duez's avatar
Marc Duez committed
130
class Samples: 
131

Marc Duez's avatar
Marc Duez committed
132 133
    def __init__(self):
        self.d={}
134
        self.d["number"] = 1
Marc Duez's avatar
Marc Duez committed
135 136 137 138
        self.d["original_names"] = [""]

    def __add__(self, other):
        obj=Samples()
139

140 141 142 143 144
        concatenate_with_padding(obj.d, 
                                 self.d, self.d['number'], 
                                 other.d, other.d['number'],
                                 ['number'])

145
        obj.d["number"] =  int(self.d["number"]) + int(other.d["number"])
Marc Duez's avatar
Marc Duez committed
146 147
        
        return obj
148

149 150 151
    def __str__(self):
        return "<Samples: %s>" % self.d

152 153 154 155 156 157 158
class Reads: 

    def __init__(self):
        self.d={}
        self.d["total"] = ["0"]
        self.d["segmented"] = ["0"]
        self.d["germline"] = {}
159
        self.d['distribution'] = {}
160 161 162

    def __add__(self, other):
        obj=Reads()
163 164 165 166 167

        concatenate_with_padding(obj.d['germline'], 
                                 self.d['germline'], len(self.d['total']),
                                 other.d['germline'], len(other.d['total']),
                                 ['total'])
168 169 170 171
        concatenate_with_padding(obj.d['distribution'],
                                 self.d['distribution'], len(self.d['total']),
                                 other.d['distribution'], len(other.d['total']),
                                 ['total'])
172 173 174 175
                
        obj.d["total"] = self.d["total"] + other.d["total"]
        obj.d["segmented"] = self.d["segmented"] + other.d["segmented"]
        return obj
176 177 178

    def __str__(self):
        return "<Reads: %s>" % self.d
179
        
180 181 182 183
class OtherWindows:
    
    """Aggregate counts of windows that are discarded (due to too small 'top') for each point into several 'others-' windows."""

184 185
    def __init__(self, length, ranges = None):
        self.ranges = ranges if ranges is not None else [1000, 100, 10, 1]
186 187 188 189 190 191 192 193 194 195
        self.length = length
        self.sizes = {}

        for r in self.ranges:
            self.sizes[r] = [0 for i in range(self.length)]


    def __iadd__(self, window):
        """Add to self a window. The different points may land into different 'others-' windows."""

196
        for i, s in enumerate(window.d["reads"]):
197 198 199 200 201
            for r in self.ranges:
                if s >= r:
                     break
            self.sizes[r][i] += s
            ## TODO: add seg_stat
202 203
            
        # print window, '-->', self.sizes
204 205 206 207 208 209 210
        return self

    def __iter__(self):
        """Yield the 'others-' windows"""

        for r in self.ranges:
            w = Window(self.length)
211
            w.d['reads'] = self.sizes[r]
212 213 214
            w.d['window'] = 'others-%d' % r
            w.d['top'] = 0

Mathieu Giraud's avatar
Mathieu Giraud committed
215
            print('  --[others]-->', w)
216
            yield w
217
        
218
class ListWindows(VidjilJson):
219 220 221
    '''storage class for sequences informations 
    
    >>> lw1.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
222
    <ListWindows: [25] 2>
223 224 225 226
    <window : [5] 3 aaa>
    <window : [12] 2 bbb>
    
    >>> lw2.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
227
    <ListWindows: [34] 2>
228 229 230
    <window : [8] 4 aaa>
    <window : [2] 8 ccc>
    
Mathieu Giraud's avatar
Mathieu Giraud committed
231
    >>> lw3 = lw1 + lw2
232
    >>> lw3.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
233
    <ListWindows: [25, 34] 3>
234 235 236
    <window : [12, 0] 2 bbb>
    <window : [5, 8] 3 aaa>
    <window : [0, 2] 8 ccc>
237 238 239 240 241 242 243 244 245 246
    >>> lw3.build_stat()
    >>> lw3.d['reads'].d['distribution']['0.1']
    [17, 8]
    >>> lw3.d['reads'].d['distribution']['0.01']
    [0, 2]
    >>> lw3.d['reads'].d['distribution']['0.001']
    [0, 0]
    >>> lw3.d['reads'].d['distribution']['0.0001']
    [0, 0]

247 248 249 250 251
    '''
    
    def __init__(self):
        '''init ListWindows with the minimum data required'''
        self.d={}
252 253
        self.d["samples"] = Samples()
        self.d["reads"] = Reads()
254 255
        self.d["clones"] = []
        self.d["clusters"] = []
256
        self.d["germlines"] = {}
257
        
258 259 260 261
        self.d["vidjil_json_version"] = VIDJIL_JSON_VERSION
        self.d["producer"] = FUSE_VERSION
        self.d["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
262
    def __str__(self):
263 264 265 266 267 268 269 270 271 272 273 274 275 276
        return "<ListWindows: %s %d>" % ( self.d["reads"].d["segmented"], len(self) )

    # Iterator and access functions
    def __iter__(self):
        return self.d["clones"].__iter__()

    def __getitem__(self, i):
        ### not efficient !
        for clone in self:
            if clone.d["id"] == i:
                return clone

    def __len__(self):
        return len(self.d["clones"])
277 278 279

    ### print info about each Windows stored 
    def info(self):
Mathieu Giraud's avatar
Mathieu Giraud committed
280
        print(self)
281
        for clone in self:
Mathieu Giraud's avatar
Mathieu Giraud committed
282
            print(clone)
283
        
284 285
    ### compute statistics about clones
    def build_stat(self):
286
        ranges = [.1, .01, .001, .0001, .00001, .000001, .0000001]
287
        result = [[0 for col in range(len(self.d['reads'].d["segmented"]))] for row in range(len(ranges))]
288

289 290
        for clone in self:
            for i, s in enumerate(clone.d["reads"]):
291
                for r in range(len(ranges)):
292
                    if s*1. / self.d['reads'].d['segmented'][i] >= ranges[r]:
293 294 295 296
                        break 
                result[r][i] += s
                
        for r in range(len(ranges)):
297 298
            ratio_in_string = '{0:.10f}'.format(ranges[r]).rstrip('0')
            self.d['reads'].d['distribution'][ratio_in_string] = result[r]
299 300 301 302
            
        #TODO V/D/J distrib and more 
        
        
303 304 305
    ### save / load to .json
    def save_json(self, output):
        '''save ListWindows in .json format''' 
Mathieu Giraud's avatar
Mathieu Giraud committed
306
        print("==>", output)
307 308
        with open(output, "w") as file:
            json.dump(self, file, indent=2, default=self.toJson)
309 310 311 312 313 314 315 316

    def load(self, file_path, *args, **kwargs):
        if not '.clntab' in file_path:
            self.load_vidjil(file_path, *args, **kwargs)
        else:
            self.load_clntab(file_path, *args, **kwargs)
        
    def load_vidjil(self, file_path, pipeline, verbose=True):
317 318 319 320 321
        '''init listWindows with data file
        Detects and selects the parser according to the file extension.'''

        # name = file_path.split("/")[-1]
        extension = file_path.split('.')[-1]
322 323

        if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
324
            print("<==", file_path, "\t", end=' ')
325
        
326
        with open(file_path, "r") as file:
327 328 329 330
                tmp = json.load(file, object_hook=self.toPython)     
                self.d=tmp.d
                self.check_version(file_path)
        
331 332
        if 'distribution' not in self.d['reads'].d:
            self.d['reads'].d['distribution'] = {}
333 334 335
        if pipeline: 
            # renaming, private pipeline
            f = '/'.join(file_path.split('/')[2:-1])
336
            if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
337
                print("[%s]" % f)
338 339 340

        else:
            f = file_path
341
            if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
342
                print()
343
        
Marc Duez's avatar
Marc Duez committed
344
        time = os.path.getmtime(file_path)
345
        self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")]
346

Marc Duez's avatar
Marc Duez committed
347 348 349
    def getTop(self, top):
        result = []
        
350 351 352
        for clone in self:
            if clone.d["top"] <= top :
                result.append(clone.d["id"])
Marc Duez's avatar
Marc Duez committed
353 354 355 356 357
        return result
        
    def filter(self, f):
        r = []
        
Marc Duez's avatar
Marc Duez committed
358
        reverseList = {}
359 360
        for i,w in enumerate(self.d["clones"]) :
            reverseList[w.d["id"]] = i
Marc Duez's avatar
Marc Duez committed
361 362 363
        
        #filteredList = { k: reverseList[k] for k in f }
        filteredList = dict(filter(lambda t: t[0] in f, reverseList.items()))
Marc Duez's avatar
Marc Duez committed
364
        
Marc Duez's avatar
Marc Duez committed
365
        for i in filteredList:
366
            r.append(self.d["clones"][filteredList[i]])
Marc Duez's avatar
Marc Duez committed
367
        
368
        self.d["clones"] = r
Marc Duez's avatar
Marc Duez committed
369
        
370 371 372 373
    ### 
    def __add__(self, other): 
        '''Combine two ListWindows into a unique ListWindows'''
        obj = ListWindows()
374 375
        l1 = len(self.d["reads"].d['segmented'])
        l2 = len(other.d["reads"].d['segmented'])
376

377 378 379
        concatenate_with_padding(obj.d, 
                                 self.d, l1,
                                 other.d, l2,
380 381
                                 ["clones", "links", "germlines",
                                  "vidjil_json_version"])
382
        
383
        obj.d["clones"]=self.fuseWindows(self.d["clones"], other.d["clones"], l1, l2)
384 385
        obj.d["samples"] = self.d["samples"] + other.d["samples"]
        obj.d["reads"] = self.d["reads"] + other.d["reads"]
386
        
387 388
        return obj
        
389 390 391 392 393 394
    ###
    def __mul__(self, other):
        
        for i in range(len(self.d["reads_segmented"])):
            self.d["reads_segmented"][i] += other.d["reads_segmented"][i] 
        
395
        self.d["clones"] += other.d["clones"]
396 397 398 399 400 401
        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        
        self.d["system_segmented"].update(other.d["system_segmented"])
        
        return self
        
402
    ### 
403
    def fuseWindows(self, w1, w2, l1, l2) :
404
        #store data in dict with "id" as key
405 406
        dico1 = {}
        for i in range(len(w1)) :
407
            dico1[ w1[i].d["id"] ] = w1[i]
408 409 410

        dico2 = {}
        for i in range(len(w2)) :
411
            dico2[ w2[i].d["id"] ] = w2[i]
412

413
        #concat data with the same key ("id")
414 415 416 417 418 419
        #if some data are missing we concat with an empty Windows()
        dico3 = {}
        for key in dico1 :
            if key in dico2 :
                dico3[key] = dico1[key] + dico2[key]
            else :
420
                w=Window(l2)
421 422 423 424
                dico3[key] = dico1[key] + w

        for key in dico2 :
            if key not in dico1 :
425
                w=Window(l1)
426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442
                dico3[key] = w + dico2[key]
        
        
        #sort by top
        tab = []
        for key in dico3 :
            tab.append((dico3[key], dico3[key].d["top"]))
        tab = sorted(tab, key=itemgetter(1))
        
        #store back in a List
        result=[]
        for i in range(len(tab)) :
            result.append(tab[i][0])
        
        return result
        
    
443 444
    def cut(self, limit, nb_points):
        '''Remove information from sequence/windows who never enter in the most represented sequences. Put this information in 'other' windows.'''
445

446
        length = len(self)
447
        w=[]
448 449 450

        others = OtherWindows(nb_points)

451 452 453
        for clone in self:
            if (int(clone.d["top"]) < limit or limit == 0) :
                w.append(clone)
454
            #else:
455
                #others += clone
456

457
        self.d["clones"] = w #+ list(others) 
458

Mathieu Giraud's avatar
Mathieu Giraud committed
459
        print("### Cut merged file, keeping window in the top %d for at least one point" % limit)
460 461
        return self
        
462
    def load_clntab(self, file_path, *args, **kwargs):
463 464 465
        '''Parser for .clntab file'''

        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
466
        self.d["samples"].d["original_names"] = [file_path]
467
        self.d["samples"].d["producer"] = ["EC-NGS central pipeline"]
468 469 470 471 472 473 474 475 476 477 478 479 480 481 482
        
        listw = []
        listc = []
        total_size = 0
        
        fichier = open(file_path,"r")
        for ligne in fichier:
            if "clonotype" in ligne:
                header_map = ligne.replace('\n', '').split('\t')
            else :
                tab = AccessedDict()
                for index, data in enumerate(ligne.split('\t')):
                    tab[header_map[index]] = data
                        
                w=Window(1)
483 484
                w.d["seg"] = {}
                
485
                #w.window=tab["sequence.seq id"] #use sequence id as window for .clntab data
486
                w.d["id"]=tab["sequence.raw nt seq"] #use sequence as window for .clntab data
487 488
                s = int(tab["sequence.size"])
                total_size += s
489
                w.d["reads"] = [ s ]
490

491 492
                w.d["germline"] = tab["sequence.V-GENE and allele"][:3] #system ...
                
493
                w.d["sequence"] = tab["sequence.raw nt seq"]
494
                w.d["seg"]["5"]=tab["sequence.V-GENE and allele"].split('=')[0]
495
                if (tab["sequence.D-GENE and allele"] != "") :
496 497
                    w.d["seg"]["4"]=tab["sequence.D-GENE and allele"].split('=')[0]
                w.d["seg"]["3"]=tab["sequence.J-GENE and allele"].split('=')[0]
498 499 500 501

                # use sequence.JUNCTION to colorize output (this is not exactly V/J !)
                junction = tab.get("sequence.JUNCTION.raw nt seq")
                position = w.d["sequence"].find(junction)
502
                
503
                if position >= 0:
504 505
                    w.d["seg"]["3start"] = position + len(junction)
                    w.d["seg"]["5end"] = position 
506
                else:
507 508 509 510 511 512
                    w.d["seg"]["3start"] = 0
                    w.d["seg"]["5end"] = len(w.d["sequence"])
                w.d["seg"]["3end"]=0
                w.d["seg"]["5start"]=0
                w.d["seg"]["4end"]=0
                w.d["seg"]["4start"]=0
513
                w.d["seg"]["cdr3"] = tab["sequence.JUNCTION.raw nt seq"][3:-3]
514
                    
515
                listw.append((w , w.d["reads"][0]))
516 517

                raw_clonotype = tab.get("clonotype")
518 519
                w.d["name"]=raw_clonotype # w.d["seg"]["5"] + " -x/y/-z " + w.d["seg"]["3"]

520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535
                clonotype = raw_clonotype.split(' ')
                if (len(clonotype) > 1) :
                    listc.append((w, raw_clonotype))
                
                #keep data that has not already been stored
                for header in tab.not_accessed_keys():
                    w.d["_"+header] = [tab[header]]

        #sort by sequence_size
        listw = sorted(listw, key=itemgetter(1), reverse=True)
        #sort by clonotype
        listc = sorted(listc, key=itemgetter(1))
        
        #generate data "top"
        for index in range(len(listw)):
            listw[index][0].d["top"]=index+1
536
            self.d["clones"].append(listw[index][0])
537
        
538 539
        self.d["reads"].d["segmented"] = [total_size]
        self.d["reads"].d["total"] = [total_size]
540 541 542 543

        
    def toJson(self, obj):
        '''Serializer for json module'''
544
        if isinstance(obj, ListWindows) or isinstance(obj, Window) or isinstance(obj, Samples) or isinstance(obj, Reads):
545 546 547
            result = {}
            for key in obj.d :
                result[key]= obj.d[key]
Marc Duez's avatar
Marc Duez committed
548
                
549 550
            return result
            raise TypeError(repr(obj) + " fail !") 
551
        
Marc Duez's avatar
Marc Duez committed
552
        else:
553 554 555 556 557 558
            result = {}
            for key in obj :
                result[key]= obj[key]
            
            return result
            raise TypeError(repr(obj) + " fail !") 
559 560 561

    def toPython(self, obj_dict):
        '''Reverse serializer for json module'''
562
        if "samples" in obj_dict:
563
            obj = ListWindows()
564
            obj.d=obj_dict
565 566
            return obj

567
        if "id" in obj_dict:
568
            obj = Window(1)
569
            obj.d=obj_dict
570
            return obj
Marc Duez's avatar
Marc Duez committed
571
        
572 573
        if "total" in obj_dict:
            obj = Reads()
574
            obj.d=obj_dict
575 576
            return obj
            
Marc Duez's avatar
Marc Duez committed
577 578
        if "original_names" in obj_dict:
            obj = Samples()
579
            obj.d=obj_dict
580
            return obj
Marc Duez's avatar
Marc Duez committed
581
            
582
        return obj_dict
583 584 585 586 587 588
        



### some data used for test
w1 = Window(1)
589
w1.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
590
w2 = Window(1)
591
w2.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
592
w3 = Window(1)
593
w3.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
594
w4 = Window(1)
595
w4.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
596 597 598


w5 = Window(1)
599
w5.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
600
w6 = Window(1)
601
w6.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
602 603

lw1 = ListWindows()
604
lw1.d["timestamp"] = 'ts'
605
lw1.d["reads"] = json.loads('{"total": [30], "segmented": [25], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
606 607
lw1.d["clones"].append(w5)
lw1.d["clones"].append(w6)
608 609

w7 = Window(1)
610
w7.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
611
w8 = Window(1)
612
w8.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
613 614

lw2 = ListWindows()
615
lw2.d["timestamp"] = 'ts'
616
lw2.d["reads"] = json.loads('{"total": [40], "segmented": [34], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
617 618
lw2.d["clones"].append(w7)
lw2.d["clones"].append(w8)
619 620 621 622 623 624

    
    

 
def main():
Mathieu Giraud's avatar
Mathieu Giraud committed
625
    print("#", ' '.join(sys.argv))
626 627 628 629 630 631 632

    DESCRIPTION = 'Vidjil utility to parse and regroup list of clones of different timepoints or origins'
    
    #### Argument parser (argparse)

    parser = argparse.ArgumentParser(description= DESCRIPTION,
                                    epilog='''Example:
Mathieu Giraud's avatar
Mathieu Giraud committed
633
  python2 %(prog)s --germline IGH out/Diag/vidjil.data out/MRD-1/vidjil.data out/MRD-2/vidjil.data''',
634 635 636 637 638 639
                                    formatter_class=argparse.RawTextHelpFormatter)


    group_options = parser.add_argument_group() # title='Options and parameters')

    group_options.add_argument('--test', action='store_true', help='run self-tests')
640
    group_options.add_argument('--multi', action='store_true', help='merge different systems from a same timepoint (deprecated, do not use)')
641
    
642
    group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
643
    group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')
644

645
    group_options.add_argument('--output', '-o', type=str, default='fused.vidjil', help='output file (%(default)s)')
646 647 648 649 650 651 652 653 654 655 656 657 658 659
    group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)')

    parser.add_argument('file', nargs='+', help='''input files (.vidjil/.cnltab)''')
  
    args = parser.parse_args()
    # print args

    if args.test:
        import doctest
        doctest.testmod(verbose = True)
        sys.exit(0)

    jlist_fused = None

Mathieu Giraud's avatar
Mathieu Giraud committed
660 661
    print("### fuse.py -- " + DESCRIPTION)
    print()
662

Marc Duez's avatar
Marc Duez committed
663 664 665 666 667 668 669 670
    #filtre
    f = []
    for path_name in args.file:
        jlist = ListWindows()
        jlist.load(path_name, args.pipeline)
        f += jlist.getTop(args.top)
    f = sorted(set(f))
    
671
    if args.multi:
672 673
        for path_name in args.file:
            jlist = ListWindows()
674
            jlist.load(path_name, args.pipeline)
Marc Duez's avatar
Marc Duez committed
675
            jlist.build_stat()
676
            
Mathieu Giraud's avatar
Mathieu Giraud committed
677
            print("\t", jlist, end=' ')
678 679 680 681 682

            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused * jlist
683
                
Mathieu Giraud's avatar
Mathieu Giraud committed
684
            print('\t==> merge to', jlist_fused)
685
        jlist_fused.d["system_segmented"] = ordered(jlist_fused.d["system_segmented"], key=lambda sys: ((GERMLINES_ORDER + [sys]).index(sys), sys))
686
        
687
    else:
Mathieu Giraud's avatar
Mathieu Giraud committed
688
        print("### Read and merge input files")
689 690
        for path_name in args.file:
            jlist = ListWindows()
691
            jlist.load(path_name, args.pipeline)
Marc Duez's avatar
Marc Duez committed
692 693
            jlist.build_stat()
            jlist.filter(f)
Marc Duez's avatar
Marc Duez committed
694

695 696 697 698
            w1 = Window(1)
            w2 = Window(2)
            w3 = w1+w2
            
Mathieu Giraud's avatar
Mathieu Giraud committed
699
            print("\t", jlist, end=' ')
700 701 702 703 704
            # Merge lists
            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused + jlist
705
            
Mathieu Giraud's avatar
Mathieu Giraud committed
706
            print('\t==> merge to', jlist_fused)
707

708
    if args.compress:
Mathieu Giraud's avatar
Mathieu Giraud committed
709 710
        print()
        print("### Select point names")
711
        l = jlist_fused.d["samples"].d["original_names"]
712
        ll = interesting_substrings(l)
Mathieu Giraud's avatar
Mathieu Giraud committed
713 714
        print("  <==", l)
        print("  ==>", ll)
715
        jlist_fused.d["samples"].d["names"] = ll
716
    
Mathieu Giraud's avatar
Mathieu Giraud committed
717
    print()
718
    if not args.multi:
719
        jlist_fused.cut(args.top, jlist_fused.d["samples"].d["number"])
Mathieu Giraud's avatar
Mathieu Giraud committed
720 721
    print("\t", jlist_fused) 
    print()
722

723 724 725 726 727
    #compute similarity matrix
    fasta = ""
    for i in range(len(jlist_fused.d["clones"])) :
        fasta += ">>" + str(i) + "\n"
        fasta += jlist_fused.d["clones"][i].d["id"] + "\n"
728
    fasta_file = tempfile.NamedTemporaryFile()
729
    fasta_file.write(fasta)
730
    try:
731
        out = subprocess.check_output([TOOL_SIMILARITY, "-j", fasta_file.name])
732 733 734
        jlist_fused.d["similarity"] = json.loads(out)
    except OSError:
        print("! failed: %s" % TOOL_SIMILARITY)
Mathieu Giraud's avatar
Mathieu Giraud committed
735
    print("### Save merged file")
736
    jlist_fused.save_json(args.output)
737
    os.unlink(fasta_file.name)
738 739 740 741 742

    
    
if  __name__ =='__main__':
    main()