fuse.py 23.5 KB
Newer Older
1 2 3 4 5
#!/usr/bin/env python
# -*- coding: utf-8 -*-

### fuse.py - Vidjil utility to parse and regroup list of clones of different timepoints or origins

6 7 8 9 10 11 12 13
#  This file is part of Vidjil <http://www.vidjil.org>,
#  High-throughput Analysis of V(D)J Immune Repertoire.
#  Copyright (C) 2011, 2012, 2013, 2014, 2015 by Bonsai bioinformatics 
#  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
#  Contributors: 
#      Marc Duez <marc.duez@vidjil.org>
#      Mathieu Giraud <mathieu.giraud@vidjil.org>
#      The Vidjil Team <contact@vidjil.org>
14 15 16 17 18 19 20 21 22 23 24 25 26 27
#
#  "Vidjil" is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  "Vidjil" is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>

Mathieu Giraud's avatar
Mathieu Giraud committed
28
from __future__ import print_function
29

30
import check_python_version
31
import sys
32 33 34 35
import json
import argparse
import time
import copy
36
import os.path
37
import os
Marc Duez's avatar
Marc Duez committed
38
import datetime
39
import subprocess
40
import tempfile
41
from operator import itemgetter
42
from utils import *
43
from defs import *
44

45
FUSE_VERSION = "vidjil fuse"
46

47 48
TOOL_SIMILARITY = "../algo/tools/similarity"

49 50
GERMLINES_ORDER = ['TRA', 'TRB', 'TRG', 'TRD', 'DD', 'IGH', 'DHJH', 'IJK', 'IJL'] 

51 52 53
####

class Window:
54
    # Should be renamed "Clone"
55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
    '''storage class for sequence informations
    with some function to group sequence informations
    
    >>> str(w1)
    '<window : [5] 3 aaa>'
    
    >>> str(w3)
    '<window : [8] 4 aaa>'
    
    >>> str(w1 + w3)
    '<window : [5, 8] 3 aaa>'
    
    
    check if other information are conserved
    
    >>> (w2 + w4).d["test"]
    [0, 'plop']
    
    '''
    
    ### init Window with the minimum data required
    def __init__(self, size):
        self.d={}
78
        self.d["id"] = ""
Mathieu Giraud's avatar
Mathieu Giraud committed
79
        self.d["top"] = sys.maxsize
80
        self.d["reads"] = []
81
        for i in range(size):
82
            self.d["reads"].append(0)
83 84 85
        
        
    ### 
86 87
    def __iadd__(self, other):
        ### Not used now
88
        """Add other.reads to self.reads in-place, without extending lists"""
89

90 91
        assert(len(self.d['reads']) == len(other.d['reads']))
        self.d['reads'] = [my + her for (my, her) in zip(self.d['reads'], other.d['reads'])]
92 93 94

        return self
        
95
    def __add__(self, other):
96
        """Concat two windows, extending lists such as 'reads'"""
97
        #data we don't need to duplicate
98
        myList = [ "seg", "top", "id", "sequence", "name", "id", "stats", "germline"]
99 100
        obj = Window(1)
        
101 102 103 104
        concatenate_with_padding(obj.d,
                                 self.d, len(self.d["reads"]),
                                 other.d, len(other.d["reads"]),
                                 myList)
105 106 107 108 109 110 111 112 113 114 115 116 117
                    
        #keep other data who don't need to be concat
        if other.d["top"] < self.d["top"] :
            for key in other.d :
                if key in myList :
                    obj.d[key] = other.d[key]
        else :
            for key in self.d :
                if key in myList :
                    obj.d[key] = self.d[key]
        
        return obj
        
118 119 120
    def latex(self, point=0, base=10000):
        reads = self.d["reads"][point]
        ratio = float(reads)/base
121
        return r"   &   & %7d & %5.2f\%% & %-50s \\ %% %s" % (reads, ratio * 100,
122
                                                           self.d["name"] if 'name' in self.d else self.d["id"], self.d["id"])
123

124 125
    ### print essential info about Window
    def __str__(self):
Mathieu Giraud's avatar
Mathieu Giraud committed
126
        return "<window : %s %s %s>" % ( self.d["reads"], '*' if self.d["top"] == sys.maxsize else self.d["top"], self.d["id"])
127
        
128
class Samples: 
129

130 131
    def __init__(self):
        self.d={}
132
        self.d["number"] = 1
133 134 135 136
        self.d["original_names"] = [""]

    def __add__(self, other):
        obj=Samples()
137

138 139 140 141 142
        concatenate_with_padding(obj.d, 
                                 self.d, self.d['number'], 
                                 other.d, other.d['number'],
                                 ['number'])

143
        obj.d["number"] =  int(self.d["number"]) + int(other.d["number"])
144 145
        
        return obj
146

147 148 149
    def __str__(self):
        return "<Samples: %s>" % self.d

150 151 152 153 154 155 156
class Reads: 

    def __init__(self):
        self.d={}
        self.d["total"] = ["0"]
        self.d["segmented"] = ["0"]
        self.d["germline"] = {}
157
        self.d['distribution'] = {}
158 159 160

    def __add__(self, other):
        obj=Reads()
161 162 163 164 165

        concatenate_with_padding(obj.d['germline'], 
                                 self.d['germline'], len(self.d['total']),
                                 other.d['germline'], len(other.d['total']),
                                 ['total'])
166 167 168 169
        concatenate_with_padding(obj.d['distribution'],
                                 self.d['distribution'], len(self.d['total']),
                                 other.d['distribution'], len(other.d['total']),
                                 ['total'])
170 171 172 173
                
        obj.d["total"] = self.d["total"] + other.d["total"]
        obj.d["segmented"] = self.d["segmented"] + other.d["segmented"]
        return obj
174 175 176

    def __str__(self):
        return "<Reads: %s>" % self.d
177
        
178 179 180 181
class OtherWindows:
    
    """Aggregate counts of windows that are discarded (due to too small 'top') for each point into several 'others-' windows."""

182 183
    def __init__(self, length, ranges = None):
        self.ranges = ranges if ranges is not None else [1000, 100, 10, 1]
184 185 186 187 188 189 190 191 192 193
        self.length = length
        self.sizes = {}

        for r in self.ranges:
            self.sizes[r] = [0 for i in range(self.length)]


    def __iadd__(self, window):
        """Add to self a window. The different points may land into different 'others-' windows."""

194
        for i, s in enumerate(window.d["reads"]):
195 196 197 198 199
            for r in self.ranges:
                if s >= r:
                     break
            self.sizes[r][i] += s
            ## TODO: add seg_stat
200 201
            
        # print window, '-->', self.sizes
202 203 204 205 206 207 208
        return self

    def __iter__(self):
        """Yield the 'others-' windows"""

        for r in self.ranges:
            w = Window(self.length)
209
            w.d['reads'] = self.sizes[r]
210 211 212
            w.d['window'] = 'others-%d' % r
            w.d['top'] = 0

Mathieu Giraud's avatar
Mathieu Giraud committed
213
            print('  --[others]-->', w)
214
            yield w
215
        
216
class ListWindows(VidjilJson):
217 218 219
    '''storage class for sequences informations 
    
    >>> lw1.info()
220
    <ListWindows: [25] 2>
221 222 223 224
    <window : [5] 3 aaa>
    <window : [12] 2 bbb>
    
    >>> lw2.info()
225
    <ListWindows: [34] 2>
226 227 228
    <window : [8] 4 aaa>
    <window : [2] 8 ccc>
    
229
    >>> lw3 = lw1 + lw2
230
    >>> lw3.info()
231
    <ListWindows: [25, 34] 3>
232 233 234
    <window : [12, 0] 2 bbb>
    <window : [5, 8] 3 aaa>
    <window : [0, 2] 8 ccc>
235 236 237 238 239 240 241 242 243 244
    >>> lw3.build_stat()
    >>> lw3.d['reads'].d['distribution']['0.1']
    [17, 8]
    >>> lw3.d['reads'].d['distribution']['0.01']
    [0, 2]
    >>> lw3.d['reads'].d['distribution']['0.001']
    [0, 0]
    >>> lw3.d['reads'].d['distribution']['0.0001']
    [0, 0]

245 246 247 248 249
    '''
    
    def __init__(self):
        '''init ListWindows with the minimum data required'''
        self.d={}
250 251
        self.d["samples"] = Samples()
        self.d["reads"] = Reads()
252 253
        self.d["clones"] = []
        self.d["clusters"] = []
254
        self.d["germlines"] = {}
255
        
256 257 258 259
        self.d["vidjil_json_version"] = VIDJIL_JSON_VERSION
        self.d["producer"] = FUSE_VERSION
        self.d["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
260
    def __str__(self):
261 262 263 264 265 266 267 268 269 270 271 272 273 274
        return "<ListWindows: %s %d>" % ( self.d["reads"].d["segmented"], len(self) )

    # Iterator and access functions
    def __iter__(self):
        return self.d["clones"].__iter__()

    def __getitem__(self, i):
        ### not efficient !
        for clone in self:
            if clone.d["id"] == i:
                return clone

    def __len__(self):
        return len(self.d["clones"])
275 276 277

    ### print info about each Windows stored 
    def info(self):
Mathieu Giraud's avatar
Mathieu Giraud committed
278
        print(self)
279
        for clone in self:
Mathieu Giraud's avatar
Mathieu Giraud committed
280
            print(clone)
281
        
282 283
    ### compute statistics about clones
    def build_stat(self):
284
        ranges = [.1, .01, .001, .0001, .00001, .000001, .0000001]
285
        result = [[0 for col in range(len(self.d['reads'].d["segmented"]))] for row in range(len(ranges))]
286

287 288
        for clone in self:
            for i, s in enumerate(clone.d["reads"]):
289
                for r in range(len(ranges)):
290
                    if s*1. / self.d['reads'].d['segmented'][i] >= ranges[r]:
291 292 293 294
                        break 
                result[r][i] += s
                
        for r in range(len(ranges)):
295 296
            ratio_in_string = '{0:.10f}'.format(ranges[r]).rstrip('0')
            self.d['reads'].d['distribution'][ratio_in_string] = result[r]
297 298 299 300
            
        #TODO V/D/J distrib and more 
        
        
301 302 303
    ### save / load to .json
    def save_json(self, output):
        '''save ListWindows in .json format''' 
Mathieu Giraud's avatar
Mathieu Giraud committed
304
        print("==>", output)
305 306
        with open(output, "w") as file:
            json.dump(self, file, indent=2, default=self.toJson)
307 308 309 310 311 312 313 314

    def load(self, file_path, *args, **kwargs):
        if not '.clntab' in file_path:
            self.load_vidjil(file_path, *args, **kwargs)
        else:
            self.load_clntab(file_path, *args, **kwargs)
        
    def load_vidjil(self, file_path, pipeline, verbose=True):
315 316 317 318 319
        '''init listWindows with data file
        Detects and selects the parser according to the file extension.'''

        # name = file_path.split("/")[-1]
        extension = file_path.split('.')[-1]
320 321

        if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
322
            print("<==", file_path, "\t", end=' ')
323
        
324
        with open(file_path, "r") as file:
325 326 327 328
                tmp = json.load(file, object_hook=self.toPython)     
                self.d=tmp.d
                self.check_version(file_path)
        
329 330
        if 'distribution' not in self.d['reads'].d:
            self.d['reads'].d['distribution'] = {}
331 332 333
        if pipeline: 
            # renaming, private pipeline
            f = '/'.join(file_path.split('/')[2:-1])
334
            if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
335
                print("[%s]" % f)
336 337 338

        else:
            f = file_path
339
            if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
340
                print()
341
        
342
        time = os.path.getmtime(file_path)
343
        self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")]
344

Marc Duez's avatar
Marc Duez committed
345 346 347
    def getTop(self, top):
        result = []
        
348 349 350
        for clone in self:
            if clone.d["top"] <= top :
                result.append(clone.d["id"])
Marc Duez's avatar
Marc Duez committed
351 352 353 354 355
        return result
        
    def filter(self, f):
        r = []
        
Marc Duez's avatar
Marc Duez committed
356
        reverseList = {}
357 358
        for i,w in enumerate(self.d["clones"]) :
            reverseList[w.d["id"]] = i
359 360 361
        
        #filteredList = { k: reverseList[k] for k in f }
        filteredList = dict(filter(lambda t: t[0] in f, reverseList.items()))
Marc Duez's avatar
Marc Duez committed
362
        
363
        for i in filteredList:
364
            r.append(self.d["clones"][filteredList[i]])
Marc Duez's avatar
Marc Duez committed
365
        
366
        self.d["clones"] = r
Marc Duez's avatar
Marc Duez committed
367
        
368 369 370 371
    ### 
    def __add__(self, other): 
        '''Combine two ListWindows into a unique ListWindows'''
        obj = ListWindows()
372 373
        l1 = len(self.d["reads"].d['segmented'])
        l2 = len(other.d["reads"].d['segmented'])
374

375 376 377
        concatenate_with_padding(obj.d, 
                                 self.d, l1,
                                 other.d, l2,
378 379
                                 ["clones", "links", "germlines",
                                  "vidjil_json_version"])
380
        
381
        obj.d["clones"]=self.fuseWindows(self.d["clones"], other.d["clones"], l1, l2)
382 383
        obj.d["samples"] = self.d["samples"] + other.d["samples"]
        obj.d["reads"] = self.d["reads"] + other.d["reads"]
384
        
385 386
        return obj
        
387 388 389 390 391 392
    ###
    def __mul__(self, other):
        
        for i in range(len(self.d["reads_segmented"])):
            self.d["reads_segmented"][i] += other.d["reads_segmented"][i] 
        
393
        self.d["clones"] += other.d["clones"]
394 395 396 397 398 399
        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        
        self.d["system_segmented"].update(other.d["system_segmented"])
        
        return self
        
400
    ### 
401
    def fuseWindows(self, w1, w2, l1, l2) :
402
        #store data in dict with "id" as key
403 404
        dico1 = {}
        for i in range(len(w1)) :
405
            dico1[ w1[i].d["id"] ] = w1[i]
406 407 408

        dico2 = {}
        for i in range(len(w2)) :
409
            dico2[ w2[i].d["id"] ] = w2[i]
410

411
        #concat data with the same key ("id")
412 413 414 415 416 417
        #if some data are missing we concat with an empty Windows()
        dico3 = {}
        for key in dico1 :
            if key in dico2 :
                dico3[key] = dico1[key] + dico2[key]
            else :
418
                w=Window(l2)
419 420 421 422
                dico3[key] = dico1[key] + w

        for key in dico2 :
            if key not in dico1 :
423
                w=Window(l1)
424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440
                dico3[key] = w + dico2[key]
        
        
        #sort by top
        tab = []
        for key in dico3 :
            tab.append((dico3[key], dico3[key].d["top"]))
        tab = sorted(tab, key=itemgetter(1))
        
        #store back in a List
        result=[]
        for i in range(len(tab)) :
            result.append(tab[i][0])
        
        return result
        
    
441 442
    def cut(self, limit, nb_points):
        '''Remove information from sequence/windows who never enter in the most represented sequences. Put this information in 'other' windows.'''
443

444
        length = len(self)
445
        w=[]
446 447 448

        others = OtherWindows(nb_points)

449 450 451
        for clone in self:
            if (int(clone.d["top"]) < limit or limit == 0) :
                w.append(clone)
452
            #else:
453
                #others += clone
454

455
        self.d["clones"] = w #+ list(others) 
456

Mathieu Giraud's avatar
Mathieu Giraud committed
457
        print("### Cut merged file, keeping window in the top %d for at least one point" % limit)
458 459
        return self
        
460
    def load_clntab(self, file_path, *args, **kwargs):
461 462 463
        '''Parser for .clntab file'''

        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
464
        self.d["samples"].d["original_names"] = [file_path]
465
        self.d["samples"].d["producer"] = ["EC-NGS central pipeline"]
466 467 468 469 470 471 472 473 474 475 476 477 478 479 480
        
        listw = []
        listc = []
        total_size = 0
        
        fichier = open(file_path,"r")
        for ligne in fichier:
            if "clonotype" in ligne:
                header_map = ligne.replace('\n', '').split('\t')
            else :
                tab = AccessedDict()
                for index, data in enumerate(ligne.split('\t')):
                    tab[header_map[index]] = data
                        
                w=Window(1)
481 482
                w.d["seg"] = {}
                
483
                #w.window=tab["sequence.seq id"] #use sequence id as window for .clntab data
484
                w.d["id"]=tab["sequence.raw nt seq"] #use sequence as window for .clntab data
485 486
                s = int(tab["sequence.size"])
                total_size += s
487
                w.d["reads"] = [ s ]
488

489 490
                w.d["germline"] = tab["sequence.V-GENE and allele"][:3] #system ...
                
491
                w.d["sequence"] = tab["sequence.raw nt seq"]
492
                w.d["seg"]["5"]=tab["sequence.V-GENE and allele"].split('=')[0]
493
                if (tab["sequence.D-GENE and allele"] != "") :
494 495
                    w.d["seg"]["4"]=tab["sequence.D-GENE and allele"].split('=')[0]
                w.d["seg"]["3"]=tab["sequence.J-GENE and allele"].split('=')[0]
496 497 498 499

                # use sequence.JUNCTION to colorize output (this is not exactly V/J !)
                junction = tab.get("sequence.JUNCTION.raw nt seq")
                position = w.d["sequence"].find(junction)
500
                
501
                if position >= 0:
502 503
                    w.d["seg"]["3start"] = position + len(junction)
                    w.d["seg"]["5end"] = position 
504
                else:
505 506 507 508 509 510
                    w.d["seg"]["3start"] = 0
                    w.d["seg"]["5end"] = len(w.d["sequence"])
                w.d["seg"]["3end"]=0
                w.d["seg"]["5start"]=0
                w.d["seg"]["4end"]=0
                w.d["seg"]["4start"]=0
511
                w.d["seg"]["cdr3"] = tab["sequence.JUNCTION.raw nt seq"][3:-3]
512
                    
513
                listw.append((w , w.d["reads"][0]))
514 515

                raw_clonotype = tab.get("clonotype")
516 517
                w.d["name"]=raw_clonotype # w.d["seg"]["5"] + " -x/y/-z " + w.d["seg"]["3"]

518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533
                clonotype = raw_clonotype.split(' ')
                if (len(clonotype) > 1) :
                    listc.append((w, raw_clonotype))
                
                #keep data that has not already been stored
                for header in tab.not_accessed_keys():
                    w.d["_"+header] = [tab[header]]

        #sort by sequence_size
        listw = sorted(listw, key=itemgetter(1), reverse=True)
        #sort by clonotype
        listc = sorted(listc, key=itemgetter(1))
        
        #generate data "top"
        for index in range(len(listw)):
            listw[index][0].d["top"]=index+1
534
            self.d["clones"].append(listw[index][0])
535
        
536 537
        self.d["reads"].d["segmented"] = [total_size]
        self.d["reads"].d["total"] = [total_size]
538 539 540 541

        
    def toJson(self, obj):
        '''Serializer for json module'''
542
        if isinstance(obj, ListWindows) or isinstance(obj, Window) or isinstance(obj, Samples) or isinstance(obj, Reads):
543 544 545
            result = {}
            for key in obj.d :
                result[key]= obj.d[key]
546
                
547 548
            return result
            raise TypeError(repr(obj) + " fail !") 
549
        
550
        else:
551 552 553 554 555 556
            result = {}
            for key in obj :
                result[key]= obj[key]
            
            return result
            raise TypeError(repr(obj) + " fail !") 
557 558 559

    def toPython(self, obj_dict):
        '''Reverse serializer for json module'''
560
        if "samples" in obj_dict:
561
            obj = ListWindows()
562
            obj.d=obj_dict
563 564
            return obj

565
        if "id" in obj_dict:
566
            obj = Window(1)
567
            obj.d=obj_dict
568
            return obj
569
        
570 571
        if "total" in obj_dict:
            obj = Reads()
572
            obj.d=obj_dict
573 574
            return obj
            
575 576
        if "original_names" in obj_dict:
            obj = Samples()
577
            obj.d=obj_dict
578
            return obj
579
            
580
        return obj_dict
581 582 583 584 585 586
        



### some data used for test
w1 = Window(1)
587
w1.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
588
w2 = Window(1)
589
w2.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
590
w3 = Window(1)
591
w3.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
592
w4 = Window(1)
593
w4.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
594 595 596


w5 = Window(1)
597
w5.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
598
w6 = Window(1)
599
w6.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
600 601

lw1 = ListWindows()
602
lw1.d["timestamp"] = 'ts'
603
lw1.d["reads"] = json.loads('{"total": [30], "segmented": [25], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
604 605
lw1.d["clones"].append(w5)
lw1.d["clones"].append(w6)
606 607

w7 = Window(1)
608
w7.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
609
w8 = Window(1)
610
w8.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
611 612

lw2 = ListWindows()
613
lw2.d["timestamp"] = 'ts'
614
lw2.d["reads"] = json.loads('{"total": [40], "segmented": [34], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
615 616
lw2.d["clones"].append(w7)
lw2.d["clones"].append(w8)
617 618 619 620 621 622

    
    

 
def main():
Mathieu Giraud's avatar
Mathieu Giraud committed
623
    print("#", ' '.join(sys.argv))
624 625 626 627 628 629 630

    DESCRIPTION = 'Vidjil utility to parse and regroup list of clones of different timepoints or origins'
    
    #### Argument parser (argparse)

    parser = argparse.ArgumentParser(description= DESCRIPTION,
                                    epilog='''Example:
Mathieu Giraud's avatar
Mathieu Giraud committed
631
  python2 %(prog)s --germline IGH out/Diag/vidjil.data out/MRD-1/vidjil.data out/MRD-2/vidjil.data''',
632 633 634 635 636 637
                                    formatter_class=argparse.RawTextHelpFormatter)


    group_options = parser.add_argument_group() # title='Options and parameters')

    group_options.add_argument('--test', action='store_true', help='run self-tests')
638
    group_options.add_argument('--multi', action='store_true', help='merge different systems from a same timepoint (deprecated, do not use)')
639
    
640
    group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
641
    group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')
642

643
    group_options.add_argument('--output', '-o', type=str, default='fused.vidjil', help='output file (%(default)s)')
644 645 646 647 648 649 650 651 652 653 654 655 656 657
    group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)')

    parser.add_argument('file', nargs='+', help='''input files (.vidjil/.cnltab)''')
  
    args = parser.parse_args()
    # print args

    if args.test:
        import doctest
        doctest.testmod(verbose = True)
        sys.exit(0)

    jlist_fused = None

Mathieu Giraud's avatar
Mathieu Giraud committed
658 659
    print("### fuse.py -- " + DESCRIPTION)
    print()
660

Marc Duez's avatar
Marc Duez committed
661 662 663 664 665 666 667 668
    #filtre
    f = []
    for path_name in args.file:
        jlist = ListWindows()
        jlist.load(path_name, args.pipeline)
        f += jlist.getTop(args.top)
    f = sorted(set(f))
    
669
    if args.multi:
670 671
        for path_name in args.file:
            jlist = ListWindows()
672
            jlist.load(path_name, args.pipeline)
Marc Duez's avatar
Marc Duez committed
673
            jlist.build_stat()
674
            
Mathieu Giraud's avatar
Mathieu Giraud committed
675
            print("\t", jlist, end=' ')
676 677 678 679 680

            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused * jlist
681
                
Mathieu Giraud's avatar
Mathieu Giraud committed
682
            print('\t==> merge to', jlist_fused)
683
        jlist_fused.d["system_segmented"] = ordered(jlist_fused.d["system_segmented"], key=lambda sys: ((GERMLINES_ORDER + [sys]).index(sys), sys))
684
        
685
    else:
Mathieu Giraud's avatar
Mathieu Giraud committed
686
        print("### Read and merge input files")
687 688
        for path_name in args.file:
            jlist = ListWindows()
689
            jlist.load(path_name, args.pipeline)
Marc Duez's avatar
Marc Duez committed
690 691
            jlist.build_stat()
            jlist.filter(f)
Marc Duez's avatar
Marc Duez committed
692

693 694 695 696
            w1 = Window(1)
            w2 = Window(2)
            w3 = w1+w2
            
Mathieu Giraud's avatar
Mathieu Giraud committed
697
            print("\t", jlist, end=' ')
698 699 700 701 702
            # Merge lists
            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused + jlist
703
            
Mathieu Giraud's avatar
Mathieu Giraud committed
704
            print('\t==> merge to', jlist_fused)
705

706
    if args.compress:
Mathieu Giraud's avatar
Mathieu Giraud committed
707 708
        print()
        print("### Select point names")
709
        l = jlist_fused.d["samples"].d["original_names"]
710
        ll = interesting_substrings(l)
Mathieu Giraud's avatar
Mathieu Giraud committed
711 712
        print("  <==", l)
        print("  ==>", ll)
713
        jlist_fused.d["samples"].d["names"] = ll
714
    
Mathieu Giraud's avatar
Mathieu Giraud committed
715
    print()
716
    if not args.multi:
717
        jlist_fused.cut(args.top, jlist_fused.d["samples"].d["number"])
Mathieu Giraud's avatar
Mathieu Giraud committed
718 719
    print("\t", jlist_fused) 
    print()
720

721 722 723 724 725
    #compute similarity matrix
    fasta = ""
    for i in range(len(jlist_fused.d["clones"])) :
        fasta += ">>" + str(i) + "\n"
        fasta += jlist_fused.d["clones"][i].d["id"] + "\n"
726
    fasta_file = tempfile.NamedTemporaryFile()
727
    fasta_file.write(fasta)
728
    try:
729
        out = subprocess.check_output([TOOL_SIMILARITY, "-j", fasta_file.name])
730 731 732
        jlist_fused.d["similarity"] = json.loads(out)
    except OSError:
        print("! failed: %s" % TOOL_SIMILARITY)
Mathieu Giraud's avatar
Mathieu Giraud committed
733
    print("### Save merged file")
734
    jlist_fused.save_json(args.output)
735
    os.unlink(fasta_file.name)
736 737 738 739 740

    
    
if  __name__ =='__main__':
    main()