fuse.py 23.6 KB
Newer Older
1 2 3 4 5
#!/usr/bin/env python
# -*- coding: utf-8 -*-

### fuse.py - Vidjil utility to parse and regroup list of clones of different timepoints or origins

6 7 8 9 10 11 12 13
#  This file is part of Vidjil <http://www.vidjil.org>,
#  High-throughput Analysis of V(D)J Immune Repertoire.
#  Copyright (C) 2011, 2012, 2013, 2014, 2015 by Bonsai bioinformatics 
#  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
#  Contributors: 
#      Marc Duez <marc.duez@vidjil.org>
#      Mathieu Giraud <mathieu.giraud@vidjil.org>
#      The Vidjil Team <contact@vidjil.org>
14 15 16 17 18 19 20 21 22 23 24 25 26 27
#
#  "Vidjil" is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  "Vidjil" is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>

Mathieu Giraud's avatar
Mathieu Giraud committed
28
from __future__ import print_function
29

30
import check_python_version
31
import sys
32 33 34 35
import json
import argparse
import time
import copy
36
import os.path
37
import os
Marc Duez's avatar
Marc Duez committed
38
import datetime
39
import subprocess
40
import tempfile
41
from operator import itemgetter
42
from utils import *
43
from defs import *
44

45
FUSE_VERSION = "vidjil fuse"
46

47 48
TOOL_SIMILARITY = "../algo/tools/similarity"

49 50
GERMLINES_ORDER = ['TRA', 'TRB', 'TRG', 'TRD', 'DD', 'IGH', 'DHJH', 'IJK', 'IJL'] 

51 52 53
####

class Window:
54
    # Should be renamed "Clone"
55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
    '''storage class for sequence informations
    with some function to group sequence informations
    
    >>> str(w1)
    '<window : [5] 3 aaa>'
    
    >>> str(w3)
    '<window : [8] 4 aaa>'
    
    >>> str(w1 + w3)
    '<window : [5, 8] 3 aaa>'
    
    
    check if other information are conserved
    
    >>> (w2 + w4).d["test"]
    [0, 'plop']
    
    '''
    
    ### init Window with the minimum data required
    def __init__(self, size):
        self.d={}
78
        self.d["id"] = ""
Mathieu Giraud's avatar
Mathieu Giraud committed
79
        self.d["top"] = sys.maxsize
80
        self.d["reads"] = []
81
        for i in range(size):
82
            self.d["reads"].append(0)
83 84 85
        
        
    ### 
86 87
    def __iadd__(self, other):
        ### Not used now
88
        """Add other.reads to self.reads in-place, without extending lists"""
89

90 91
        assert(len(self.d['reads']) == len(other.d['reads']))
        self.d['reads'] = [my + her for (my, her) in zip(self.d['reads'], other.d['reads'])]
92 93 94

        return self
        
95
    def __add__(self, other):
96
        """Concat two windows, extending lists such as 'reads'"""
97
        #data we don't need to duplicate
98
        myList = [ "seg", "top", "id", "sequence", "name", "id", "stats", "germline"]
99 100
        obj = Window(1)
        
101 102 103 104
        concatenate_with_padding(obj.d,
                                 self.d, len(self.d["reads"]),
                                 other.d, len(other.d["reads"]),
                                 myList)
105 106 107 108 109 110 111 112 113 114 115 116 117
                    
        #keep other data who don't need to be concat
        if other.d["top"] < self.d["top"] :
            for key in other.d :
                if key in myList :
                    obj.d[key] = other.d[key]
        else :
            for key in self.d :
                if key in myList :
                    obj.d[key] = self.d[key]
        
        return obj
        
118
    def latex(self, point=0, base=10000, tag=''):
119 120
        reads = self.d["reads"][point]
        ratio = float(reads)/base
121 122 123
        return r"   &   & %7d & %5.2f\%% & %-50s \\ %% %4s %s" % (reads, ratio * 100,
                                                                  self.d["name"] if 'name' in self.d else self.d["id"],
                                                                  tag, self.d["id"])
124

125 126
    ### print essential info about Window
    def __str__(self):
Mathieu Giraud's avatar
Mathieu Giraud committed
127
        return "<window : %s %s %s>" % ( self.d["reads"], '*' if self.d["top"] == sys.maxsize else self.d["top"], self.d["id"])
128
        
Marc Duez's avatar
Marc Duez committed
129
class Samples: 
130

Marc Duez's avatar
Marc Duez committed
131 132
    def __init__(self):
        self.d={}
133
        self.d["number"] = 1
Marc Duez's avatar
Marc Duez committed
134 135 136 137
        self.d["original_names"] = [""]

    def __add__(self, other):
        obj=Samples()
138

139 140 141 142 143
        concatenate_with_padding(obj.d, 
                                 self.d, self.d['number'], 
                                 other.d, other.d['number'],
                                 ['number'])

144
        obj.d["number"] =  int(self.d["number"]) + int(other.d["number"])
Marc Duez's avatar
Marc Duez committed
145 146
        
        return obj
147

148 149 150
    def __str__(self):
        return "<Samples: %s>" % self.d

151 152 153 154 155 156 157
class Reads: 

    def __init__(self):
        self.d={}
        self.d["total"] = ["0"]
        self.d["segmented"] = ["0"]
        self.d["germline"] = {}
158
        self.d['distribution'] = {}
159 160 161

    def __add__(self, other):
        obj=Reads()
162 163 164 165 166

        concatenate_with_padding(obj.d['germline'], 
                                 self.d['germline'], len(self.d['total']),
                                 other.d['germline'], len(other.d['total']),
                                 ['total'])
167 168 169 170
        concatenate_with_padding(obj.d['distribution'],
                                 self.d['distribution'], len(self.d['total']),
                                 other.d['distribution'], len(other.d['total']),
                                 ['total'])
171 172 173 174
                
        obj.d["total"] = self.d["total"] + other.d["total"]
        obj.d["segmented"] = self.d["segmented"] + other.d["segmented"]
        return obj
175 176 177

    def __str__(self):
        return "<Reads: %s>" % self.d
178
        
179 180 181 182
class OtherWindows:
    
    """Aggregate counts of windows that are discarded (due to too small 'top') for each point into several 'others-' windows."""

183 184
    def __init__(self, length, ranges = None):
        self.ranges = ranges if ranges is not None else [1000, 100, 10, 1]
185 186 187 188 189 190 191 192 193 194
        self.length = length
        self.sizes = {}

        for r in self.ranges:
            self.sizes[r] = [0 for i in range(self.length)]


    def __iadd__(self, window):
        """Add to self a window. The different points may land into different 'others-' windows."""

195
        for i, s in enumerate(window.d["reads"]):
196 197 198 199 200
            for r in self.ranges:
                if s >= r:
                     break
            self.sizes[r][i] += s
            ## TODO: add seg_stat
201 202
            
        # print window, '-->', self.sizes
203 204 205 206 207 208 209
        return self

    def __iter__(self):
        """Yield the 'others-' windows"""

        for r in self.ranges:
            w = Window(self.length)
210
            w.d['reads'] = self.sizes[r]
211 212 213
            w.d['window'] = 'others-%d' % r
            w.d['top'] = 0

Mathieu Giraud's avatar
Mathieu Giraud committed
214
            print('  --[others]-->', w)
215
            yield w
216
        
217
class ListWindows(VidjilJson):
218 219 220
    '''storage class for sequences informations 
    
    >>> lw1.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
221
    <ListWindows: [25] 2>
222 223 224 225
    <window : [5] 3 aaa>
    <window : [12] 2 bbb>
    
    >>> lw2.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
226
    <ListWindows: [34] 2>
227 228 229
    <window : [8] 4 aaa>
    <window : [2] 8 ccc>
    
Mathieu Giraud's avatar
Mathieu Giraud committed
230
    >>> lw3 = lw1 + lw2
231
    >>> lw3.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
232
    <ListWindows: [25, 34] 3>
233 234 235
    <window : [12, 0] 2 bbb>
    <window : [5, 8] 3 aaa>
    <window : [0, 2] 8 ccc>
236 237 238 239 240 241 242 243 244 245
    >>> lw3.build_stat()
    >>> lw3.d['reads'].d['distribution']['0.1']
    [17, 8]
    >>> lw3.d['reads'].d['distribution']['0.01']
    [0, 2]
    >>> lw3.d['reads'].d['distribution']['0.001']
    [0, 0]
    >>> lw3.d['reads'].d['distribution']['0.0001']
    [0, 0]

246 247 248 249 250
    '''
    
    def __init__(self):
        '''init ListWindows with the minimum data required'''
        self.d={}
251 252
        self.d["samples"] = Samples()
        self.d["reads"] = Reads()
253 254
        self.d["clones"] = []
        self.d["clusters"] = []
255
        self.d["germlines"] = {}
256
        
257 258 259 260
        self.d["vidjil_json_version"] = VIDJIL_JSON_VERSION
        self.d["producer"] = FUSE_VERSION
        self.d["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
261
    def __str__(self):
262 263 264 265 266 267 268 269 270 271 272 273 274 275
        return "<ListWindows: %s %d>" % ( self.d["reads"].d["segmented"], len(self) )

    # Iterator and access functions
    def __iter__(self):
        return self.d["clones"].__iter__()

    def __getitem__(self, i):
        ### not efficient !
        for clone in self:
            if clone.d["id"] == i:
                return clone

    def __len__(self):
        return len(self.d["clones"])
276 277 278

    ### print info about each Windows stored 
    def info(self):
Mathieu Giraud's avatar
Mathieu Giraud committed
279
        print(self)
280
        for clone in self:
Mathieu Giraud's avatar
Mathieu Giraud committed
281
            print(clone)
282
        
283 284
    ### compute statistics about clones
    def build_stat(self):
285
        ranges = [.1, .01, .001, .0001, .00001, .000001, .0000001]
286
        result = [[0 for col in range(len(self.d['reads'].d["segmented"]))] for row in range(len(ranges))]
287

288 289
        for clone in self:
            for i, s in enumerate(clone.d["reads"]):
290
                for r in range(len(ranges)):
291
                    if s*1. / self.d['reads'].d['segmented'][i] >= ranges[r]:
292 293 294 295
                        break 
                result[r][i] += s
                
        for r in range(len(ranges)):
296 297
            ratio_in_string = '{0:.10f}'.format(ranges[r]).rstrip('0')
            self.d['reads'].d['distribution'][ratio_in_string] = result[r]
298 299 300 301
            
        #TODO V/D/J distrib and more 
        
        
302 303 304
    ### save / load to .json
    def save_json(self, output):
        '''save ListWindows in .json format''' 
Mathieu Giraud's avatar
Mathieu Giraud committed
305
        print("==>", output)
306 307
        with open(output, "w") as file:
            json.dump(self, file, indent=2, default=self.toJson)
308 309 310 311 312 313 314 315

    def load(self, file_path, *args, **kwargs):
        if not '.clntab' in file_path:
            self.load_vidjil(file_path, *args, **kwargs)
        else:
            self.load_clntab(file_path, *args, **kwargs)
        
    def load_vidjil(self, file_path, pipeline, verbose=True):
316 317 318 319 320
        '''init listWindows with data file
        Detects and selects the parser according to the file extension.'''

        # name = file_path.split("/")[-1]
        extension = file_path.split('.')[-1]
321 322

        if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
323
            print("<==", file_path, "\t", end=' ')
324
        
325
        with open(file_path, "r") as file:
326 327 328 329
                tmp = json.load(file, object_hook=self.toPython)     
                self.d=tmp.d
                self.check_version(file_path)
        
330 331
        if 'distribution' not in self.d['reads'].d:
            self.d['reads'].d['distribution'] = {}
332 333 334
        if pipeline: 
            # renaming, private pipeline
            f = '/'.join(file_path.split('/')[2:-1])
335
            if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
336
                print("[%s]" % f)
337 338 339

        else:
            f = file_path
340
            if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
341
                print()
342
        
Marc Duez's avatar
Marc Duez committed
343
        time = os.path.getmtime(file_path)
344
        self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")]
345

Marc Duez's avatar
Marc Duez committed
346 347 348
    def getTop(self, top):
        result = []
        
349 350 351
        for clone in self:
            if clone.d["top"] <= top :
                result.append(clone.d["id"])
Marc Duez's avatar
Marc Duez committed
352 353 354 355 356
        return result
        
    def filter(self, f):
        r = []
        
Marc Duez's avatar
Marc Duez committed
357
        reverseList = {}
358 359
        for i,w in enumerate(self.d["clones"]) :
            reverseList[w.d["id"]] = i
Marc Duez's avatar
Marc Duez committed
360 361 362
        
        #filteredList = { k: reverseList[k] for k in f }
        filteredList = dict(filter(lambda t: t[0] in f, reverseList.items()))
Marc Duez's avatar
Marc Duez committed
363
        
Marc Duez's avatar
Marc Duez committed
364
        for i in filteredList:
365
            r.append(self.d["clones"][filteredList[i]])
Marc Duez's avatar
Marc Duez committed
366
        
367
        self.d["clones"] = r
Marc Duez's avatar
Marc Duez committed
368
        
369 370 371 372
    ### 
    def __add__(self, other): 
        '''Combine two ListWindows into a unique ListWindows'''
        obj = ListWindows()
373 374
        l1 = len(self.d["reads"].d['segmented'])
        l2 = len(other.d["reads"].d['segmented'])
375

376 377 378
        concatenate_with_padding(obj.d, 
                                 self.d, l1,
                                 other.d, l2,
379 380
                                 ["clones", "links", "germlines",
                                  "vidjil_json_version"])
381
        
382
        obj.d["clones"]=self.fuseWindows(self.d["clones"], other.d["clones"], l1, l2)
383 384
        obj.d["samples"] = self.d["samples"] + other.d["samples"]
        obj.d["reads"] = self.d["reads"] + other.d["reads"]
385
        
386 387
        return obj
        
388 389 390 391 392 393
    ###
    def __mul__(self, other):
        
        for i in range(len(self.d["reads_segmented"])):
            self.d["reads_segmented"][i] += other.d["reads_segmented"][i] 
        
394
        self.d["clones"] += other.d["clones"]
395 396 397 398 399 400
        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        
        self.d["system_segmented"].update(other.d["system_segmented"])
        
        return self
        
401
    ### 
402
    def fuseWindows(self, w1, w2, l1, l2) :
403
        #store data in dict with "id" as key
404 405
        dico1 = {}
        for i in range(len(w1)) :
406
            dico1[ w1[i].d["id"] ] = w1[i]
407 408 409

        dico2 = {}
        for i in range(len(w2)) :
410
            dico2[ w2[i].d["id"] ] = w2[i]
411

412
        #concat data with the same key ("id")
413 414 415 416 417 418
        #if some data are missing we concat with an empty Windows()
        dico3 = {}
        for key in dico1 :
            if key in dico2 :
                dico3[key] = dico1[key] + dico2[key]
            else :
419
                w=Window(l2)
420 421 422 423
                dico3[key] = dico1[key] + w

        for key in dico2 :
            if key not in dico1 :
424
                w=Window(l1)
425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441
                dico3[key] = w + dico2[key]
        
        
        #sort by top
        tab = []
        for key in dico3 :
            tab.append((dico3[key], dico3[key].d["top"]))
        tab = sorted(tab, key=itemgetter(1))
        
        #store back in a List
        result=[]
        for i in range(len(tab)) :
            result.append(tab[i][0])
        
        return result
        
    
442 443
    def cut(self, limit, nb_points):
        '''Remove information from sequence/windows who never enter in the most represented sequences. Put this information in 'other' windows.'''
444

445
        length = len(self)
446
        w=[]
447 448 449

        others = OtherWindows(nb_points)

450 451 452
        for clone in self:
            if (int(clone.d["top"]) < limit or limit == 0) :
                w.append(clone)
453
            #else:
454
                #others += clone
455

456
        self.d["clones"] = w #+ list(others) 
457

Mathieu Giraud's avatar
Mathieu Giraud committed
458
        print("### Cut merged file, keeping window in the top %d for at least one point" % limit)
459 460
        return self
        
461
    def load_clntab(self, file_path, *args, **kwargs):
462 463 464
        '''Parser for .clntab file'''

        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
465
        self.d["samples"].d["original_names"] = [file_path]
466
        self.d["samples"].d["producer"] = ["EC-NGS central pipeline"]
467 468 469 470 471 472 473 474 475 476 477 478 479 480 481
        
        listw = []
        listc = []
        total_size = 0
        
        fichier = open(file_path,"r")
        for ligne in fichier:
            if "clonotype" in ligne:
                header_map = ligne.replace('\n', '').split('\t')
            else :
                tab = AccessedDict()
                for index, data in enumerate(ligne.split('\t')):
                    tab[header_map[index]] = data
                        
                w=Window(1)
482 483
                w.d["seg"] = {}
                
484
                #w.window=tab["sequence.seq id"] #use sequence id as window for .clntab data
485
                w.d["id"]=tab["sequence.raw nt seq"] #use sequence as window for .clntab data
486 487
                s = int(tab["sequence.size"])
                total_size += s
488
                w.d["reads"] = [ s ]
489

490 491
                w.d["germline"] = tab["sequence.V-GENE and allele"][:3] #system ...
                
492
                w.d["sequence"] = tab["sequence.raw nt seq"]
493
                w.d["seg"]["5"]=tab["sequence.V-GENE and allele"].split('=')[0]
494
                if (tab["sequence.D-GENE and allele"] != "") :
495 496
                    w.d["seg"]["4"]=tab["sequence.D-GENE and allele"].split('=')[0]
                w.d["seg"]["3"]=tab["sequence.J-GENE and allele"].split('=')[0]
497 498 499 500

                # use sequence.JUNCTION to colorize output (this is not exactly V/J !)
                junction = tab.get("sequence.JUNCTION.raw nt seq")
                position = w.d["sequence"].find(junction)
501
                
502
                if position >= 0:
503 504
                    w.d["seg"]["3start"] = position + len(junction)
                    w.d["seg"]["5end"] = position 
505
                else:
506 507 508 509 510 511
                    w.d["seg"]["3start"] = 0
                    w.d["seg"]["5end"] = len(w.d["sequence"])
                w.d["seg"]["3end"]=0
                w.d["seg"]["5start"]=0
                w.d["seg"]["4end"]=0
                w.d["seg"]["4start"]=0
512
                w.d["seg"]["cdr3"] = tab["sequence.JUNCTION.raw nt seq"][3:-3]
513
                    
514
                listw.append((w , w.d["reads"][0]))
515 516

                raw_clonotype = tab.get("clonotype")
517 518
                w.d["name"]=raw_clonotype # w.d["seg"]["5"] + " -x/y/-z " + w.d["seg"]["3"]

519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534
                clonotype = raw_clonotype.split(' ')
                if (len(clonotype) > 1) :
                    listc.append((w, raw_clonotype))
                
                #keep data that has not already been stored
                for header in tab.not_accessed_keys():
                    w.d["_"+header] = [tab[header]]

        #sort by sequence_size
        listw = sorted(listw, key=itemgetter(1), reverse=True)
        #sort by clonotype
        listc = sorted(listc, key=itemgetter(1))
        
        #generate data "top"
        for index in range(len(listw)):
            listw[index][0].d["top"]=index+1
535
            self.d["clones"].append(listw[index][0])
536
        
537 538
        self.d["reads"].d["segmented"] = [total_size]
        self.d["reads"].d["total"] = [total_size]
539 540 541 542

        
    def toJson(self, obj):
        '''Serializer for json module'''
543
        if isinstance(obj, ListWindows) or isinstance(obj, Window) or isinstance(obj, Samples) or isinstance(obj, Reads):
544 545 546
            result = {}
            for key in obj.d :
                result[key]= obj.d[key]
Marc Duez's avatar
Marc Duez committed
547
                
548 549
            return result
            raise TypeError(repr(obj) + " fail !") 
550
        
Marc Duez's avatar
Marc Duez committed
551
        else:
552 553 554 555 556 557
            result = {}
            for key in obj :
                result[key]= obj[key]
            
            return result
            raise TypeError(repr(obj) + " fail !") 
558 559 560

    def toPython(self, obj_dict):
        '''Reverse serializer for json module'''
561
        if "samples" in obj_dict:
562
            obj = ListWindows()
563
            obj.d=obj_dict
564 565
            return obj

566
        if "id" in obj_dict:
567
            obj = Window(1)
568
            obj.d=obj_dict
569
            return obj
Marc Duez's avatar
Marc Duez committed
570
        
571 572
        if "total" in obj_dict:
            obj = Reads()
573
            obj.d=obj_dict
574 575
            return obj
            
Marc Duez's avatar
Marc Duez committed
576 577
        if "original_names" in obj_dict:
            obj = Samples()
578
            obj.d=obj_dict
579
            return obj
Marc Duez's avatar
Marc Duez committed
580
            
581
        return obj_dict
582 583 584 585 586 587
        



### some data used for test
w1 = Window(1)
588
w1.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
589
w2 = Window(1)
590
w2.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
591
w3 = Window(1)
592
w3.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
593
w4 = Window(1)
594
w4.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
595 596 597


w5 = Window(1)
598
w5.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
599
w6 = Window(1)
600
w6.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
601 602

lw1 = ListWindows()
603
lw1.d["timestamp"] = 'ts'
604
lw1.d["reads"] = json.loads('{"total": [30], "segmented": [25], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
605 606
lw1.d["clones"].append(w5)
lw1.d["clones"].append(w6)
607 608

w7 = Window(1)
609
w7.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
610
w8 = Window(1)
611
w8.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
612 613

lw2 = ListWindows()
614
lw2.d["timestamp"] = 'ts'
615
lw2.d["reads"] = json.loads('{"total": [40], "segmented": [34], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
616 617
lw2.d["clones"].append(w7)
lw2.d["clones"].append(w8)
618 619 620 621 622 623

    
    

 
def main():
Mathieu Giraud's avatar
Mathieu Giraud committed
624
    print("#", ' '.join(sys.argv))
625 626 627 628 629 630 631

    DESCRIPTION = 'Vidjil utility to parse and regroup list of clones of different timepoints or origins'
    
    #### Argument parser (argparse)

    parser = argparse.ArgumentParser(description= DESCRIPTION,
                                    epilog='''Example:
Mathieu Giraud's avatar
Mathieu Giraud committed
632
  python2 %(prog)s --germline IGH out/Diag/vidjil.data out/MRD-1/vidjil.data out/MRD-2/vidjil.data''',
633 634 635 636 637 638
                                    formatter_class=argparse.RawTextHelpFormatter)


    group_options = parser.add_argument_group() # title='Options and parameters')

    group_options.add_argument('--test', action='store_true', help='run self-tests')
639
    group_options.add_argument('--multi', action='store_true', help='merge different systems from a same timepoint (deprecated, do not use)')
640
    
641
    group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
642
    group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')
643

644
    group_options.add_argument('--output', '-o', type=str, default='fused.vidjil', help='output file (%(default)s)')
645 646 647 648 649 650 651 652 653 654 655 656 657 658
    group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)')

    parser.add_argument('file', nargs='+', help='''input files (.vidjil/.cnltab)''')
  
    args = parser.parse_args()
    # print args

    if args.test:
        import doctest
        doctest.testmod(verbose = True)
        sys.exit(0)

    jlist_fused = None

Mathieu Giraud's avatar
Mathieu Giraud committed
659 660
    print("### fuse.py -- " + DESCRIPTION)
    print()
661

Marc Duez's avatar
Marc Duez committed
662 663 664 665 666 667 668 669
    #filtre
    f = []
    for path_name in args.file:
        jlist = ListWindows()
        jlist.load(path_name, args.pipeline)
        f += jlist.getTop(args.top)
    f = sorted(set(f))
    
670
    if args.multi:
671 672
        for path_name in args.file:
            jlist = ListWindows()
673
            jlist.load(path_name, args.pipeline)
Marc Duez's avatar
Marc Duez committed
674
            jlist.build_stat()
675
            
Mathieu Giraud's avatar
Mathieu Giraud committed
676
            print("\t", jlist, end=' ')
677 678 679 680 681

            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused * jlist
682
                
Mathieu Giraud's avatar
Mathieu Giraud committed
683
            print('\t==> merge to', jlist_fused)
684
        jlist_fused.d["system_segmented"] = ordered(jlist_fused.d["system_segmented"], key=lambda sys: ((GERMLINES_ORDER + [sys]).index(sys), sys))
685
        
686
    else:
Mathieu Giraud's avatar
Mathieu Giraud committed
687
        print("### Read and merge input files")
688 689
        for path_name in args.file:
            jlist = ListWindows()
690
            jlist.load(path_name, args.pipeline)
Marc Duez's avatar
Marc Duez committed
691 692
            jlist.build_stat()
            jlist.filter(f)
Marc Duez's avatar
Marc Duez committed
693

694 695 696 697
            w1 = Window(1)
            w2 = Window(2)
            w3 = w1+w2
            
Mathieu Giraud's avatar
Mathieu Giraud committed
698
            print("\t", jlist, end=' ')
699 700 701 702 703
            # Merge lists
            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused + jlist
704
            
Mathieu Giraud's avatar
Mathieu Giraud committed
705
            print('\t==> merge to', jlist_fused)
706

707
    if args.compress:
Mathieu Giraud's avatar
Mathieu Giraud committed
708 709
        print()
        print("### Select point names")
710
        l = jlist_fused.d["samples"].d["original_names"]
711
        ll = interesting_substrings(l)
Mathieu Giraud's avatar
Mathieu Giraud committed
712 713
        print("  <==", l)
        print("  ==>", ll)
714
        jlist_fused.d["samples"].d["names"] = ll
715
    
Mathieu Giraud's avatar
Mathieu Giraud committed
716
    print()
717
    if not args.multi:
718
        jlist_fused.cut(args.top, jlist_fused.d["samples"].d["number"])
Mathieu Giraud's avatar
Mathieu Giraud committed
719 720
    print("\t", jlist_fused) 
    print()
721

722 723 724 725 726
    #compute similarity matrix
    fasta = ""
    for i in range(len(jlist_fused.d["clones"])) :
        fasta += ">>" + str(i) + "\n"
        fasta += jlist_fused.d["clones"][i].d["id"] + "\n"
727
    fasta_file = tempfile.NamedTemporaryFile()
728
    fasta_file.write(fasta)
729
    try:
730
        out = subprocess.check_output([TOOL_SIMILARITY, "-j", fasta_file.name])
731 732 733
        jlist_fused.d["similarity"] = json.loads(out)
    except OSError:
        print("! failed: %s" % TOOL_SIMILARITY)
Mathieu Giraud's avatar
Mathieu Giraud committed
734
    print("### Save merged file")
735
    jlist_fused.save_json(args.output)
736
    os.unlink(fasta_file.name)
737 738 739 740 741

    
    
if  __name__ =='__main__':
    main()