#!/usr/bin/env python3
# -*- coding: utf-8 -*-

### fuse.py - Vidjil utility to parse and regroup lists of clones from different timepoints or origins

#  This file is part of Vidjil <http://www.vidjil.org>,
#  High-throughput Analysis of V(D)J Immune Repertoire.
#  Copyright (C) 2011-2017 by Bonsai bioinformatics
#  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
#  Contributors: 
#      Marc Duez <marc.duez@vidjil.org>
#      Mathieu Giraud <mathieu.giraud@vidjil.org>
#      The Vidjil Team <contact@vidjil.org>
#
#  "Vidjil" is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  "Vidjil" is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>

from __future__ import print_function

import check_python_version
import sys
import json
import argparse
import os.path
import os
import datetime
import subprocess
import tempfile
from operator import itemgetter, le
from utils import *
from defs import *
from collections import defaultdict

FUSE_VERSION = "vidjil fuse"

TOOL_SIMILARITY = "../algo/tools/similarity"
SIMILARITY_LIMIT = 100

GERMLINES_ORDER = ['TRA', 'TRB', 'TRG', 'TRD', 'DD', 'IGH', 'DHJH', 'IJK', 'IJL'] 
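# Preferred ordering of the germlines/systems; in --multi mode it is used to
# sort 'system_segmented' (systems not listed here are appended afterwards).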

####

class Window:
    # Should be renamed "Clone"
    '''storage class for sequence information
    with some functions to group sequence information
    
    >>> str(w1)
    '<window : [5] 3 aaa>'
    
    >>> str(w3)
    '<window : [8] 4 aaa>'
    
    >>> str(w1 + w3)
    '<window : [5, 8] 3 aaa>'
    
    
    check that other information is conserved
    
    >>> (w2 + w4).d["test"]
    [0, 'plop']
    
    '''
    
    ### init Window with the minimum data required
    def __init__(self, size):
        self.d={}
        self.d["id"] = ""
        self.d["top"] = sys.maxsize
        self.d["reads"] = []
        for i in range(size):
            self.d["reads"].append(0)
        
        
    ### 
    def __iadd__(self, other):
        ### Not used now
        """Add other.reads to self.reads in-place, without extending lists"""

        assert(len(self.d['reads']) == len(other.d['reads']))
        self.d['reads'] = [my + her for (my, her) in zip(self.d['reads'], other.d['reads'])]

        return self
        
    def __add__(self, other):
        """Concat two windows, extending lists such as 'reads'"""
        #keys that are not concatenated: they are copied below from the clone with the best (lowest) 'top'
        myList = [ "seg", "top", "id", "sequence", "name", "stats", "germline"]
        obj = Window(1)
        
        concatenate_with_padding(obj.d,
                                 self.d, len(self.d["reads"]),
                                 other.d, len(other.d["reads"]),
                                 myList)
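        # concatenate_with_padding (from utils) is expected to merge the
        # per-sample lists of both clones, padding missing keys so that each
        # list keeps one cell per sample; the keys in myList are skipped here
        # and copied below from the clone with the best 'top'.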
                    
        #keep the other data that doesn't need to be concatenated (taken from the clone with the lowest 'top')
        if other.d["top"] < self.d["top"] :
            for key in other.d :
                if key in myList :
                    obj.d[key] = other.d[key]
        else :
            for key in self.d :
                if key in myList :
                    obj.d[key] = self.d[key]
        
        return obj
        
    def get_nb_reads(self, cid, point=0):
        return self[cid]["reads"][point]

    def latex(self, point=0, base_germline=10000, base=10000, tag=''):
        reads = self.d["reads"][point]
        ratio_germline = float(reads)/base_germline
        ratio = float(reads)/base
        return r"   &   & %7d & %5.2f\%% & of %-4s & %5.2f\%% & %-50s \\ %% %4s %s" % (reads, ratio_germline * 100, self.d['germline'], ratio * 100,
                                                                  self.d["name"] if 'name' in self.d else self.d["id"],
                                                                  tag, self.d["id"])

    ### print essential info about Window
    def __str__(self):
        return "<window : %s %s %s>" % ( self.d["reads"], '*' if self.d["top"] == sys.maxsize else self.d["top"], self.d["id"])
        
class Samples: 

    def __init__(self):
        self.d={}
        self.d["number"] = 1
        self.d["original_names"] = [""]

    def __add__(self, other):
        obj=Samples()

        concatenate_with_padding(obj.d, 
                                 self.d, self.d['number'], 
                                 other.d, other.d['number'],
                                 ['number'])

        obj.d["number"] =  int(self.d["number"]) + int(other.d["number"])
        
        return obj

    def __str__(self):
        return "<Samples: %s>" % self.d

class Diversity: 

    keys = ["index_H_entropy", "index_E_equitability", "index_Ds_diversity"]

    def __init__(self, data=None):
        self.d={}

        for k in self.keys:
            if data is None or k not in data:
                self.d[k] = ["na"]
            else:
                self.d[k]= [data[k]]

    def __add__(self, other):
        for k in self.keys:
            self.d[k].append(other.d[k][0])
        return self

    def __str__(self):
        return "<Diversity: %s>" % self.d
        
class Reads: 

    def __init__(self):
        self.d={}
        self.d["total"] = ["0"]
        self.d["segmented"] = ["0"]
        self.d["germline"] = {}
        self.d['distribution'] = {}

    def __add__(self, other):
        obj=Reads()

        concatenate_with_padding(obj.d['germline'], 
                                 self.d['germline'], len(self.d['total']),
                                 other.d['germline'], len(other.d['total']),
                                 ['total'])
        concatenate_with_padding(obj.d['distribution'],
                                 self.d['distribution'], len(self.d['total']),
                                 other.d['distribution'], len(other.d['total']),
                                 ['total'])
                
        obj.d["total"] = self.d["total"] + other.d["total"]
        obj.d["segmented"] = self.d["segmented"] + other.d["segmented"]
        return obj

    def __str__(self):
        return "<Reads: %s>" % self.d
        
class OtherWindows:
    
    """Aggregate counts of windows that are discarded (due to too small 'top') for each point into several 'others-' windows."""

    def __init__(self, length, ranges = None):
        self.ranges = ranges if ranges is not None else [1000, 100, 10, 1]
        self.length = length
        self.sizes = {}

        for r in self.ranges:
            self.sizes[r] = [0 for i in range(self.length)]


    def __iadd__(self, window):
        """Add to self a window. The different points may land into different 'others-' windows."""

        for i, s in enumerate(window.d["reads"]):
            for r in self.ranges:
                if s >= r:
                    break
            self.sizes[r][i] += s
            ## TODO: add seg_stat
            
        # print window, '-->', self.sizes
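        # bucket choice (illustrative): with the default ranges [1000, 100, 10, 1],
        # a point with 250 reads lands in 'others-100' (first r with s >= r),
        # while a point with 0 reads falls through to 'others-1'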
        return self

    def __iter__(self):
        """Yield the 'others-' windows"""

        for r in self.ranges:
            w = Window(self.length)
            w.d['reads'] = self.sizes[r]
            w.d['window'] = 'others-%d' % r
            w.d['top'] = 0

            print('  --[others]-->', w)
            yield w
        
class ListWindows(VidjilJson):
    '''storage class for sequence information
    
    >>> lw1.info()
    <ListWindows: [25] 2>
    <window : [5] 3 aaa>
    <window : [12] 2 bbb>
    
    >>> lw2.info()
    <ListWindows: [34] 2>
    <window : [8] 4 aaa>
    <window : [2] 8 ccc>
    
    >>> lw3 = lw1 + lw2
    >>> lw3.info()
    <ListWindows: [25, 34] 3>
    <window : [12, 0] 2 bbb>
    <window : [5, 8] 3 aaa>
    <window : [0, 2] 8 ccc>
    >>> lw3.build_stat()
    >>> lw3.d['reads'].d['distribution']['0.1']
    [17, 8]
    >>> lw3.d['reads'].d['distribution']['0.01']
    [0, 2]
    >>> lw3.d['reads'].d['distribution']['0.001']
    [0, 0]
    >>> lw3.d['reads'].d['distribution']['0.0001']
    [0, 0]

    '''
    
    def __init__(self):
        '''init ListWindows with the minimum data required'''
        self.d={}
        self.d["samples"] = Samples()
        self.d["reads"] = Reads()
        self.d["clones"] = []
        self.d["clusters"] = []
        self.d["germlines"] = {}
        
        self.d["vidjil_json_version"] = VIDJIL_JSON_VERSION
        self.d["producer"] = FUSE_VERSION
        self.d["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
    def __str__(self):
        return "<ListWindows: %s %d>" % ( self.d["reads"].d["segmented"], len(self) )

    # Iterator and access functions
    def __iter__(self):
        return self.d["clones"].__iter__()

    def __getitem__(self, i):
        ### not efficient !
        for clone in self:
            if clone.d["id"] == i:
                return clone

    def __len__(self):
        return len(self.d["clones"])

    ### print info about each Window stored
    def info(self):
        print(self)
        for clone in self:
            print(clone)
        
    ### compute statistics about clones
    def build_stat(self):
        ranges = [.1, .01, .001, .0001, .00001, .000001, .0000001]
        result = [[0 for col in range(len(self.d['reads'].d["segmented"]))] for row in range(len(ranges))]

        for clone in self:
            for i, s in enumerate(clone.d["reads"]):
                for r in range(len(ranges)):
                    if s*1. / self.d['reads'].d['segmented'][i] >= ranges[r]:
                        break 
                result[r][i] += s
                
        for r in range(len(ranges)):
            ratio_in_string = '{0:.10f}'.format(ranges[r]).rstrip('0')
            self.d['reads'].d['distribution'][ratio_in_string] = result[r]
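        # example (illustrative): with 25 segmented reads at a point, a clone
        # with 5 reads (ratio 0.2) is counted in the '0.1' bucket, as in the
        # lw3 doctest above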
            
        #TODO V/D/J distrib and more 
        
        
    ### save / load to .json
    def save_json(self, output):
        '''save ListWindows in .json format''' 
        print("==>", output)
        with open(output, "w") as f:
            json.dump(self, f, indent=2, default=self.toJson)

    def load(self, file_path, *args, **kwargs):
        if '.clntab' not in file_path:
            self.load_vidjil(file_path, *args, **kwargs)
        else:
            self.load_clntab(file_path, *args, **kwargs)

    def loads(self, string, *args, **kwargs):
        self.loads_vidjil(string, *args, **kwargs)

    def init_data(self, data):
        self.d=data.d
        # Be robust against 'null' values for clones
        if not self.d["clones"]:
            self.d["clones"] = []

        if "diversity" in self.d.keys():
            self.d["diversity"] = Diversity(self.d["diversity"])
        else:
            self.d["diversity"] = Diversity()
        
        if 'distribution' not in self.d['reads'].d:
            self.d['reads'].d['distribution'] = {}

        self.id_lengths = defaultdict(int)

        print("%%")
        for clone in self:
            self.id_lengths[len(clone.d['id'])] += 1
        print ("%% lengths .vidjil -> ", self.id_lengths)
        try:
            print("%% run_v ->", self.d["samples"].d["producer"], self.d["samples"].d["run_timestamp"])
        except KeyError:
            pass

    def load_vidjil(self, file_path, pipeline, verbose=True):
        '''init listWindows with data file
        Detects and selects the parser according to the file extension.'''
        extension = file_path.split('.')[-1]

        if verbose:
            print("<==", file_path, "\t", end=' ')

        with open(file_path, "r") as f:
            self.init_data(json.load(f, object_hook=self.toPython))
            self.check_version(file_path)

        if pipeline: 
            # renaming, private pipeline
            f = '/'.join(file_path.split('/')[2:-1])
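            # keep only the middle of the path (drop the first two path
            # components and the file name) to form a short sample label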
            if verbose:
                print("[%s]" % f)

        else:
            f = file_path
            if verbose:
                print()

        time = os.path.getmtime(file_path)
        self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")]

    def loads_vidjil(self, string, pipeline, verbose=True):
        '''init listWindows with a json string'''
        self.init_data(json.loads(string, object_hook=self.toPython))

    def getTop(self, top):
        result = []
        
        for clone in self:
            if clone.d["top"] <= top :
                result.append(clone.d["id"])
        return result
        
    def filter(self, f):
        r = []
        
        reverseList = {}
        for i,w in enumerate(self.d["clones"]) :
            reverseList[w.d["id"]] = i
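        # reverseList maps clone id -> index in self.d["clones"], so the clones
        # listed in f can be retrieved without rescanning the whole list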
        
        #filteredList = { k: reverseList[k] for k in f }
        filteredList = dict(filter(lambda t: t[0] in f, reverseList.items()))
        
        for i in filteredList:
            r.append(self.d["clones"][filteredList[i]])
        
        self.d["clones"] = r
        
    ### 
    def __add__(self, other): 
        '''Combine two ListWindows into a unique ListWindows'''
        obj = ListWindows()
        l1 = len(self.d["reads"].d['segmented'])
        l2 = len(other.d["reads"].d['segmented'])
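        # l1 / l2: number of points (samples) on each side; they drive the
        # padding applied when a clone or a key exists on only one side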

        concatenate_with_padding(obj.d, 
                                 self.d, l1,
                                 other.d, l2,
                                 ["clones", "links", "germlines",
                                  "vidjil_json_version"])
        
        obj.d["clones"]=self.fuseWindows(self.d["clones"], other.d["clones"], l1, l2)
        obj.d["samples"] = self.d["samples"] + other.d["samples"]
        obj.d["reads"] = self.d["reads"] + other.d["reads"]
        obj.d["diversity"] = self.d["diversity"] + other.d["diversity"]
        
        return obj
        
    ###
    def __mul__(self, other):
        
        for i in range(len(self.d["reads_segmented"])):
            self.d["reads_segmented"][i] += other.d["reads_segmented"][i] 
        
        self.d["clones"] += other.d["clones"]
        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        
        self.d["system_segmented"].update(other.d["system_segmented"])
        
        return self
        
    ### 
    def fuseWindows(self, w1, w2, l1, l2) :
        #store data in dict with "id" as key
        dico1 = {}
        for i in range(len(w1)) :
            dico1[ w1[i].d["id"] ] = w1[i]

        dico2 = {}
        for i in range(len(w2)) :
            dico2[ w2[i].d["id"] ] = w2[i]

        #concat data with the same key ("id")
        #if a clone is missing on one side, concatenate it with an empty Window()
        dico3 = {}
        for key in dico1 :
            if key in dico2 :
                dico3[key] = dico1[key] + dico2[key]
            else :
                w=Window(l2)
                dico3[key] = dico1[key] + w

        for key in dico2 :
            if key not in dico1 :
                w=Window(l1)
                dico3[key] = w + dico2[key]
        
        
        #sort by top
        tab = []
        for key in dico3 :
            tab.append((dico3[key], dico3[key].d["top"]))
        tab = sorted(tab, key=itemgetter(1))
        
        #store back in a List
        result=[]
        for i in range(len(tab)) :
            result.append(tab[i][0])
        
        return result
        
    
    def cut(self, limit, nb_points):
        '''Remove information from sequences/windows that never enter the most represented sequences; put this information into 'others-' windows.'''

        w=[]

        others = OtherWindows(nb_points)

        for clone in self:
            if (int(clone.d["top"]) < limit or limit == 0) :
                w.append(clone)
            #else:
                #others += clone

        self.d["clones"] = w #+ list(others) 

        print("### Cut merged file, keeping window in the top %d for at least one point" % limit)
        return self
        
    def load_clntab(self, file_path, *args, **kwargs):
        '''Parser for .clntab file'''

        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        self.d["samples"].d["original_names"] = [file_path]
        self.d["samples"].d["producer"] = ["EC-NGS central pipeline"]
        
        listw = []
        listc = []
        total_size = 0
        
        fichier = open(file_path,"r")
        for ligne in fichier:
            if "clonotype" in ligne:
                header_map = ligne.replace('\n', '').split('\t')
            else :
                tab = AccessedDict()
                for index, data in enumerate(ligne.split('\t')):
                    tab[header_map[index]] = data
                        
                w=Window(1)
                w.d["seg"] = {}
                
                #w.window=tab["sequence.seq id"] #use sequence id as window for .clntab data
                w.d["id"]=tab["sequence.raw nt seq"] #use sequence as window for .clntab data
                s = int(tab["sequence.size"])
                total_size += s
                w.d["reads"] = [ s ]

                w.d["germline"] = tab["sequence.V-GENE and allele"][:3] #system ...
                
                w.d["sequence"] = tab["sequence.raw nt seq"]
                w.d["seg"]["5"]=tab["sequence.V-GENE and allele"].split('=')[0]
                if (tab["sequence.D-GENE and allele"] != "") :
                    w.d["seg"]["4"]=tab["sequence.D-GENE and allele"].split('=')[0]
                w.d["seg"]["3"]=tab["sequence.J-GENE and allele"].split('=')[0]

                # use sequence.JUNCTION to colorize output (this is not exactly V/J !)
                junction = tab.get("sequence.JUNCTION.raw nt seq")
                position = w.d["sequence"].find(junction)
                
                if position >= 0:
                    w.d["seg"]["3start"] = position + len(junction)
                    w.d["seg"]["5end"] = position 
                else:
                    w.d["seg"]["3start"] = 0
                    w.d["seg"]["5end"] = len(w.d["sequence"])
                w.d["seg"]["3end"]=0
                w.d["seg"]["5start"]=0
                w.d["seg"]["4end"]=0
                w.d["seg"]["4start"]=0
                w.d["seg"]["cdr3"] = tab["sequence.JUNCTION.raw nt seq"][3:-3]
                    
                listw.append((w , w.d["reads"][0]))

                raw_clonotype = tab.get("clonotype")
                w.d["name"]=raw_clonotype # w.d["seg"]["5"] + " -x/y/-z " + w.d["seg"]["3"]

                clonotype = raw_clonotype.split(' ')
                if (len(clonotype) > 1) :
                    listc.append((w, raw_clonotype))
                
                #keep data that has not already been stored
                for header in tab.not_accessed_keys():
                    w.d["_"+header] = [tab[header]]

        #sort by sequence_size
        listw = sorted(listw, key=itemgetter(1), reverse=True)
        #sort by clonotype
        listc = sorted(listc, key=itemgetter(1))
        
        #generate data "top"
        for index in range(len(listw)):
            listw[index][0].d["top"]=index+1
587
            self.d["clones"].append(listw[index][0])
        
        self.d["reads"].d["segmented"] = [total_size]
        self.d["reads"].d["total"] = [total_size]

        
    def toJson(self, obj):
        '''Serializer for json module'''
        if isinstance(obj, (ListWindows, Window, Samples, Reads, Diversity)):
            result = {}
            for key in obj.d:
                result[key] = obj.d[key]
            return result

        else:
            result = {}
            for key in obj:
                result[key] = obj[key]
            return result

    def toPython(self, obj_dict):
        '''Reverse serializer for json module'''
        if "samples" in obj_dict:
            obj = ListWindows()
            obj.d=obj_dict
            return obj

        if "id" in obj_dict:
            obj = Window(1)
            obj.d=obj_dict
            return obj
        
        if "total" in obj_dict:
            obj = Reads()
            obj.d=obj_dict
            return obj
            
        if "original_names" in obj_dict:
            obj = Samples()
            obj.d=obj_dict
            return obj
            
        return obj_dict
        



### some data used for test
w1 = Window(1)
w1.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
w2 = Window(1)
w2.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
w3 = Window(1)
w3.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
w4 = Window(1)
w4.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }


w5 = Window(1)
w5.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
w6 = Window(1)
w6.d ={"id" : "bbb", "reads" : [12], "top" : 2 }

lw1 = ListWindows()
lw1.d["timestamp"] = 'ts'
lw1.d["reads"] = json.loads('{"total": [30], "segmented": [25], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
lw1.d["clones"].append(w5)
lw1.d["clones"].append(w6)
lw1.d["diversity"] = Diversity()


w7 = Window(1)
w7.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
w8 = Window(1)
w8.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }

lw2 = ListWindows()
lw2.d["timestamp"] = 'ts'
lw2.d["reads"] = json.loads('{"total": [40], "segmented": [34], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
lw2.d["clones"].append(w7)
lw2.d["clones"].append(w8)
lw2.d["diversity"] = Diversity()

    
    

 
def main():
    print("#", ' '.join(sys.argv))

    DESCRIPTION = 'Vidjil utility to parse and regroup lists of clones from different timepoints or origins'
    
    #### Argument parser (argparse)

    parser = argparse.ArgumentParser(description= DESCRIPTION,
                                    epilog='''Example:
Mathieu Giraud's avatar
Mathieu Giraud committed
687
  python2 %(prog)s --germline IGH out/Diag/vidjil.data out/MRD-1/vidjil.data out/MRD-2/vidjil.data''',
688 689 690 691 692 693
                                    formatter_class=argparse.RawTextHelpFormatter)


    group_options = parser.add_argument_group() # title='Options and parameters')

    group_options.add_argument('--test', action='store_true', help='run self-tests')
694
    group_options.add_argument('--multi', action='store_true', help='merge different systems from a same timepoint (deprecated, do not use)')
695
    
696
    group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
697
    group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')
698
    group_options.add_argument('--ijson', action='store_true', help='use the ijson vidjilparser')
699

700
    group_options.add_argument('--output', '-o', type=str, default='fused.vidjil', help='output file (%(default)s)')
701 702
    group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)')

703
    group_options.add_argument('--first', '-f', type=int, default=0, help='take only into account the first FIRST files (0 for all) (%(default)s)')
704

705 706 707 708 709 710 711 712 713 714 715 716
    parser.add_argument('file', nargs='+', help='''input files (.vidjil/.cnltab)''')
  
    args = parser.parse_args()
    # print args

    if args.test:
        import doctest
        doctest.testmod(verbose = True)
        sys.exit(0)

    jlist_fused = None

    print("### fuse.py -- " + DESCRIPTION)
    print()

    files = args.file
    if args.first:
        if len(files) > args.first:
            print("! %d files were given. We take into account the first %d files." % (len(files), args.first))
        files = files[:args.first]

    # filter: ids of the clones to keep
    f = []

    if args.ijson:
        from vidjilparser import VidjilParser
        vparser = VidjilParser()
        vparser.addPrefix('clones.item', 'clones.item.top', le, args.top)
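        # first ijson pass: stream only the clones whose 'top' is <= args.top,
        # avoiding loading each whole .vidjil file into memory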

    for path_name in files:
        if args.ijson:
            json_clones = vparser.extract(path_name)
            clones = json.loads(json_clones)
            if clones["clones"] is not None:
                f += [c['id'] for c in clones["clones"]]
        else:
            jlist = ListWindows()
            jlist.load(path_name, args.pipeline)
            f += jlist.getTop(args.top)

    f = sorted(set(f))
    
    if args.ijson:
        vparser.reset()
        vparser.addPrefix('')
        vparser.addPrefix('clones.item', 'clones.item.id', (lambda x, y: x in y), f)
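        # second ijson pass setup: re-extract everything, restricting the
        # clones to those whose id belongs to the filter list f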

    if args.multi:
        for path_name in files:
            jlist = ListWindows()
            if args.ijson:
                json_reads = vparser.extract(path_name)
                jlist.loads(json_reads, args.pipeline)
            else:
                jlist.load(path_name, args.pipeline)
                jlist.build_stat()
            
            print("\t", jlist, end=' ')

            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused * jlist
                
            print('\t==> merge to', jlist_fused)
        jlist_fused.d["system_segmented"] = ordered(jlist_fused.d["system_segmented"], key=lambda sys: ((GERMLINES_ORDER + [sys]).index(sys), sys))
        
    else:
        print("### Read and merge input files")
        for path_name in files:
            jlist = ListWindows()
            if args.ijson:
                json_reads = vparser.extract(path_name)
                jlist.loads(json_reads, args.pipeline)
            else:
                jlist.load(path_name, args.pipeline)
                jlist.build_stat()
                jlist.filter(f)

            w1 = Window(1)
            w2 = Window(2)
            w3 = w1+w2
            
            print("\t", jlist, end=' ')
            # Merge lists
            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused + jlist
            
            print('\t==> merge to', jlist_fused)

    if args.compress:
        print()
        print("### Select point names")
        l = jlist_fused.d["samples"].d["original_names"]
        ll = interesting_substrings(l)
        print("  <==", l)
        print("  ==>", ll)
        jlist_fused.d["samples"].d["names"] = ll
    
    print()
    if not args.multi:
        jlist_fused.cut(args.top, jlist_fused.d["samples"].d["number"])
    print("\t", jlist_fused) 
    print()

    #compute similarity matrix
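    # (run only for small clone sets: the external similarity tool reads a
    # temporary FASTA file and returns a JSON matrix stored under 'similarity')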
    if len(jlist_fused.d["clones"]) < SIMILARITY_LIMIT :
        fasta = ""
        for i in range(len(jlist_fused.d["clones"])) :
            fasta += ">>" + str(i) + "\n"
            fasta += jlist_fused.d["clones"][i].d["id"] + "\n"
        fasta_file = tempfile.NamedTemporaryFile(mode="w", delete=False)
        fasta_file.write(fasta)
        fasta_file.close()   # flush the FASTA to disk so the external tool can read it
        try:
            out = subprocess.check_output([TOOL_SIMILARITY, "-j", fasta_file.name])
            jlist_fused.d["similarity"] = json.loads(out)
        except OSError:
            print("! failed: %s" % TOOL_SIMILARITY)
        finally:
            os.unlink(fasta_file.name)
    else : 
        jlist_fused.d["similarity"] = [];
        
    print("### Save merged file")
    jlist_fused.save_json(args.output)
    
    
    
if __name__ == '__main__':
    main()