#!/usr/bin/env python3
# -*- coding: utf-8 -*-

### fuse.py - Vidjil utility to parse and regroup lists of clones from different timepoints or origins

#  This file is part of Vidjil <http://www.vidjil.org>,
#  High-throughput Analysis of V(D)J Immune Repertoire.
#  Copyright (C) 2011-2017 by Bonsai bioinformatics
#  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
#  Contributors:
#      Marc Duez <marc.duez@vidjil.org>
#      Mathieu Giraud <mathieu.giraud@vidjil.org>
#      The Vidjil Team <contact@vidjil.org>
#
#  "Vidjil" is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  "Vidjil" is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>

from __future__ import print_function

import check_python_version
import sys
import json
import argparse
import os.path
import os
import datetime
import subprocess
import tempfile
from operator import itemgetter, le
from utils import *
from defs import *
from collections import defaultdict

FUSE_VERSION = "vidjil fuse"

TOOL_SIMILARITY = "../algo/tools/similarity"
SIMILARITY_LIMIT = 100

GERMLINES_ORDER = ['TRA', 'TRB', 'TRG', 'TRD', 'DD', 'IGH', 'DHJH', 'IJK', 'IJL']

####

class Window:
    # Should be renamed "Clone"
    '''storage class for sequence information
    with some functions to group sequence information
    
    >>> str(w1)
    '<window : [5] 3 aaa>'
    
    >>> str(w3)
    '<window : [8] 4 aaa>'
    
    >>> str(w1 + w3)
    '<window : [5, 8] 3 aaa>'
    
    
    check that other information is conserved
    
    >>> (w2 + w4).d["test"]
    [0, 'plop']
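
    additional check: whatever the operand order, the clone with the
    smallest 'top' provides 'top' and 'id'

    >>> str(w3 + w1)
    '<window : [8, 5] 3 aaa>'

    >>> (w1 + w3).d["top"]
    3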
    
    '''
    
    ### init Window with the minimum data required
    def __init__(self, size):
        self.d={}
        self.d["id"] = ""
        self.d["top"] = sys.maxsize
        self.d["reads"] = []
        for i in range(size):
            self.d["reads"].append(0)


    ###
    def __iadd__(self, other):
        ### Not used now
        """Add other.reads to self.reads in-place, without extending lists"""

        assert(len(self.d['reads']) == len(other.d['reads']))
        self.d['reads'] = [my + her for (my, her) in zip(self.d['reads'], other.d['reads'])]

        return self
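
    # Rough usage sketch of __iadd__ (not a doctest), assuming both windows
    # cover the same number of points:
    #   wa = Window(2); wa.d['reads'] = [3, 1]
    #   wb = Window(2); wb.d['reads'] = [4, 0]
    #   wa += wb        # wa.d['reads'] is now [7, 1]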
        
    def __add__(self, other):
        """Concat two windows, extending lists such as 'reads'"""
        #data we don't need to duplicate
        myList = [ "seg", "top", "id", "sequence", "name", "stats", "germline"]
        obj = Window(1)

        # 'id' and 'top' will be taken from 'topmost' clone
        del obj.d["id"]
        del obj.d["top"]

        # Data of type 'list'
        concatenate_with_padding(obj.d,
                                 self.d, len(self.d["reads"]),
                                 other.d, len(other.d["reads"]),
                                 myList)

        # All other data, including 'top'
        # When there are conflicting keys, keep data from the 'topmost' clone
        order = [other, self] if other.d["top"] < self.d["top"] else [self, other]

        for source in order:
            for key in source.d:
                if key not in obj.d:
                    obj.d[key] = source.d[key]

        return obj

    def get_nb_reads(self, cid, point=0):
        return self[cid]["reads"][point]

    def latex(self, point=0, base_germline=10000, base=10000, tag=''):
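        # Builds one LaTeX table row: read count at 'point', its percentage of
        # 'base_germline' (with the germline name), its percentage of 'base',
        # then the clone name (or id); 'tag' and the id end up in a trailing
        # LaTeX comment.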
        reads = self.d["reads"][point]
        ratio_germline = float(reads)/base_germline
        ratio = float(reads)/base
        return r"   &   & %7d & %5.2f\%% & of %-4s & %5.2f\%% & %-50s \\ %% %4s %s" % (reads, ratio_germline * 100, self.d['germline'], ratio * 100,
                                                                  self.d["name"] if 'name' in self.d else self.d["id"],
                                                                  tag, self.d["id"])

    ### print essential info about Window
    def __str__(self):
        return "<window : %s %s %s>" % ( self.d["reads"], '*' if self.d["top"] == sys.maxsize else self.d["top"], self.d["id"])

class Samples:

    def __init__(self):
        self.d={}
        self.d["number"] = 1
        self.d["original_names"] = [""]

    def __add__(self, other):
        obj=Samples()

        concatenate_with_padding(obj.d,
                                 self.d, self.d['number'],
                                 other.d, other.d['number'],
                                 ['number'])

        obj.d["number"] =  int(self.d["number"]) + int(other.d["number"])

        return obj

    def __str__(self):
        return "<Samples: %s>" % self.d

class Diversity:

    keys = ["index_H_entropy", "index_E_equitability", "index_Ds_diversity"]

    def __init__(self, data=None):
        self.d={}

        for k in self.keys:
            if data is None or k not in data:
                self.d[k] = ["na"]
            else:
                self.d[k]= [data[k]]
    def __add__(self, other):
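        # Appends, in place, the single value of 'other' for each index
        # (one value per fused sample) and returns self.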
        for k in self.keys:
            self.d[k].append(other.d[k][0])
        return self

    def __str__(self):
        return "<Diversity: %s>" % self.d

class Reads: 

    def __init__(self):
        self.d={}
        self.d["total"] = ["0"]
        self.d["segmented"] = ["0"]
        self.d["germline"] = {}
        self.d['distribution'] = {}

    def __add__(self, other):
        obj=Reads()

        concatenate_with_padding(obj.d['germline'],
                                 self.d['germline'], len(self.d['total']),
                                 other.d['germline'], len(other.d['total']),
                                 ['total'])
        concatenate_with_padding(obj.d['distribution'],
                                 self.d['distribution'], len(self.d['total']),
                                 other.d['distribution'], len(other.d['total']),
                                 ['total'])

        obj.d["total"] = self.d["total"] + other.d["total"]
        obj.d["segmented"] = self.d["segmented"] + other.d["segmented"]
        return obj

    def __str__(self):
        return "<Reads: %s>" % self.d

class OtherWindows:

    """Aggregate, for each point, the counts of windows that are discarded (because their 'top' exceeds the limit) into several 'others-' windows."""

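    # Hypothetical usage sketch (these calls appear, commented out, in
    # ListWindows.cut() below):
    #   others = OtherWindows(nb_points)
    #   others += discarded_clone   # each point lands in its 'others-<range>' bucket
    #   list(others)                # yields one aggregated Window per range
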
    def __init__(self, length, ranges = None):
        self.ranges = ranges if ranges is not None else [1000, 100, 10, 1]
        self.length = length
        self.sizes = {}

        for r in self.ranges:
            self.sizes[r] = [0 for i in range(self.length)]


    def __iadd__(self, window):
        """Add to self a window. The different points may land into different 'others-' windows."""

        for i, s in enumerate(window.d["reads"]):
            for r in self.ranges:
                if s >= r:
                    break
            self.sizes[r][i] += s
            ## TODO: add seg_stat

        # print window, '-->', self.sizes
        return self

    def __iter__(self):
        """Yield the 'others-' windows"""

        for r in self.ranges:
            w = Window(self.length)
            w.d['reads'] = self.sizes[r]
            w.d['window'] = 'others-%d' % r
            w.d['top'] = 0

            print('  --[others]-->', w)
            yield w

class ListWindows(VidjilJson):
    '''storage class for sequence information
    
    >>> lw1.info()
    <ListWindows: [25] 2>
    <window : [5] 3 aaa>
    <window : [12] 2 bbb>

    >>> lw2.info()
    <ListWindows: [34] 2>
    <window : [8] 4 aaa>
    <window : [2] 8 ccc>

    >>> lw3 = lw1 + lw2
    >>> lw3.info()
    <ListWindows: [25, 34] 3>
    <window : [12, 0] 2 bbb>
    <window : [5, 8] 3 aaa>
    <window : [0, 2] 8 ccc>
    >>> lw3.build_stat()
    >>> lw3.d['reads'].d['distribution']['0.1']
    [17, 8]
    >>> lw3.d['reads'].d['distribution']['0.01']
    [0, 2]
    >>> lw3.d['reads'].d['distribution']['0.001']
    [0, 0]
    >>> lw3.d['reads'].d['distribution']['0.0001']
    [0, 0]
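
    sample-level read counts should simply be concatenated

    >>> lw3.d['reads'].d['total']
    [30, 40]
    >>> lw3.d['reads'].d['segmented']
    [25, 34]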

    '''

    def __init__(self):
        '''init ListWindows with the minimum data required'''
        self.d={}
        self.d["samples"] = Samples()
        self.d["reads"] = Reads()
        self.d["clones"] = []
        self.d["clusters"] = []
        self.d["germlines"] = {}

        self.d["vidjil_json_version"] = VIDJIL_JSON_VERSION
        self.d["producer"] = FUSE_VERSION
        self.d["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def __str__(self):
        return "<ListWindows: %s %d>" % ( self.d["reads"].d["segmented"], len(self) )

    # Iterator and access functions
    def __iter__(self):
        return self.d["clones"].__iter__()

    def __getitem__(self, i):
        ### not efficient !
        for clone in self:
            if clone.d["id"] == i:
                return clone

    def __len__(self):
        return len(self.d["clones"])

    ### print info about each Window stored
    def info(self):
        print(self)
        for clone in self:
            print(clone)

    ### compute statistics about clones
    def build_stat(self):
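        # Bin each clone's reads at point i by the ratio reads / segmented[i];
        # e.g. in the class doctest above, clone 'ccc' (2 reads out of 34
        # segmented, ratio ~0.06) is counted in the '0.01' bin.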
        ranges = [.1, .01, .001, .0001, .00001, .000001, .0000001]
        result = [[0 for col in range(len(self.d['reads'].d["segmented"]))] for row in range(len(ranges))]

        for clone in self:
            for i, s in enumerate(clone.d["reads"]):
                for r in range(len(ranges)):
                    if s*1. / self.d['reads'].d['segmented'][i] >= ranges[r]:
                        break 
                result[r][i] += s
                
        for r in range(len(ranges)):
            ratio_in_string = '{0:.10f}'.format(ranges[r]).rstrip('0')
            self.d['reads'].d['distribution'][ratio_in_string] = result[r]

        #TODO V/D/J distrib and more

    ### save / load to .json
    def save_json(self, output):
        '''save ListWindows in .json format''' 
        print("==>", output)
        with open(output, "w") as f:
            json.dump(self, f, indent=2, default=self.toJson)

    def load(self, file_path, *args, **kwargs):
        if not '.clntab' in file_path:
            self.load_vidjil(file_path, *args, **kwargs)
        else:
            self.load_clntab(file_path, *args, **kwargs)

    def loads(self, string, *args, **kwargs):
        self.loads_vidjil(string, *args, **kwargs)

    def init_data(self, data):
        self.d=data.d
        # Be robust against 'null' values for clones
        if not self.d["clones"]:
            self.d["clones"] = []

        if "diversity" in self.d.keys():
            self.d["diversity"] = Diversity(self.d["diversity"])
        else:
            self.d["diversity"] = Diversity()

        if 'distribution' not in self.d['reads'].d:
            self.d['reads'].d['distribution'] = {}

        self.id_lengths = defaultdict(int)

        print("%%")
        for clone in self:
            self.id_lengths[len(clone.d['id'])] += 1
        print ("%% lengths .vidjil -> ", self.id_lengths)
        try:
            print("%% run_v ->", self.d["samples"].d["producer"], self.d["samples"].d["run_timestamp"])
        except KeyError:
            pass

    def load_vidjil(self, file_path, pipeline, verbose=True):
        '''init listWindows with data file
        Detects and selects the parser according to the file extension.'''
        extension = file_path.split('.')[-1]

        if verbose:
            print("<==", file_path, "\t", end=' ')

        with open(file_path, "r") as f:
            self.init_data(json.load(f, object_hook=self.toPython))
            self.check_version(file_path)

        if pipeline:
            # renaming, private pipeline
            f = '/'.join(file_path.split('/')[2:-1])
            if verbose:
                print("[%s]" % f)

        else:
            f = file_path
            if verbose:
                print()

        time = os.path.getmtime(file_path)
        self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")]

    def loads_vidjil(self, string, pipeline, verbose=True):
        '''init listWindows with a json string'''
        self.init_data(json.loads(string, object_hook=self.toPython))

    def getTop(self, top):
        result = []

        for clone in self:
            if clone.d["top"] <= top :
                result.append(clone.d["id"])
        return result

    def filter(self, f):
        r = []

        reverseList = {}
        for i,w in enumerate(self.d["clones"]) :
            reverseList[w.d["id"]] = i
        
        #filteredList = { k: reverseList[k] for k in f }
        filteredList = dict(filter(lambda t: t[0] in f, reverseList.items()))

        for i in filteredList:
            r.append(self.d["clones"][filteredList[i]])

        self.d["clones"] = r

    ### 
    def __add__(self, other): 
        '''Combine two ListWindows into a unique ListWindows'''
        obj = ListWindows()
        l1 = len(self.d["reads"].d['segmented'])
        l2 = len(other.d["reads"].d['segmented'])

        concatenate_with_padding(obj.d,
                                 self.d, l1,
                                 other.d, l2,
                                 ["clones", "links", "germlines",
                                  "vidjil_json_version"])

        obj.d["clones"]=self.fuseWindows(self.d["clones"], other.d["clones"], l1, l2)
        obj.d["samples"] = self.d["samples"] + other.d["samples"]
        obj.d["reads"] = self.d["reads"] + other.d["reads"]
        obj.d["diversity"] = self.d["diversity"] + other.d["diversity"]

        return obj

    ###
    def __mul__(self, other):
        
        for i in range(len(self.d["reads_segmented"])):
            self.d["reads_segmented"][i] += other.d["reads_segmented"][i] 
        
        self.d["clones"] += other.d["clones"]
        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]

        self.d["system_segmented"].update(other.d["system_segmented"])

        return self

    ###
    def fuseWindows(self, w1, w2, l1, l2) :
        #store data in dict with "id" as key
        dico1 = {}
        for i in range(len(w1)) :
            dico1[ w1[i].d["id"] ] = w1[i]

        dico2 = {}
        for i in range(len(w2)) :
            dico2[ w2[i].d["id"] ] = w2[i]

        #concat data with the same key ("id")
        #if some data are missing we concat with an empty Windows()
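        # e.g. fusing {aaa, bbb} with {aaa, ccc} gives aaa merged over both
        # points, bbb padded with an empty second point, and ccc padded with
        # an empty first point (see the ListWindows doctest above)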
        dico3 = {}
        for key in dico1 :
            if key in dico2 :
                dico3[key] = dico1[key] + dico2[key]
            else :
                w=Window(l2)
                dico3[key] = dico1[key] + w

        for key in dico2 :
            if key not in dico1 :
                w=Window(l1)
                dico3[key] = w + dico2[key]
        
        
        #sort by top
        tab = []
        for key in dico3 :
            tab.append((dico3[key], dico3[key].d["top"]))
        tab = sorted(tab, key=itemgetter(1))
        
        #store back in a List
        result=[]
        for i in range(len(tab)) :
            result.append(tab[i][0])
        
        return result
        
    
    def cut(self, limit, nb_points):
        '''Remove information from sequences/windows that never enter the most represented sequences; put this information into 'other' windows.'''
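        # e.g. with limit=50, a clone that is never ranked in the top 50 at
        # any point is dropped (the OtherWindows aggregation below is
        # currently disabled).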

        w=[]

        others = OtherWindows(nb_points)

        for clone in self:
            if (int(clone.d["top"]) < limit or limit == 0) :
                w.append(clone)
            #else:
                #others += clone

        self.d["clones"] = w #+ list(others)

        print("### Cut merged file, keeping window in the top %d for at least one point" % limit)
        return self

    def load_clntab(self, file_path, *args, **kwargs):
        '''Parser for .clntab file'''

        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        self.d["samples"].d["original_names"] = [file_path]
        self.d["samples"].d["producer"] = ["EC-NGS central pipeline"]
        
        listw = []
        listc = []
        total_size = 0
        
        fichier = open(file_path,"r")
        for ligne in fichier:
            if "clonotype" in ligne:
                header_map = ligne.replace('\n', '').split('\t')
            else :
                tab = AccessedDict()
                for index, data in enumerate(ligne.split('\t')):
                    tab[header_map[index]] = data
                        
                w=Window(1)
                w.d["seg"] = {}

                #w.window=tab["sequence.seq id"] #use sequence id as window for .clntab data
                w.d["id"]=tab["sequence.raw nt seq"] #use sequence as window for .clntab data
                s = int(tab["sequence.size"])
                total_size += s
                w.d["reads"] = [ s ]

                w.d["germline"] = tab["sequence.V-GENE and allele"][:3] #system ...
                
                w.d["sequence"] = tab["sequence.raw nt seq"]
                w.d["seg"]["5"]=tab["sequence.V-GENE and allele"].split('=')[0]
                if (tab["sequence.D-GENE and allele"] != "") :
                    w.d["seg"]["4"]=tab["sequence.D-GENE and allele"].split('=')[0]
                w.d["seg"]["3"]=tab["sequence.J-GENE and allele"].split('=')[0]

                # use sequence.JUNCTION to colorize output (this is not exactly V/J !)
                junction = tab.get("sequence.JUNCTION.raw nt seq")
                position = w.d["sequence"].find(junction)

                if position >= 0:
                    w.d["seg"]["3start"] = position + len(junction)
                    w.d["seg"]["5end"] = position
                else:
                    w.d["seg"]["3start"] = 0
                    w.d["seg"]["5end"] = len(w.d["sequence"])
                w.d["seg"]["3end"]=0
                w.d["seg"]["5start"]=0
                w.d["seg"]["4end"]=0
                w.d["seg"]["4start"]=0
                w.d["seg"]["cdr3"] = tab["sequence.JUNCTION.raw nt seq"][3:-3]

                listw.append((w , w.d["reads"][0]))

                raw_clonotype = tab.get("clonotype")
                w.d["name"]=raw_clonotype # w.d["seg"]["5"] + " -x/y/-z " + w.d["seg"]["3"]

                clonotype = raw_clonotype.split(' ')
                if (len(clonotype) > 1) :
                    listc.append((w, raw_clonotype))
                
                #keep data that has not already been stored
                for header in tab.not_accessed_keys():
                    w.d["_"+header] = [tab[header]]

        #sort by sequence_size
        listw = sorted(listw, key=itemgetter(1), reverse=True)
        #sort by clonotype
        listc = sorted(listc, key=itemgetter(1))
        
        #generate data "top"
        for index in range(len(listw)):
            listw[index][0].d["top"]=index+1
            self.d["clones"].append(listw[index][0])

        self.d["reads"].d["segmented"] = [total_size]
        self.d["reads"].d["total"] = [total_size]

    def toJson(self, obj):
        '''Serializer for json module'''
        if isinstance(obj, ListWindows) or isinstance(obj, Window) or isinstance(obj, Samples) or isinstance(obj, Reads) or isinstance(obj, Diversity):
            result = {}
            for key in obj.d :
                result[key]= obj.d[key]

            return result
            raise TypeError(repr(obj) + " fail !")

        else:
            result = {}
            for key in obj :
                result[key]= obj[key]

            return result
            raise TypeError(repr(obj) + " fail !")

    def toPython(self, obj_dict):
        '''Reverse serializer for json module'''
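        # Dispatch on marker keys: 'samples' -> ListWindows, 'id' -> Window,
        # 'total' -> Reads, 'original_names' -> Samples; anything else stays
        # a plain dict.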
        if "samples" in obj_dict:
            obj = ListWindows()
            obj.d=obj_dict
            return obj

        if "id" in obj_dict:
            obj = Window(1)
            obj.d=obj_dict
            return obj

        if "total" in obj_dict:
            obj = Reads()
            obj.d=obj_dict
            return obj

        if "original_names" in obj_dict:
            obj = Samples()
            obj.d=obj_dict
            return obj

        return obj_dict
        



### some data used for test
w1 = Window(1)
w1.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
w2 = Window(1)
w2.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
w3 = Window(1)
w3.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
w4 = Window(1)
w4.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }


w5 = Window(1)
w5.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
w6 = Window(1)
w6.d ={"id" : "bbb", "reads" : [12], "top" : 2 }

lw1 = ListWindows()
lw1.d["timestamp"] = 'ts'
lw1.d["reads"] = json.loads('{"total": [30], "segmented": [25], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
lw1.d["clones"].append(w5)
lw1.d["clones"].append(w6)
lw1.d["diversity"] = Diversity()


w7 = Window(1)
w7.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
w8 = Window(1)
w8.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }

lw2 = ListWindows()
lw2.d["timestamp"] = 'ts'
lw2.d["reads"] = json.loads('{"total": [40], "segmented": [34], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
lw2.d["clones"].append(w7)
lw2.d["clones"].append(w8)
lw2.d["diversity"] = Diversity()

    
    

 
def main():
    print("#", ' '.join(sys.argv))

    DESCRIPTION = 'Vidjil utility to parse and regroup lists of clones from different timepoints or origins'
    
    #### Argument parser (argparse)

    parser = argparse.ArgumentParser(description= DESCRIPTION,
                                    epilog='''Example:
  python3 %(prog)s --germline IGH out/Diag/vidjil.data out/MRD-1/vidjil.data out/MRD-2/vidjil.data''',
                                    formatter_class=argparse.RawTextHelpFormatter)


    group_options = parser.add_argument_group() # title='Options and parameters')

    group_options.add_argument('--test', action='store_true', help='run self-tests')
    group_options.add_argument('--multi', action='store_true', help='merge different systems from the same timepoint (deprecated, do not use)')

    group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
    group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')
    group_options.add_argument('--ijson', action='store_true', help='use the ijson vidjilparser')

    group_options.add_argument('--output', '-o', type=str, default='fused.vidjil', help='output file (%(default)s)')
    group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)')

    group_options.add_argument('--first', '-f', type=int, default=0, help='take into account only the first FIRST files (0 for all) (%(default)s)')
    parser.add_argument('file', nargs='+', help='''input files (.vidjil/.clntab)''')
  
    args = parser.parse_args()
    # print args

    if args.test:
        import doctest
        doctest.testmod(verbose = True)
        sys.exit(0)

    jlist_fused = None

    print("### fuse.py -- " + DESCRIPTION)
    print()

    files = args.file
    if args.first:
        if len(files) > args.first:
            print("! %d files were given. We take into account the first %d files." % (len(files), args.first))
        files = files[:args.first]

    # filter
    f = []
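    # First pass: collect the ids of clones that are in the top TOP of at
    # least one input file; the second pass below keeps only those clones.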
    if args.ijson:
        from vidjilparser import VidjilParser
        vparser = VidjilParser()
        vparser.addPrefix('clones.item', 'clones.item.top', le, args.top)

    for path_name in files:
        if args.ijson:
            json_clones = vparser.extract(path_name)
            clones = json.loads(json_clones)
            if clones["clones"] is not None:
                f += [c['id'] for c in clones["clones"]]
        else:
            jlist = ListWindows()
            jlist.load(path_name, args.pipeline)
            f += jlist.getTop(args.top)

    f = sorted(set(f))
    
    if args.ijson:
        vparser.reset()
        vparser.addPrefix('')
        vparser.addPrefix('clones.item', 'clones.item.id', (lambda x, y: x in y), f)

    if args.multi:
        for path_name in files:
            jlist = ListWindows()
            if args.ijson:
                json_reads = vparser.extract(path_name)
                jlist.loads(json_reads, args.pipeline)
            else:
                jlist.load(path_name, args.pipeline)
                jlist.build_stat()

            print("\t", jlist, end=' ')

            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused * jlist

            print('\t==> merge to', jlist_fused)
        jlist_fused.d["system_segmented"] = ordered(jlist_fused.d["system_segmented"], key=lambda sys: ((GERMLINES_ORDER + [sys]).index(sys), sys))

    else:
        print("### Read and merge input files")
        for path_name in files:
            jlist = ListWindows()
            if args.ijson:
                json_reads = vparser.extract(path_name)
                jlist.loads(json_reads, args.pipeline)
            else:
                jlist.load(path_name, args.pipeline)
                jlist.build_stat()
                jlist.filter(f)

            w1 = Window(1)
            w2 = Window(2)
            w3 = w1+w2

            print("\t", jlist, end=' ')
            # Merge lists
            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused + jlist

            print('\t==> merge to', jlist_fused)

    if args.compress:
        print()
        print("### Select point names")
        l = jlist_fused.d["samples"].d["original_names"]
        ll = interesting_substrings(l)
        print("  <==", l)
        print("  ==>", ll)
        jlist_fused.d["samples"].d["names"] = ll

    print()
    if not args.multi:
        jlist_fused.cut(args.top, jlist_fused.d["samples"].d["number"])
    print("\t", jlist_fused) 
    print()
    #compute similarity matrix
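    # Write the clone identifiers (windows) to a temporary fasta file and run
    # the external similarity tool on it; skipped when there are too many clones.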
    if len(jlist_fused.d["clones"]) < SIMILARITY_LIMIT :
        fasta = ""
        for i in range(len(jlist_fused.d["clones"])) :
            fasta += ">>" + str(i) + "\n"
            fasta += jlist_fused.d["clones"][i].d["id"] + "\n"
        fasta_file = tempfile.NamedTemporaryFile(mode="w", delete=False)
        fasta_file.write(fasta)
        fasta_file.close()   # flush to disk before the external tool reads the file
        try:
            out = subprocess.check_output([TOOL_SIMILARITY, "-j", fasta_file.name])
            jlist_fused.d["similarity"] = json.loads(out)
        except OSError:
            print("! failed: %s" % TOOL_SIMILARITY)
        finally:
            os.unlink(fasta_file.name)
    else : 
        jlist_fused.d["similarity"] = []
    print("### Save merged file")
    jlist_fused.save_json(args.output)


if __name__ == '__main__':
    main()