Attention une mise à jour du service Gitlab va être effectuée le mardi 30 novembre entre 17h30 et 18h00. Cette mise à jour va générer une interruption du service dont nous ne maîtrisons pas complètement la durée mais qui ne devrait pas excéder quelques minutes. Cette mise à jour intermédiaire en version 14.0.12 nous permettra de rapidement pouvoir mettre à votre disposition une version plus récente.

fuse.py 26.8 KB
Newer Older
Ryan Herbert's avatar
Ryan Herbert committed
1
#!/usr/bin/env python3
2 3 4 5
# -*- coding: utf-8 -*-

### fuse.py - Vidjil utility to parse and regroup list of clones of different timepoints or origins

6 7
#  This file is part of Vidjil <http://www.vidjil.org>,
#  High-throughput Analysis of V(D)J Immune Repertoire.
8
#  Copyright (C) 2011-2017 by Bonsai bioinformatics
9 10 11 12 13
#  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
#  Contributors: 
#      Marc Duez <marc.duez@vidjil.org>
#      Mathieu Giraud <mathieu.giraud@vidjil.org>
#      The Vidjil Team <contact@vidjil.org>
14 15 16 17 18 19 20 21 22 23 24 25 26 27
#
#  "Vidjil" is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  "Vidjil" is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>

Mathieu Giraud's avatar
Mathieu Giraud committed
28
from __future__ import print_function
29

30
import check_python_version
31
import sys
32 33
import json
import argparse
34
import os.path
Mikaël Salson's avatar
Mikaël Salson committed
35
import os
Marc Duez's avatar
Marc Duez committed
36
import datetime
37
import subprocess
38
import tempfile
39
from operator import itemgetter, le
40
from utils import *
41
from defs import *
42
from collections import defaultdict
43

44 45
from vidjilparser import VidjilParser

46
FUSE_VERSION = "vidjil fuse"
47

48
TOOL_SIMILARITY = "../algo/tools/similarity"
49
SIMILARITY_LIMIT = 100
50

51 52
GERMLINES_ORDER = ['TRA', 'TRB', 'TRG', 'TRD', 'DD', 'IGH', 'DHJH', 'IJK', 'IJL'] 

53 54 55
####

class Window:
56
    # Should be renamed "Clone"
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
    '''storage class for sequence informations
    with some function to group sequence informations
    
    >>> str(w1)
    '<window : [5] 3 aaa>'
    
    >>> str(w3)
    '<window : [8] 4 aaa>'
    
    >>> str(w1 + w3)
    '<window : [5, 8] 3 aaa>'
    
    
    check if other information are conserved
    
    >>> (w2 + w4).d["test"]
    [0, 'plop']
    
    '''
    
    ### init Window with the minimum data required
    def __init__(self, size):
        self.d={}
80
        self.d["id"] = ""
Mathieu Giraud's avatar
Mathieu Giraud committed
81
        self.d["top"] = sys.maxsize
82
        self.d["reads"] = []
83
        for i in range(size):
84
            self.d["reads"].append(0)
85 86 87
        
        
    ### 
88 89
    def __iadd__(self, other):
        ### Not used now
90
        """Add other.reads to self.reads in-place, without extending lists"""
91

92 93
        assert(len(self.d['reads']) == len(other.d['reads']))
        self.d['reads'] = [my + her for (my, her) in zip(self.d['reads'], other.d['reads'])]
94 95 96

        return self
        
97
    def __add__(self, other):
98
        """Concat two windows, extending lists such as 'reads'"""
99
        #data we don't need to duplicate
100
        myList = [ "seg", "top", "id", "sequence", "name", "id", "stats", "germline"]
101 102
        obj = Window(1)
        
103 104 105 106
        concatenate_with_padding(obj.d,
                                 self.d, len(self.d["reads"]),
                                 other.d, len(other.d["reads"]),
                                 myList)
107 108
                    
        #keep other data who don't need to be concat
109
        if other.d["top"] < self.d["top"] :
110 111 112 113 114 115 116 117 118 119
            for key in other.d :
                if key in myList :
                    obj.d[key] = other.d[key]
        else :
            for key in self.d :
                if key in myList :
                    obj.d[key] = self.d[key]
        
        return obj
        
120 121
    def get_nb_reads(self, cid, point=0):
        return self[cid]["reads"][point]
122

123
    def latex(self, point=0, base_germline=10000, base=10000, tag=''):
124
        reads = self.d["reads"][point]
125
        ratio_germline = float(reads)/base_germline
126
        ratio = float(reads)/base
127
        return r"   &   & %7d & %5.2f\%% & of %-4s & %5.2f\%% & %-50s \\ %% %4s %s" % (reads, ratio_germline * 100, self.d['germline'], ratio * 100,
128 129
                                                                  self.d["name"] if 'name' in self.d else self.d["id"],
                                                                  tag, self.d["id"])
130

131 132
    ### print essential info about Window
    def __str__(self):
Mathieu Giraud's avatar
Mathieu Giraud committed
133
        return "<window : %s %s %s>" % ( self.d["reads"], '*' if self.d["top"] == sys.maxsize else self.d["top"], self.d["id"])
134
        
Marc Duez's avatar
Marc Duez committed
135
class Samples: 
136

Marc Duez's avatar
Marc Duez committed
137 138
    def __init__(self):
        self.d={}
139
        self.d["number"] = 1
Marc Duez's avatar
Marc Duez committed
140 141 142 143
        self.d["original_names"] = [""]

    def __add__(self, other):
        obj=Samples()
144

145 146 147 148 149
        concatenate_with_padding(obj.d, 
                                 self.d, self.d['number'], 
                                 other.d, other.d['number'],
                                 ['number'])

150
        obj.d["number"] =  int(self.d["number"]) + int(other.d["number"])
Marc Duez's avatar
Marc Duez committed
151 152
        
        return obj
153

154 155 156
    def __str__(self):
        return "<Samples: %s>" % self.d

157 158
class Diversity: 

159 160
    keys = ["index_H_entropy", "index_E_equitability", "index_Ds_diversity"]

161 162 163
    def __init__(self, data=None):
        self.d={}

164 165 166 167 168
        for k in self.keys:
            if data == None or not (k in data):
                self.d[k] = ["na"]
            else:
                self.d[k]= [data[k]]
169

170 171 172
    def __add__(self, other):
        for k in self.keys:
            self.d[k].append(other.d[k][0])
173 174 175 176 177
        return self

    def __str__(self):
        return "<Diversity: %s>" % self.d
        
178 179 180 181 182 183 184
class Reads: 

    def __init__(self):
        self.d={}
        self.d["total"] = ["0"]
        self.d["segmented"] = ["0"]
        self.d["germline"] = {}
185
        self.d['distribution'] = {}
186 187 188

    def __add__(self, other):
        obj=Reads()
189 190 191 192 193

        concatenate_with_padding(obj.d['germline'], 
                                 self.d['germline'], len(self.d['total']),
                                 other.d['germline'], len(other.d['total']),
                                 ['total'])
194 195 196 197
        concatenate_with_padding(obj.d['distribution'],
                                 self.d['distribution'], len(self.d['total']),
                                 other.d['distribution'], len(other.d['total']),
                                 ['total'])
198 199 200 201
                
        obj.d["total"] = self.d["total"] + other.d["total"]
        obj.d["segmented"] = self.d["segmented"] + other.d["segmented"]
        return obj
202 203 204

    def __str__(self):
        return "<Reads: %s>" % self.d
205
        
206 207 208 209
class OtherWindows:
    
    """Aggregate counts of windows that are discarded (due to too small 'top') for each point into several 'others-' windows."""

210 211
    def __init__(self, length, ranges = None):
        self.ranges = ranges if ranges is not None else [1000, 100, 10, 1]
212 213 214 215 216 217 218 219 220 221
        self.length = length
        self.sizes = {}

        for r in self.ranges:
            self.sizes[r] = [0 for i in range(self.length)]


    def __iadd__(self, window):
        """Add to self a window. The different points may land into different 'others-' windows."""

222
        for i, s in enumerate(window.d["reads"]):
223 224 225 226 227
            for r in self.ranges:
                if s >= r:
                     break
            self.sizes[r][i] += s
            ## TODO: add seg_stat
228 229
            
        # print window, '-->', self.sizes
230 231 232 233 234 235 236
        return self

    def __iter__(self):
        """Yield the 'others-' windows"""

        for r in self.ranges:
            w = Window(self.length)
237
            w.d['reads'] = self.sizes[r]
238 239 240
            w.d['window'] = 'others-%d' % r
            w.d['top'] = 0

Mathieu Giraud's avatar
Mathieu Giraud committed
241
            print('  --[others]-->', w)
242
            yield w
243
        
244
class ListWindows(VidjilJson):
245 246 247
    '''storage class for sequences informations 
    
    >>> lw1.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
248
    <ListWindows: [25] 2>
249 250 251 252
    <window : [5] 3 aaa>
    <window : [12] 2 bbb>
    
    >>> lw2.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
253
    <ListWindows: [34] 2>
254 255 256
    <window : [8] 4 aaa>
    <window : [2] 8 ccc>
    
Mathieu Giraud's avatar
Mathieu Giraud committed
257
    >>> lw3 = lw1 + lw2
258
    >>> lw3.info()
Mathieu Giraud's avatar
Mathieu Giraud committed
259
    <ListWindows: [25, 34] 3>
260 261 262
    <window : [12, 0] 2 bbb>
    <window : [5, 8] 3 aaa>
    <window : [0, 2] 8 ccc>
263 264 265 266 267 268 269 270 271 272
    >>> lw3.build_stat()
    >>> lw3.d['reads'].d['distribution']['0.1']
    [17, 8]
    >>> lw3.d['reads'].d['distribution']['0.01']
    [0, 2]
    >>> lw3.d['reads'].d['distribution']['0.001']
    [0, 0]
    >>> lw3.d['reads'].d['distribution']['0.0001']
    [0, 0]

273 274 275 276 277
    '''
    
    def __init__(self):
        '''init ListWindows with the minimum data required'''
        self.d={}
278 279
        self.d["samples"] = Samples()
        self.d["reads"] = Reads()
280 281
        self.d["clones"] = []
        self.d["clusters"] = []
282
        self.d["germlines"] = {}
283
        
284 285 286 287
        self.d["vidjil_json_version"] = VIDJIL_JSON_VERSION
        self.d["producer"] = FUSE_VERSION
        self.d["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        
288
    def __str__(self):
289 290 291 292 293 294 295 296 297 298 299 300 301 302
        return "<ListWindows: %s %d>" % ( self.d["reads"].d["segmented"], len(self) )

    # Iterator and access functions
    def __iter__(self):
        return self.d["clones"].__iter__()

    def __getitem__(self, i):
        ### not efficient !
        for clone in self:
            if clone.d["id"] == i:
                return clone

    def __len__(self):
        return len(self.d["clones"])
303 304 305

    ### print info about each Windows stored 
    def info(self):
Mathieu Giraud's avatar
Mathieu Giraud committed
306
        print(self)
307
        for clone in self:
Mathieu Giraud's avatar
Mathieu Giraud committed
308
            print(clone)
309
        
310 311
    ### compute statistics about clones
    def build_stat(self):
312
        ranges = [.1, .01, .001, .0001, .00001, .000001, .0000001]
313
        result = [[0 for col in range(len(self.d['reads'].d["segmented"]))] for row in range(len(ranges))]
314

315 316
        for clone in self:
            for i, s in enumerate(clone.d["reads"]):
317
                for r in range(len(ranges)):
318
                    if s*1. / self.d['reads'].d['segmented'][i] >= ranges[r]:
319 320 321 322
                        break 
                result[r][i] += s
                
        for r in range(len(ranges)):
323 324
            ratio_in_string = '{0:.10f}'.format(ranges[r]).rstrip('0')
            self.d['reads'].d['distribution'][ratio_in_string] = result[r]
325 326 327 328
            
        #TODO V/D/J distrib and more 
        
        
329 330 331
    ### save / load to .json
    def save_json(self, output):
        '''save ListWindows in .json format''' 
Mathieu Giraud's avatar
Mathieu Giraud committed
332
        print("==>", output)
333 334
        with open(output, "w") as f:
            json.dump(self, f, indent=2, default=self.toJson)
335

336
    def load(self, file_path, *args, **kwargs):
337 338 339 340
        if not '.clntab' in file_path:
            self.load_vidjil(file_path, *args, **kwargs)
        else:
            self.load_clntab(file_path, *args, **kwargs)
341 342 343 344 345 346

    def loads(self, string, *args, **kwargs):
        self.loads_vidjil(string, *args, **kwargs)

    def init_data(self, data):
        self.d=data.d
347 348 349
        # Be robust against 'null' values for clones
        if not self.d["clones"]:
            self.d["clones"] = []
350

351 352 353 354
        if "diversity" in self.d.keys():
            self.d["diversity"] = Diversity(self.d["diversity"])
        else:
            self.d["diversity"] = Diversity()
355
        
356 357
        if 'distribution' not in self.d['reads'].d:
            self.d['reads'].d['distribution'] = {}
358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381

        self.id_lengths = defaultdict(int)

        print("%%")
        for clone in self:
            self.id_lengths[len(clone.d['id'])] += 1
        print ("%% lengths .vidjil -> ", self.id_lengths)
        try:
            print("%% run_v ->", self.d["samples"].d["producer"], self.d["samples"].d["run_timestamp"])
        except KeyError:
            pass

    def load_vidjil(self, file_path, pipeline, verbose=True):
        '''init listWindows with data file
        Detects and selects the parser according to the file extension.'''
        extension = file_path.split('.')[-1]

        if verbose:
            print("<==", file_path, "\t", end=' ')

        with open(file_path, "r") as f:
            self.init_data(json.load(f, object_hook=self.toPython))
            self.check_version(file_path)

382 383 384
        if pipeline: 
            # renaming, private pipeline
            f = '/'.join(file_path.split('/')[2:-1])
385
            if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
386
                print("[%s]" % f)
387 388 389

        else:
            f = file_path
390
            if verbose:
Mathieu Giraud's avatar
Mathieu Giraud committed
391
                print()
392

Marc Duez's avatar
Marc Duez committed
393
        time = os.path.getmtime(file_path)
394
        self.d["samples"].d["timestamp"] = [datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")]
395

396 397 398
    def loads_vidjil(self, string, pipeline, verbose=True):
        '''init listWindows with a json string'''
        self.init_data(json.loads(string, object_hook=self.toPython))
399

Marc Duez's avatar
Marc Duez committed
400 401 402
    def getTop(self, top):
        result = []
        
403 404 405
        for clone in self:
            if clone.d["top"] <= top :
                result.append(clone.d["id"])
Marc Duez's avatar
Marc Duez committed
406 407 408 409 410
        return result
        
    def filter(self, f):
        r = []
        
Marc Duez's avatar
Marc Duez committed
411
        reverseList = {}
412 413
        for i,w in enumerate(self.d["clones"]) :
            reverseList[w.d["id"]] = i
Marc Duez's avatar
Marc Duez committed
414 415 416
        
        #filteredList = { k: reverseList[k] for k in f }
        filteredList = dict(filter(lambda t: t[0] in f, reverseList.items()))
Marc Duez's avatar
Marc Duez committed
417
        
Marc Duez's avatar
Marc Duez committed
418
        for i in filteredList:
419
            r.append(self.d["clones"][filteredList[i]])
Marc Duez's avatar
Marc Duez committed
420
        
421
        self.d["clones"] = r
Marc Duez's avatar
Marc Duez committed
422
        
423 424 425 426
    ### 
    def __add__(self, other): 
        '''Combine two ListWindows into a unique ListWindows'''
        obj = ListWindows()
427 428
        l1 = len(self.d["reads"].d['segmented'])
        l2 = len(other.d["reads"].d['segmented'])
429

430 431 432
        concatenate_with_padding(obj.d, 
                                 self.d, l1,
                                 other.d, l2,
433 434
                                 ["clones", "links", "germlines",
                                  "vidjil_json_version"])
435
        
436
        obj.d["clones"]=self.fuseWindows(self.d["clones"], other.d["clones"], l1, l2)
437 438
        obj.d["samples"] = self.d["samples"] + other.d["samples"]
        obj.d["reads"] = self.d["reads"] + other.d["reads"]
439
        obj.d["diversity"] = self.d["diversity"] + other.d["diversity"]
440
        
441 442
        return obj
        
443 444 445 446 447 448
    ###
    def __mul__(self, other):
        
        for i in range(len(self.d["reads_segmented"])):
            self.d["reads_segmented"][i] += other.d["reads_segmented"][i] 
        
449
        self.d["clones"] += other.d["clones"]
450 451 452 453 454 455
        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
        
        self.d["system_segmented"].update(other.d["system_segmented"])
        
        return self
        
456
    ### 
457
    def fuseWindows(self, w1, w2, l1, l2) :
458
        #store data in dict with "id" as key
459 460
        dico1 = {}
        for i in range(len(w1)) :
461
            dico1[ w1[i].d["id"] ] = w1[i]
462 463 464

        dico2 = {}
        for i in range(len(w2)) :
465
            dico2[ w2[i].d["id"] ] = w2[i]
466

467
        #concat data with the same key ("id")
468 469 470 471 472 473
        #if some data are missing we concat with an empty Windows()
        dico3 = {}
        for key in dico1 :
            if key in dico2 :
                dico3[key] = dico1[key] + dico2[key]
            else :
474
                w=Window(l2)
475 476 477 478
                dico3[key] = dico1[key] + w

        for key in dico2 :
            if key not in dico1 :
479
                w=Window(l1)
480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496
                dico3[key] = w + dico2[key]
        
        
        #sort by top
        tab = []
        for key in dico3 :
            tab.append((dico3[key], dico3[key].d["top"]))
        tab = sorted(tab, key=itemgetter(1))
        
        #store back in a List
        result=[]
        for i in range(len(tab)) :
            result.append(tab[i][0])
        
        return result
        
    
497 498
    def cut(self, limit, nb_points):
        '''Remove information from sequence/windows who never enter in the most represented sequences. Put this information in 'other' windows.'''
499 500

        w=[]
501 502 503

        others = OtherWindows(nb_points)

504 505 506
        for clone in self:
            if (int(clone.d["top"]) < limit or limit == 0) :
                w.append(clone)
507
            #else:
508
                #others += clone
509

510
        self.d["clones"] = w #+ list(others) 
511

Mathieu Giraud's avatar
Mathieu Giraud committed
512
        print("### Cut merged file, keeping window in the top %d for at least one point" % limit)
513 514
        return self
        
515
    def load_clntab(self, file_path, *args, **kwargs):
516 517 518
        '''Parser for .clntab file'''

        self.d["vidjil_json_version"] = [VIDJIL_JSON_VERSION]
519
        self.d["samples"].d["original_names"] = [file_path]
520
        self.d["samples"].d["producer"] = ["EC-NGS central pipeline"]
521 522 523 524 525 526 527 528 529 530 531 532 533 534 535
        
        listw = []
        listc = []
        total_size = 0
        
        fichier = open(file_path,"r")
        for ligne in fichier:
            if "clonotype" in ligne:
                header_map = ligne.replace('\n', '').split('\t')
            else :
                tab = AccessedDict()
                for index, data in enumerate(ligne.split('\t')):
                    tab[header_map[index]] = data
                        
                w=Window(1)
536 537
                w.d["seg"] = {}
                
538
                #w.window=tab["sequence.seq id"] #use sequence id as window for .clntab data
539
                w.d["id"]=tab["sequence.raw nt seq"] #use sequence as window for .clntab data
540 541
                s = int(tab["sequence.size"])
                total_size += s
542
                w.d["reads"] = [ s ]
543

544 545
                w.d["germline"] = tab["sequence.V-GENE and allele"][:3] #system ...
                
546
                w.d["sequence"] = tab["sequence.raw nt seq"]
547
                w.d["seg"]["5"]=tab["sequence.V-GENE and allele"].split('=')[0]
548
                if (tab["sequence.D-GENE and allele"] != "") :
549 550
                    w.d["seg"]["4"]=tab["sequence.D-GENE and allele"].split('=')[0]
                w.d["seg"]["3"]=tab["sequence.J-GENE and allele"].split('=')[0]
551 552 553 554

                # use sequence.JUNCTION to colorize output (this is not exactly V/J !)
                junction = tab.get("sequence.JUNCTION.raw nt seq")
                position = w.d["sequence"].find(junction)
555
                
556
                if position >= 0:
557 558
                    w.d["seg"]["3start"] = position + len(junction)
                    w.d["seg"]["5end"] = position 
559
                else:
560 561 562 563 564 565
                    w.d["seg"]["3start"] = 0
                    w.d["seg"]["5end"] = len(w.d["sequence"])
                w.d["seg"]["3end"]=0
                w.d["seg"]["5start"]=0
                w.d["seg"]["4end"]=0
                w.d["seg"]["4start"]=0
566
                w.d["seg"]["cdr3"] = tab["sequence.JUNCTION.raw nt seq"][3:-3]
567
                    
568
                listw.append((w , w.d["reads"][0]))
569 570

                raw_clonotype = tab.get("clonotype")
571 572
                w.d["name"]=raw_clonotype # w.d["seg"]["5"] + " -x/y/-z " + w.d["seg"]["3"]

573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588
                clonotype = raw_clonotype.split(' ')
                if (len(clonotype) > 1) :
                    listc.append((w, raw_clonotype))
                
                #keep data that has not already been stored
                for header in tab.not_accessed_keys():
                    w.d["_"+header] = [tab[header]]

        #sort by sequence_size
        listw = sorted(listw, key=itemgetter(1), reverse=True)
        #sort by clonotype
        listc = sorted(listc, key=itemgetter(1))
        
        #generate data "top"
        for index in range(len(listw)):
            listw[index][0].d["top"]=index+1
589
            self.d["clones"].append(listw[index][0])
590
        
591 592
        self.d["reads"].d["segmented"] = [total_size]
        self.d["reads"].d["total"] = [total_size]
593 594 595 596

        
    def toJson(self, obj):
        '''Serializer for json module'''
597
        if isinstance(obj, ListWindows) or isinstance(obj, Window) or isinstance(obj, Samples) or isinstance(obj, Reads) or isinstance(obj, Diversity):
598 599 600
            result = {}
            for key in obj.d :
                result[key]= obj.d[key]
Marc Duez's avatar
Marc Duez committed
601
                
602 603
            return result
            raise TypeError(repr(obj) + " fail !") 
604
        
Marc Duez's avatar
Marc Duez committed
605
        else:
606 607 608 609 610 611
            result = {}
            for key in obj :
                result[key]= obj[key]
            
            return result
            raise TypeError(repr(obj) + " fail !") 
612 613 614

    def toPython(self, obj_dict):
        '''Reverse serializer for json module'''
615
        if "samples" in obj_dict:
616
            obj = ListWindows()
617
            obj.d=obj_dict
618 619
            return obj

620
        if "id" in obj_dict:
621
            obj = Window(1)
622
            obj.d=obj_dict
623
            return obj
Marc Duez's avatar
Marc Duez committed
624
        
625 626
        if "total" in obj_dict:
            obj = Reads()
627
            obj.d=obj_dict
628 629
            return obj
            
Marc Duez's avatar
Marc Duez committed
630 631
        if "original_names" in obj_dict:
            obj = Samples()
632
            obj.d=obj_dict
633
            return obj
Marc Duez's avatar
Marc Duez committed
634
            
635
        return obj_dict
636 637 638 639 640 641
        



### some data used for test
w1 = Window(1)
642
w1.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
643
w2 = Window(1)
644
w2.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
645
w3 = Window(1)
646
w3.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
647
w4 = Window(1)
648
w4.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
649 650 651


w5 = Window(1)
652
w5.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
653
w6 = Window(1)
654
w6.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
655 656

lw1 = ListWindows()
657
lw1.d["timestamp"] = 'ts'
658
lw1.d["reads"] = json.loads('{"total": [30], "segmented": [25], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
659 660
lw1.d["clones"].append(w5)
lw1.d["clones"].append(w6)
661 662
lw1.d["diversity"] = Diversity()

663 664

w7 = Window(1)
665
w7.d ={"id" : "aaa", "reads" : [8], "top" : 4 }
666
w8 = Window(1)
667
w8.d ={"id" : "ccc", "reads" : [2], "top" : 8, "test" : ["plop"] }
668 669

lw2 = ListWindows()
670
lw2.d["timestamp"] = 'ts'
671
lw2.d["reads"] = json.loads('{"total": [40], "segmented": [34], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
672 673
lw2.d["clones"].append(w7)
lw2.d["clones"].append(w8)
674
lw2.d["diversity"] = Diversity()
675 676 677 678 679 680

    
    

 
def main():
Mathieu Giraud's avatar
Mathieu Giraud committed
681
    print("#", ' '.join(sys.argv))
682 683 684 685 686 687 688

    DESCRIPTION = 'Vidjil utility to parse and regroup list of clones of different timepoints or origins'
    
    #### Argument parser (argparse)

    parser = argparse.ArgumentParser(description= DESCRIPTION,
                                    epilog='''Example:
Mathieu Giraud's avatar
Mathieu Giraud committed
689
  python2 %(prog)s --germline IGH out/Diag/vidjil.data out/MRD-1/vidjil.data out/MRD-2/vidjil.data''',
690 691 692 693 694 695
                                    formatter_class=argparse.RawTextHelpFormatter)


    group_options = parser.add_argument_group() # title='Options and parameters')

    group_options.add_argument('--test', action='store_true', help='run self-tests')
696
    group_options.add_argument('--multi', action='store_true', help='merge different systems from a same timepoint (deprecated, do not use)')
697
    
698
    group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
699
    group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')
700
    group_options.add_argument('--ijson', action='store_true', help='use the ijson vidjilparser')
701

Marc Duez's avatar
Marc Duez committed
702
    group_options.add_argument('--output', '-o', type=str, default='fused.vidjil', help='output file (%(default)s)')
703 704
    group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)')

705
    group_options.add_argument('--first', '-f', type=int, default=0, help='take only into account the first FIRST files (0 for all) (%(default)s)')
706

707 708 709 710 711 712 713 714 715 716 717 718
    parser.add_argument('file', nargs='+', help='''input files (.vidjil/.cnltab)''')
  
    args = parser.parse_args()
    # print args

    if args.test:
        import doctest
        doctest.testmod(verbose = True)
        sys.exit(0)

    jlist_fused = None

Mathieu Giraud's avatar
Mathieu Giraud committed
719 720
    print("### fuse.py -- " + DESCRIPTION)
    print()
721

722 723 724 725 726 727
    files = args.file
    if args.first:
        if len(files) > args.first:
            print("! %d files were given. We take into account the first %d files." % (len(files), args.first))
        files = files[:args.first]

Marc Duez's avatar
Marc Duez committed
728 729
    #filtre
    f = []
730 731

    vparser = VidjilParser()
732
    vparser.addPrefix('clones.item', 'clones.item.top', le, args.top)
733
    for path_name in files:
734 735 736 737 738 739 740 741 742
        if args.ijson:
            json_clones = vparser.extract(path_name)
            clones = json.loads(json_clones)
            f += [c['id'] for c in clones["clones"]]
        else:
            jlist = ListWindows()
            jlist.load(path_name, args.pipeline)
            f += jlist.getTop(args.top)

Marc Duez's avatar
Marc Duez committed
743 744
    f = sorted(set(f))
    
745 746 747 748 749
    if args.ijson:
        vparser.reset()
        vparser.addPrefix('')
        vparser.addPrefix('clones.item', 'clones.item.id', (lambda x, y: x in y), f)

750
    if args.multi:
751
        for path_name in files:
752
            jlist = ListWindows()
753 754 755 756 757 758
            if args.ijson:
                json_reads = vparser.extract(path_name)
                jlist.loads(json_reads, args.pipeline)
            else:
                jlist.load(path_name, args.pipeline)
                jlist.build_stat()
759
            
Mathieu Giraud's avatar
Mathieu Giraud committed
760
            print("\t", jlist, end=' ')
761 762 763 764 765

            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused * jlist
766
                
Mathieu Giraud's avatar
Mathieu Giraud committed
767
            print('\t==> merge to', jlist_fused)
768
        jlist_fused.d["system_segmented"] = ordered(jlist_fused.d["system_segmented"], key=lambda sys: ((GERMLINES_ORDER + [sys]).index(sys), sys))
769
        
770
    else:
Mathieu Giraud's avatar
Mathieu Giraud committed
771
        print("### Read and merge input files")
772
        for path_name in files:
773
            jlist = ListWindows()
774 775 776 777 778 779 780
            if args.ijson:
                json_reads = vparser.extract(path_name)
                jlist.loads(json_reads, args.pipeline)
            else:
                jlist.load(path_name, args.pipeline)
                jlist.build_stat()
                jlist.filter(f)
Marc Duez's avatar
Marc Duez committed
781

782 783 784 785
            w1 = Window(1)
            w2 = Window(2)
            w3 = w1+w2
            
Mathieu Giraud's avatar
Mathieu Giraud committed
786
            print("\t", jlist, end=' ')
787 788 789 790 791
            # Merge lists
            if jlist_fused is None:
                jlist_fused = jlist
            else:
                jlist_fused = jlist_fused + jlist
792
            
Mathieu Giraud's avatar
Mathieu Giraud committed
793
            print('\t==> merge to', jlist_fused)
794

795
    if args.compress:
Mathieu Giraud's avatar
Mathieu Giraud committed
796 797
        print()
        print("### Select point names")
798
        l = jlist_fused.d["samples"].d["original_names"]
799
        ll = interesting_substrings(l)
Mathieu Giraud's avatar
Mathieu Giraud committed
800 801
        print("  <==", l)
        print("  ==>", ll)
802
        jlist_fused.d["samples"].d["names"] = ll
803
    
Mathieu Giraud's avatar
Mathieu Giraud committed
804
    print()
805
    if not args.multi:
806
        jlist_fused.cut(args.top, jlist_fused.d["samples"].d["number"])
Mathieu Giraud's avatar
Mathieu Giraud committed
807 808
    print("\t", jlist_fused) 
    print()
809

810
    #compute similarity matrix
811 812 813 814 815
    if len(jlist_fused.d["clones"]) < SIMILARITY_LIMIT :
        fasta = ""
        for i in range(len(jlist_fused.d["clones"])) :
            fasta += ">>" + str(i) + "\n"
            fasta += jlist_fused.d["clones"][i].d["id"] + "\n"
Ryan Herbert's avatar
Ryan Herbert committed
816
        fasta_file = tempfile.NamedTemporaryFile(mode="w", delete=False)
817 818 819 820 821 822
        fasta_file.write(fasta)
        try:
            out = subprocess.check_output([TOOL_SIMILARITY, "-j", fasta_file.name])
            jlist_fused.d["similarity"] = json.loads(out)
        except OSError:
            print("! failed: %s" % TOOL_SIMILARITY)
823 824
        finally:
            os.unlink(fasta_file.name)
825 826
    else : 
        jlist_fused.d["similarity"] = [];
827
        
Mathieu Giraud's avatar
Mathieu Giraud committed
828
    print("### Save merged file")
829
    jlist_fused.save_json(args.output)
830
    
831 832 833 834
    
    
if  __name__ =='__main__':
    main()