...
 
Commits (37)
......@@ -3,7 +3,7 @@
"samples" : {
"number" : 1,
"original_names" : [ "/some/file" ] ,
"original_names" : [ "/some/file_1" ] ,
"run_timestamp" : [ "2015-02-19 16:37:06" ] ,
"producer" : [ "vidjil dev 0cf35de (2015-02-17)" ] ,
"log" : [ "Some log" ],
......
......@@ -3,7 +3,7 @@
"samples" : {
"number" : 1,
"original_names" : [ "/some/file" ] ,
"original_names" : [ "/some/file_2" ] ,
"run_timestamp" : [ "2015-02-19 16:37:06" ] ,
"producer" : [ "vidjil dev 0cf35de (2015-02-17)" ] ,
"log" : [ "Some log" ],
......
{
"categories": [{
"cat": "a_category",
"disease": "healthy"
}],
"clones": [ {
"_average_read_length": [
76.0
],
"_coverage": [
0.973684191703796
],
"_coverage_info": [
"74 bp (97% of 76.0 bp)"
],
"germline": "IGK",
"id": "TACTGTCAACAGAGTTACAGTACCCCGTACACTTTTGGCCAGGGGACCAA",
"name": "IGKV1-39*01 3//1 IGKJ2*01",
"reads": [
5653
],
"seg": {
"3": {
"delLeft": 1,
"name": "IGKJ2*01",
"start": 37
},
"5": {
"delRight": 3,
"name": "IGKV1-39*01",
"stop": 36
},
"N": 0,
"affectSigns": {
"seq": "---------------------------- --------------------------",
"start": 1,
"stop": 74
},
"affectValues": {
"seq": "kkkkkkkkkkkkkkkkkkkkkkkkkkkk__________KKKKKKKKKKKKKKKKKKKKKKKKKK",
"start": 1,
"stop": 74
},
"cdr3": {
"aa": "QQSYSTPYT",
"start": 17,
"stop": 43
},
"evalue": {
"val": "1.180765e-43"
},
"evalue_left": {
"val": "1.296443e-47"
},
"evalue_right": {
"val": "1.180635e-43"
},
"junction": {
"aa": "CQQSYSTPYTF",
"productive": true,
"start": 14,
"stop": 46
},
"quality": {
"seq": "!!!!!!!!!!IGIHIIIIIIIIIIIIIIIIIIIIIIIJIIIIIIIJJJJJJJJJJJJIII!!!!!!!!!!!!!!",
"start": 1,
"stop": 74
}
},
"seg_stat": {
"3": 5653
},
"sequence": "TGCAACTTACTACTGTCAACAGAGTTACAGTACCCCGTACACTTTTGGCCAGGGGACCAAGCTGGAGATCAAAC",
"top": 1
},
{
"_average_read_length": [
76.0
],
"_coverage": [
1.0
],
"_coverage_info": [
"76 bp (100% of 76.0 bp)"
],
"germline": "IGK",
"id": "TACTGTCAGCAGTATGGTAGCTCACCGTACACTTTTGGCCAGGGGACCAA",
"name": "IGKV3-20*01 3//1 IGKJ2*01",
"reads": [
3898
],
"seg": {
"3": {
"delLeft": 1,
"name": "IGKJ2*01",
"start": 35
},
"5": {
"delRight": 3,
"name": "IGKV3-20*01",
"stop": 34
},
"N": 0,
"affectSigns": {
"seq": " ---------------------------- ------------------------",
"start": 1,
"stop": 76
},
"affectValues": {
"seq": "____kkkkkkkkkkkkkkkkkkkkkkkkkkkk__________KKKKKKKKKKKKKKKKKKKKKKKK",
"start": 1,
"stop": 76
},
"cdr3": {
"aa": "QQYGSSPYT",
"start": 15,
"stop": 41
},
"evalue": {
"val": "1.075621e-39"
},
"evalue_left": {
"val": "4.476181e-43"
},
"evalue_right": {
"val": "1.075173e-39"
},
"junction": {
"aa": "CQQYGSSPYTF",
"productive": true,
"start": 12,
"stop": 44
},
"quality": {
"seq": "!!!!!!!!IGIIIHGIHHIIIIIIIIJIIIIIIJIIJIJIIIIJJJIJJIJJIJJJII!!!!!!!!!!!!!!!!!!",
"start": 1,
"stop": 76
}
},
"seg_stat": {
"3": 3898
},
"sequence": "CAGTGTATTACTGTCAGCAGTATGGTAGCTCACCGTACACTTTTGGCCAGGGGACCAAGCTGGAGATCAAACGAAC",
"top": 2
},
{
"_average_read_length": [
76.0
],
"_coverage": [
1.0
],
"_coverage_info": [
"76 bp (100% of 76.0 bp)"
],
"germline": "IGK",
"id": "TACTGTCAACAGTATGATAATCTCCCGTACACTTTTGGCCAGGGGACCAA",
"name": "IGKV1-33*01 3//1 IGKJ2*01",
"reads": [
2597
],
"seg": {
"3": {
"delLeft": 1,
"name": "IGKJ2*01",
"start": 39
},
"5": {
"delRight": 3,
"name": "IGKV1-33*01",
"stop": 38
},
"N": 0,
"affectSigns": {
"seq": "---------------------------- ----------------------------",
"start": 1,
"stop": 76
},
"affectValues": {
"seq": "kkkkkkkkkkkkkkkkkkkkkkkkkkkk__________KKKKKKKKKKKKKKKKKKKKKKKKKKKK",
"start": 1,
"stop": 76
},
"cdr3": {
"aa": "QQYDNLPYT",
"start": 19,
"stop": 45
},
"evalue": {
"val": "2.592885e-47"
},
"evalue_left": {
"val": "1.296443e-47"
},
"evalue_right": {
"val": "1.296443e-47"
},
"junction": {
"aa": "CQQYDNLPYTF",
"productive": true,
"start": 16,
"stop": 48
},
"quality": {
"seq": "!!!!!!!!!!!!IIIIIIHHHIHIIJIIIIIIGHHIIIJJJJIJIIJIIJIJJJJJJJIJJJ!!!!!!!!!!!!!!",
"start": 1,
"stop": 76
}
},
"seg_stat": {
"3": 2597
},
"sequence": "ATTGCAACATATTACTGTCAACAGTATGATAATCTCCCGTACACTTTTGGCCAGGGGACCAAGCTGGAGATCAAAC",
"top": 3
},
{
"_average_read_length": [
76.0
],
"_coverage": [
0.973684191703796
],
"_coverage_info": [
"74 bp (97% of 76.0 bp)"
],
"germline": "IGK",
"id": "TACTGTCAGCAATATTATAGTACTCCGTACACTTTTGGCCAGGGGACCAA",
"name": "IGKV4-1*01 3//1 IGKJ2*01",
"reads": [
2520
],
"seg": {
"3": {
"delLeft": 1,
"name": "IGKJ2*01",
"start": 35
},
"5": {
"delRight": 3,
"name": "IGKV4-1*01",
"stop": 34
},
"N": 0,
"affectSigns": {
"seq": " ---------------------------- ------------------------",
"start": 1,
"stop": 74
},
"affectValues": {
"seq": "__kkkkkkkkkkkkkkkkkkkkkkkkkkkk__________KKKKKKKKKKKKKKKKKKKKKKKK",
"start": 1,
"stop": 74
},
"cdr3": {
"aa": "QQYYSTPYT",
"start": 15,
"stop": 41
},
"evalue": {
"val": "1.075178e-39"
},
"evalue_left": {
"val": "5.525986e-45"
},
"evalue_right": {
"val": "1.075173e-39"
},
"junction": {
"aa": "CQQYYSTPYTF",
"productive": true,
"start": 12,
"stop": 44
},
"quality": {
"seq": "!!!!!!!!IHIIIIHIIIJIIIIIIIIIHGIGIJJJIJJJIIJJJJIIJJIJIJJJII!!!!!!!!!!!!!!!!",
"start": 1,
"stop": 74
}
},
"seg_stat": {
"3": 2520
},
"sequence": "CAGTTTATTACTGTCAGCAATATTATAGTACTCCGTACACTTTTGGCCAGGGGACCAAGCTGGAGATCAAACGA",
"top": 4
},
{
"_average_read_length": [
76.0
],
"_coverage": [
1.0
],
"_coverage_info": [
"76 bp (100% of 76.0 bp)"
],
"germline": "IGK",
"id": "TGTCAGCAGTATAATAACTGGCCTCCGCTCACTTTCGGCGGAGGGACCAA",
"name": "IGKV3-15*01 0//0 IGKJ4*01",
"reads": [
2502
],
"seg": {
"3": {
"delLeft": 0,
"name": "IGKJ4*01",
"start": 46
},
"5": {
"delRight": 0,
"name": "IGKV3-15*01",
"stop": 45
},
"N": 0,
"affectSigns": {
"seq": "--------------------- -----------------------------------",
"start": 1,
"stop": 76
},
"affectValues": {
"seq": "kkkkkkkkkkkkkkkkkkkkk__________KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK",
"start": 1,
"stop": 76
},
"cdr3": {
"aa": "QQYNNWPPLT",
"start": 23,
"stop": 52
},
"evalue": {
"val": "9.343769e-34"
},
"evalue_left": {
"val": "9.343769e-34"
},
"evalue_right": {
"val": "1.798806e-61"
},
"junction": {
"aa": "CQQYNNWPPLTF",
"productive": true,
"start": 20,
"stop": 55
},
"quality": {
"seq": "!!!!!!!!!!!!!!!!!!!HIHIIIIIIIJJIIHIIIIIIIIIIIJIIIIIIIIIJIIJJIIIIIIHIH!!!!!!!",
"start": 1,
"stop": 76
}
},
"seg_stat": {
"3": 2502
},
"sequence": "AGATTTTGCAGTTTATTACTGTCAGCAGTATAATAACTGGCCTCCGCTCACTTTCGGCGGAGGGACCAAGGTGGAG",
"top": 5
}
],
"diversity": {
"index_Ds_diversity": 0.999757289886475,
"index_E_equitability": 0.710613250732422,
"index_H_entropy": 9.92569569544503
},
"germlines": {
"custom": {
"3": [],
"4": [],
"5": [],
"shortcut": "X"
},
"ref": "http://www.vidjil.org/germlines/germline-49.tar.gz",
"species": "Homo sapiens",
"species_taxon_id": 9606
},
"reads": {
"germline": {
"IGH": [
190947
],
"IGH+": [
78925
],
"IGK": [
657338
],
"IGK+": [
0
],
"IGL": [
235104
],
"TRA": [
0
],
"TRA+D": [
0
],
"TRB": [
2
],
"TRB+": [
0
],
"TRD": [
0
],
"TRD+": [
0
],
"TRG": [
0
],
"unexpected": [
2168
]
},
"segmented": [
1164484
],
"total": [
105515154
]
},
"samples": {
"commandline": [
"vidjil-algo -o result/ -c clones -3 -z 100 -r 1 -ggermline/homo-sapiens.g -e 1 -2 -d -w 50 /mnt/data/sequence_file.fastq.gz "
],
"log": [
" ==> junction detected in 1164484 reads (1.1%)\n ==> found 94208 windows in 1164484 reads (1.1% of 105515154 reads)\n ! There are not so many CDR3 windows found in this set of reads.\n ! Please check the unsegmentation causes below and refer to the documentation.\n reads av. len clones clo/rds\n IGH -> 190947 76.0 20065 0.105\n IGH+ -> 78925 76.0 10363 0.131\n IGK -> 657338 76.0 43921 0.067\n IGK+ -> 0 - 0 -\n IGL -> 235104 76.0 19478 0.083\n TRA -> 0 - 0 -\n TRA+D -> 0 - 0 -\n TRB -> 2 76.0 1 0.500\n TRB+ -> 0 - 0 -\n TRD -> 0 - 0 -\n TRD+ -> 0 - 0 -\n TRG -> 0 - 0 -\n unexpected -> 2168 76.0 380 0.175\n\n SEG -> 1164484 76.0\n SEG_+ -> 2858 76.0\n SEG_- -> 1161626 76.0\n SEG changed w -> 412847 76.0\n\n UNSEG too short -> 0 -\n UNSEG strand -> 395256 76.0\n UNSEG too few V/J -> 91046588 76.0\n UNSEG only V/5' -> 11147607 76.0\n UNSEG only J/3' -> 1760542 76.0\n UNSEG < delta_min -> 0 -\n UNSEG ambiguous -> 677 76.0\n UNSEG too short w -> 0 -\n"
],
"number": 1,
"original_names": [
"sequence_file"
],
"producer": [
"vidjil-algo 2018.02"
],
"run_timestamp": [
"2019-01-01 01:01:01"
]
},
"similarity": [
],
"vidjil_json_version": "2016b",
"warn": [
{
"code": "W20",
"msg": "Very few V(D)J recombinations found: 1.10%"
}
]
}
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
### fuse.py - Vidjil utility to parse and regroup list of clones of different timepoints or origins
# This file is part of Vidjil <http://www.vidjil.org>,
# High-throughput Analysis of V(D)J Immune Repertoire.
# Copyright (C) 2011-2017 by Bonsai bioinformatics
# at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
# Contributors:
# Marc Duez <marc.duez@vidjil.org>
# Mathieu Giraud <mathieu.giraud@vidjil.org>
# The Vidjil Team <contact@vidjil.org>
#
# "Vidjil" is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# "Vidjil" is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
from __future__ import print_function
import check_python_version
import sys
import json
import argparse
import os.path
import os
import datetime
import subprocess
import tempfile
from operator import itemgetter, le
from utils import *
from defs import *
from collections import defaultdict
from pipes import quote
# ==================
from fuse import *
# ==================
class Tree(defaultdict):
""" une classe qui permet de descendre sur de nombreux niveaux (defaultdict recursif)"""
def __init__(self, value=None):
super(Tree, self).__init__(Tree)
self.value = value
# On peut reprendre le Window de fuse, mais il faudra l'augmenter d'une fct get_value
# Cette fonction permettra pour un champs donne de retrouver la valeur specifique a ce clone
class Clone(Window):
# def __init__(self, data):
# """ un objet pour manipuler les champs d'un clone """
# return
def get_values(self, value):
""" une liste pour recuperer la valeur d'un champs du clone """
try:
if value == 'reagmt':
return clone. ...
if clone == "seg5":
return clone.d["seg"]["5"]["name"]
...
except:
return ""
return
def get_reads(self, t=-1):
""" une fonction direct """
if t== -1:
return self.d["reads"]
else
return self.d["reads"][t]
### Je pense que distribution devrait pouvoir être autonome, dans passer par fuse
#
class Distribution(VidjilJson):
""" on garde la fct originale """
def __init__(self, fileIn):
'''init ListWindows with the minimum data required'''
super(VidjilJson, self).__init__(VidjilJson)
self.distributions = {}
self.load(fileIn)
def cut_at_top(self, top):
if top != 0:
self.clones = self.clones[:top]
return
def load(self, file_path, verbose=True):
'''init listWindows with data file
Detects and selects the parser according to the file extension.'''
extension = file_path.split('.')[-1]
if verbose:
print("<==", file_path, "\t", end=' ')
with open(file_path, "r") as f:
self.d(json.load(f))
### On pourrait aussi imaginer prendre directement un jlist depuis fuse
return
def init_distrib(self, list_distrib):
""" on creer la liste des distributions avec des valeurs nulles pour le moment """
distributions = []
nb_sample = self.d["samples"].d["number"]...
for distrib in list_distrib:
### ICI le defaultdict devra être de la profondeur du nombre d'axes (soit la len de distrib)
distributions.append([{"axes":distrib,"values":[Tree([0, 0])] * nb_sample}])
return
def compute_distribution(self, list_distrib):
"""
list_distrib sera une listes des combinaisons d'axes a calculer
"""
# creer une liste de distribs
# ATTENTION, on peut avoir plusieurs fichier deja fuse
for filename in self.sample...:
self.distributions[filename] = self.init_distrib(list_distrib)
# passer la liste des clones en revue
for clone in self.clones:
for distrib_pos in range(self.distribs):
distrib = list_distrib[distrib_pos]
clone_values = clone.get_values( distrib.axes )
# list de valeur, pour les axes
self.add_clone_values( filename, distrib_pos, clone_values, clone["reads"])
return
def add_clone_values(self, filename, distrib_pos, values, nb_reads):
# ici on ajoute la valeur souhaité; peut-être de manière recursive au cas ou
self.distributions[filename][distrib_pos]...
i = 0
# Je ne pense pas que cette approche marche en python
obj = self.distributions[filename][distrib_pos]["values"]
while i != len(values):
obj = obj[values[i]]
i += 1
# incremente
# Pas certain que ça marche. A voir.
obj[0] += 1
obj[1] += nb_reads
return
if __name__ =='__main__':
DESCRIPTION = 'Vidjil utility to create distributions'
#### Argument parser (argparse)
parser = argparse.ArgumentParser(description= DESCRIPTION,
epilog='''Example:
python %(prog)s out/data1.vidjil out/data2.vidjil''',
formatter_class=argparse.RawTextHelpFormatter)
group_options = parser.add_argument_group() # title='Options and parameters')
group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)')
parser.add_argument('file', nargs='+', help='''input files (.vidjil/.cnltab)''')
args = parser.parse_args()
print("### fuse.py -- %s\n" % DESCRIPTION)
LISTE_D = []
## pouvoir lire un fichier de paramètres ?
#envoyer ça par cli ?
LISTE_D.append(["reagmt", "lenCDR3"])
LISTE_D.append(["reagmt", "productive"])
LISTE_D.append(["reagmt", "seg5"])
LISTE_D.append(["reagmt", "seg4"])
LISTE_D.append(["reagmt", "seg3"])
LISTE_D.append(["seg5", "seg3"])
files = args.file
jlist = Distribution()
for path_name in files:
jlist.load(path_name, args.pipeline)
jlist.cut_at_top(args.top)
jlist.get_distrib(LISTE_D)
# Perrmettre ensuite de faire un export json ?
# Un seul esport pour tous les fichiers ? Ou bien un par fichier ?
# Les deux me semble logique.
# On peux imaginer que chaque fichier .vidjil soit accompagné d'un fichier de metadata et de distributions qui lui soit propre.
#
......@@ -71,6 +71,22 @@ class Window:
>>> (w2 + w4).d["test"]
[0, 'plop']
>>> w1.get_values("name")
'not_segmented'
>>> w1.get_values("top")
3
>>> w7 = Window(1)
>>> w7.d = seg_w7
>>> w7.d["seg"]["5"]["name"]
'TRAV17*01'
>>> w7.get_values("name")
'TRAV17*01 0/CCGGGG/5 TRAJ40*01'
>>> w7.get_values("seg5")
'TRAV17*01'
'''
### init Window with the minimum data required
......@@ -123,6 +139,171 @@ class Window:
def get_nb_reads(self, cid, point=0):
return self[cid]["reads"][point]
def get_reads(self, t=-1):
if t== -1:
return self.d["reads"]
else:
return self.d["reads"][t]
def get_axes_values(self, axes):
""" une liste pour recuperer la valeur d'un champs du clone """
values = []
for axe in axes:
values.append(self.get_values(axe))
return values
def get_values(self, axe):
try:
if axe == "top":
return self.d["top"]
if axe == "germline" or axe == "reagmt":
return self.d["germline"]
# all other values need seg
try:
self.d["seg"]
except:
return "not_segmented"
if axe == "name":
return self.d["name"]
if axe == "seg5":
return self.d["seg"]["5"]["name"]
if axe == "seg4":
return self.d["seg"]["4"]["name"]
if axe == "seg4a":
return self.d["seg"]["4a"]["name"]
if axe == "seg4b":
return self.d["seg"]["4b"]["name"]
if axe == "seg3":
return self.d["seg"]["3"]["name"]
if axe == "lenSeq":
return len(self.d["sequence"])
if axe == "evalue":
return self.d["evalue"]["val"]
if axe == "seg5_delRight":
return self.d["seg"]["5"]["delRight"]
if axe == "seg3_delLeft":
return self.d["seg"]["3"]["delLeft"]
if axe == "seg4_delRight":
return self.d["seg"]["4"]["delRight"]
if axe == "seg3_delLeft":
return self.d["seg"]["4"]["delLeft"]
### length
if axe == "insert_53":
return self.get_values("seg3_start") - self.get_values("seg5_stop") -1
if axe == "insert_54":
return self.get_values("seg4_start") - self.get_values("seg5_stop") -1
if axe == "insert_43":
return self.get_values("seg3_start") - self.get_values("seg4_stop") -1
if axe == "lenCDR3":
return self.get_values("cdr3_stop") - self.get_values("cdr3_start")
if axe == "lenJunction":
return self.get_values("junction_stop") - self.get_values("junction_start")
if axe == "GCContent":
return self.getGCcontent()
# FIXME: need to get the rigth position value
# If the clone have multiple sample, need to select the current.
# Should not apply if executed from the vidjil server
if axe == "lenSeqConsensus":
return self.d["_average_read_length"][0]/ self.get_values("lenSeq")
if axe == "lenSeqAverage":
return self.d["_average_read_length"][0]
if axe == "coverage":
return round(self.d["_coverage"][0]*100, 2)
### Positions
if axe == "seg5_stop":
return self.d["seg"]["5"]["stop"]
if axe == "seg3_start":
return self.d["seg"]["3"]["start"]
if axe == "seg4_stop":
return self.d["seg"]["4"]["stop"]
if axe == "seg4_start":
return self.d["seg"]["4"]["start"]
if axe == "cdr3_stop":
return self.d["seg"]["cdr3"]["stop"]
if axe == "cdr3_start":
return self.d["seg"]["cdr3"]["start"]
if axe == "junction_stop":
return self.d["seg"]["junction"]["stop"]
if axe == "junction_start":
return self.d["seg"]["junction"]["start"]
if axe == "productive":
return self.d["seg"]["junction"]["productive"]
if axe == "complete":
germline = self.get_values("germline")
if "+" in germline:
return False
elif germline in ["unexpected", "IKZF1", "ERG"]:
# maybe more specific later ?
return "unexpected"
elif germline in ["IGH","IGK","IGL","TRA","TRB","TRD","TRG"]:
# List can be set if new complete germline is defined
return True
else:
return "unexpected"
if axe == "rearangment":
return self.get_reagmt()
return "todo"
except:
return "?"
def getGCcontent(self):
if len(self.d["sequence"]) == 0:
return "?"
gc = 0
for nt in self.d["sequence"]:
if nt in "GgCc":
gc += 1
GCContent = gc / len(self.d["sequence"])
# return round for better association of clones
return round( GCContent, 3)
def get_reagmt( self):
""" hard coded function to define the rearangement type of this clone """
complete = self.get_values("complete")
seg_vals = {
"5": self.get_values("seg5"),
"4a": self.get_values("seg4a"),
"4": self.get_values("seg4"),
"4b": self.get_values("seg4b"),
"3": self.get_values("seg3")
}
if complete == True:
if seg_vals["4a"] == "?" and seg_vals["4"] == "?" and seg_vals["4b"] == "?":
return "VJ"
elif seg_vals["4a"] == "?" and seg_vals["4b"] == "?":
return "VDJ"
elif seg_vals["4a"] == "?" or seg_vals["4b"] == "?":
return "VDDJ"
else:
return "VDDDJ"
elif complete == False:
# cas compliqué, DJ etant un 53, DD aussi et VD aussi
if "V" in seg_vals["5"] and "D" in seg_vals["3"]:
return "VD"
elif "D" in seg_vals["5"] and "J" in seg_vals["3"]:
return "DJ"
elif "D" in seg_vals["5"] and "D" in seg_vals["3"]:
return "DD"
elif complete == "unexpected":
return "unexpected"
else:
return "?"
def latex(self, point=0, base_germline=10000, base=10000, tag=''):
reads = self.d["reads"][point]
ratio_germline = float(reads)/base_germline
......@@ -134,7 +315,7 @@ class Window:
### print essential info about Window
def __str__(self):
return "<window : %s %s %s>" % ( self.d["reads"], '*' if self.d["top"] == sys.maxsize else self.d["top"], self.d["id"])
class Samples:
def __init__(self):
......@@ -243,7 +424,61 @@ class OtherWindows:
print(' --[others]-->', w)
yield w
class Categories:
''' Object to manipulate categories inside vidjil files '''
def __init__(self, categories=[], filename=[]):
if len(categories) != len(filename):
print("###### Attention, categories et filename n'ont pas la meme taille [%s]" % type(self))
self.d = categories
self.fname = list(set(filename))
def __add__(self, other):
obj = self
for cats in other.d:
self.d.append(cats)
for fname in other.fname:
self.fname.append(fname)
return obj
def get_set_cats(self):
# faire la liste de toutes les clefs
list_cats_keys = []
for filepos in range(len(self.d)):
for key in self.get_cats_of_file(filepos).keys():
list_cats_keys.append( key )
return set( list_cats_keys )
def get_aggreg(self):
filelist = self.fname
if filelist == [] or len(self.d) != len(self.fname) :
filelist = range(len(self.d))
categories = defaultdict(lambda: defaultdict(lambda:[]))
list_cats = self.get_set_cats()
for filepos in range(len(self.d)):
filename = filelist[filepos]
for cat in list_cats:
categories[cat][self.get_value(cat, filepos)].append(filename)
return categories
def get_cats_of_file(self, filepos):
return self.d[filepos]
def get_value(self, category, filepos):
cats = self.get_cats_of_file(filepos)
try:
return cats[category]
except:
return "undefined"
def __str__(self):
return "%s: %s" % (str(self.fname), str(self.d))
class ListWindows(VidjilJson):
'''storage class for sequences informations
......@@ -278,8 +513,10 @@ class ListWindows(VidjilJson):
def __init__(self):
'''init ListWindows with the minimum data required'''
self.d={}
self.d["samples"] = Samples()
self.d["reads"] = Reads()
self.d["samples"] = Samples()
self.d["reads"] = Reads()
self.d["categories"] = Categories()
self.d["clones"] = []
self.d["clusters"] = []
self.d["germlines"] = {}
......@@ -360,10 +597,18 @@ class ListWindows(VidjilJson):
self.d['reads'].d['distribution'] = {}
self.id_lengths = defaultdict(int)
try:
data.d["categories"]
except:
data.d["categories"] = [{}]
self.d["categories"] = Categories(data.d["categories"], data.d["samples"].d["original_names"])
print("%%")
for clone in self:
self.id_lengths[len(clone.d['id'])] += 1
print ("%% lengths .vidjil -> ", self.id_lengths)
try:
print("%% run_v ->", self.d["samples"].d["producer"], self.d["samples"].d["run_timestamp"])
......@@ -376,7 +621,7 @@ class ListWindows(VidjilJson):
extension = file_path.split('.')[-1]
if verbose:
print("<==", file_path, "\t", end=' ')
print("<==", file_path, "\t")
with open(file_path, "r") as f:
self.init_data(json.load(f, object_hook=self.toPython))
......@@ -436,11 +681,32 @@ class ListWindows(VidjilJson):
["clones", "links", "germlines",
"vidjil_json_version"])
obj.d["clones"]=self.fuseWindows(self.d["clones"], other.d["clones"], l1, l2)
obj.d["samples"] = self.d["samples"] + other.d["samples"]
obj.d["reads"] = self.d["reads"] + other.d["reads"]
obj.d["clones"] = self.fuseWindows(self.d["clones"], other.d["clones"], l1, l2)
obj.d["samples"] = self.d["samples"] + other.d["samples"]
obj.d["reads"] = self.d["reads"] + other.d["reads"]
obj.d["diversity"] = self.d["diversity"] + other.d["diversity"]
obj.d["categories"] = self.d["categories"] + other.d["categories"]
try:
# verify that same file is not present twice
filename_jlist1 = list(self.d["distributions"]["repertoires"].keys())
filename_jlist2 = list(other.d["distributions"]["repertoires"].keys())
for filename in filename_jlist1:
if filename in filename_jlist2:
raise( "Error, duplicate file name (in distributions) ")
obj.d["distributions"] = {}
obj.d["distributions"]["repertoires"] = {}
obj.d["distributions"]["keys"] = ["clones", "reads"]
obj.d["distributions"]["filters"] = {}
obj.d["distributions"]["categories"] = {}
for filename in filename_jlist1:
obj.d["distributions"]["repertoires"][filename] = self.d["distributions"]["repertoires"][filename]
for filename in filename_jlist2:
obj.d["distributions"]["repertoires"][filename] = other.d["distributions"]["repertoires"][filename]
except:
pass
return obj
###
......@@ -599,12 +865,15 @@ class ListWindows(VidjilJson):
'''Serializer for json module'''
if isinstance(obj, ListWindows) or isinstance(obj, Window) or isinstance(obj, Samples) or isinstance(obj, Reads) or isinstance(obj, Diversity):
result = {}
for key in obj.d :
result[key]= obj.d[key]
for key in obj.d:
result[key] = obj.d[key]
return result
raise TypeError(repr(obj) + " fail !")
elif isinstance(obj, Categories):
return obj.d
else:
result = {}
for key in obj :
......@@ -637,6 +906,75 @@ class ListWindows(VidjilJson):
return obj_dict
def get_filename_pos(self, filename):
""" filename is the key of distributions repertoires """
return self.d["samples"].d["original_names"].index(filename)
# ========================= #
# Distributions computing #
# ========================= #
def init_distrib(self, list_distrib):
""" Create distributions structures """
self.d["distributions"] = {}
self.d["distributions"]["repertoires"] = defaultdict(lambda:[])
self.d["distributions"]["keys"] = ["clones", "reads"]
self.d["distributions"]["filters"] = []
self.d["distributions"]["categories"] = self.d["categories"].get_aggreg()
# Warning, will fail of there are same original_names; fixme
nb_sample = self.d["samples"].d["number"]
# prefill distribution for each file of jlist
for filename in self.d["samples"].d["original_names"]:
for distrib in list_distrib:
self.d["distributions"]["repertoires"][filename].append({"axes":distrib,"values":defaultdict(lambda: False)})
return
def compute_distribution(self, list_distrib):
""" list_distrib is a list of distrib to compute """
for filename in self.d["samples"].d["original_names"]:
for clone in self.d["clones"]:
for distrib_pos in range( len(self.d["distributions"]["repertoires"][filename]) ):
distrib = list_distrib[distrib_pos]
clone_values = clone.get_axes_values( distrib )
nb_reads = clone.get_reads(self.get_filename_pos(filename))
self.add_clone_values( filename, distrib_pos, clone_values, nb_reads)
return
def add_clone_values(self, filename, distrib_pos, values, nb_reads):
""" Add value of a clone to a specific distribution """
obj = self.d["distributions"]["repertoires"][filename][distrib_pos]["values"]
obj = self.recursive_add(obj, values, nb_reads) ## Add by recursion
self.d["distributions"]["repertoires"][filename][distrib_pos]["values"] = obj
return
def recursive_add( self, obj, values, nb_reads):
""" Add by recusivity the newer value to existing value; return the obj """
if len(values) > 1:
# Should increase for one depth
nvalues = values[1:]
obj[values[0]] = self.recursive_add(obj[values[0]], nvalues, nb_reads)
else:
# last depth
if not obj:
obj = defaultdict(lambda: False)
if not obj[values[0]]:
obj[values[0]] = [0, 0]
obj[values[0]][0] += 1
obj[values[0]][1] += nb_reads
return obj
def save_distributions(self, foname):
# Compute aggreg just before export
self.d["distributions"]["categories"] = self.d["categories"].get_aggreg()
fo = open(foname, "w")
json.dump(self.d["distributions"], fo, indent=2)
fo.close()
return
......@@ -656,6 +994,35 @@ w5.d ={"id" : "aaa", "reads" : [5], "top" : 3 }
w6 = Window(1)
w6.d ={"id" : "bbb", "reads" : [12], "top" : 2 }
seg_w7 = {
"germline": "TRA",
"id": "TTCTTACTTCTGTGCTACGGACGCCGGGGCTCAGGAACCTACAAATACAT",
"name": "TRAV17*01 0/CCGGGG/5 TRAJ40*01",
"reads": [
16
],
"seg": {
"3": {
"delLeft": 5,
"name": "TRAJ40*01",
"start": 112
},
"5": {
"delRight": 0,
"name": "TRAV17*01",
"stop": 105
},
"N": 6,
"cdr3": {
"aa": "ATDAG#SGTYKYI",
"start": 96,
"stop": 133
}
},
"sequence": "GTGGAAGATTAAGAGTCACGCTTGACACTTCCAAGAAAAGCAGTTCCTTGTTGATCACGGCTTCCCGGGCAGCAGACACTGCTTCTTACTTCTGTGCTACGGACGCCGGGGCTCAGGAACCTACAAATACATCTTTGGAACAG",
"top": 26
}
lw1 = ListWindows()
lw1.d["timestamp"] = 'ts'
lw1.d["reads"] = json.loads('{"total": [30], "segmented": [25], "germline": {}, "distribution": {}}', object_hook=lw1.toPython)
......@@ -715,6 +1082,8 @@ def main():
group_options.add_argument('--test', action='store_true', help='run self-tests')
group_options.add_argument('--multi', action='store_true', help='merge different systems from a same timepoint (deprecated, do not use)')
group_options.add_argument('--distributions', '-d', action='store_true', help='compute distributions')
group_options.add_argument('--only_disributions', '-D', action='store_true', help='export only distributions')
group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')
......@@ -739,6 +1108,24 @@ def main():
jlist_fused = None
LISTE_D = []
LIST_AXES = ["germline", # "top", # "name"
"seg5", "seg4", "seg3",
"lenSeqConsensus", "lenSeqAverage", "GCContent", "coverage",
"lenSeq", # "evalue", l'arrondir ?
"seg5_delRight", "seg3_delLeft", "seg4_delRight", "seg3_delLeft",
"insert_53", "insert_54", "insert_43",
#"seg5_stop", "seg3_start", "seg4_stop", "seg4_start",
"lenCDR3", # "cdr3_stop", "cdr3_start",
"productive", #"junction_start", "junction_stop",
"rearangment", "complete",
]
for axe1 in LIST_AXES:
LISTE_D.append([axe1])
for axe2 in LIST_AXES:
if axe1 != axe2:
LISTE_D.append([axe1, axe2])
print("### fuse.py -- " + DESCRIPTION)
print()
......@@ -774,7 +1161,6 @@ def main():
jlist = ListWindows()
jlist.load(path_name, args.pipeline)
f += jlist.getTop(args.top)
f = sorted(set(f))
if args.ijson:
......@@ -809,8 +1195,14 @@ def main():
if args.ijson:
json_reads = vparser.extract(path_name)
jlist.loads(json_reads, args.pipeline)
if args.distributions or args.only_disributions:
jlist.init_distrib(LISTE_D)
jlist.compute_distribution(LISTE_D)
else:
jlist.load(path_name, args.pipeline)
if args.distributions or args.only_disributions:
jlist.init_distrib(LISTE_D)
jlist.compute_distribution(LISTE_D)
jlist.build_stat()
jlist.filter(f)
......@@ -859,6 +1251,12 @@ def main():
os.unlink(fasta_file.name)
else :
jlist_fused.d["similarity"] = [];
if args.only_disributions:
print("### Save distributions file")
jlist_fused.save_distributions(args.output.replace(".vidjil", ".json"))
return
print("### Save merged file")
jlist_fused.save_json(args.output)
......
### Without distributions computing
python3 ../../fuse.py ../../../algo/tests/data/results-two-clones-1-2.vidjil ../../../algo/tests/data/results-two-clones-1-3.vidjil; cat fused.vidjil
$ Get correct file name in original_names field
1:"/some/file_1"
1:"/some/file_2"
$ should not have distribution if not asked
0:distributions
$ categories should be present one time ... ; even if no vidjil file already have it
1:categories
$ ... with empty content (2 for categories +1 for germlines)
3:{}
### With distributions computing
rm fused.vidjil
python3 ../../fuse.py -d ../../../algo/tests/data/results-two-clones-1-2.vidjil ../../../algo/tests/data/results-two-clones-1-3.vidjil; cat fused.vidjil
$ Get correct file name in original_names field and distributions
2:"/some/file_1"
2:"/some/file_2"
$ should not have distribution if not asked
1:distributions
$ categories should be present 2 times; in vidjil top level, and in distribution["categories"]
2:categories
$ Axes title should be present 62 times (2*31)
78:rearangment
\ No newline at end of file
###################################
### Part with unsegmented clones ##
###################################
# python3 ../../fuse.py -D ../../../algo/tests/data/nico_very_short.vidjil ../../../algo/tests/data/results-two-clones-1-2.vidjil; cat fused.json
python3 ../../fuse.py -D ../../../algo/tests/data/results-two-clones-1-2.vidjil ../../../algo/tests/data/results-two-clones-1-3.vidjil; cat fused.json
$ Get correct keys for distributions json content
1:"repertoires"
1:"keys"
1:"categories"
1:"filters"
$ Get correct files names
r:/some/file_[12]
# For the moment, 31 call by repertoire
$ Have correct number of entrie for seg5
78:"seg5"
$ Have correct number of entrie for germline
78:germline
############################
### Part with real clones ##
############################
rm fused.json
python3 ../../fuse.py -D ../../../algo/tests/data/results_five_segmented_clones.vidjil ../../../algo/tests/data/results-two-clones-1-2.vidjil ../../../algo/tests/data/results-two-clones-1-3.vidjil; cat fused.json
$ Get correct keys for distributions json content
1:"repertoires"
1:"keys"
1:"categories"
1:"filters"
# For the moment, 31 call by repertoire
$ Have correct number of entrie for seg5 (increase of 31 by the new repertoire)
117:"seg5"
$ Get correct files names
:sequence_file
$ Test on categories -undefined
2:undefined
$ Test on categories - "cat"
1:"cat"
$ Test on categories - disease
1:disease
$ Test on categories - a_category
1:a_category
$ EAch filename should appear 3 times (1 for repname, 1 for both category)
3:"sequence_file"
3:"/some/file_1"
3:"/some/file_2"
\ No newline at end of file