Commit 38756b55 authored by Mathieu Giraud's avatar Mathieu Giraud

Merge branch 'feature-t/3944-pouvoir-calculer-des-distributions-2' into 'dev'

Feature t/3944 pouvoir calculer des distributions 2

Closes #3944

See merge request !504
parents f7cac1de eea5d6c4
Pipeline #93618 failed with stages
in 18 minutes and 10 seconds
......@@ -3,7 +3,7 @@
"samples" : {
"number" : 1,
"original_names" : [ "/some/file" ] ,
"original_names" : [ "/some/file_1" ] ,
"run_timestamp" : [ "2015-02-19 16:37:06" ] ,
"producer" : [ "vidjil dev 0cf35de (2015-02-17)" ] ,
"log" : [ "Some log" ],
......
......@@ -3,7 +3,7 @@
"samples" : {
"number" : 1,
"original_names" : [ "/some/file" ] ,
"original_names" : [ "/some/file_2" ] ,
"run_timestamp" : [ "2015-02-19 16:37:06" ] ,
"producer" : [ "vidjil dev 0cf35de (2015-02-17)" ] ,
"log" : [ "Some log" ],
......
{
"categories": [{
"cat": "a_category",
"disease": "healthy"
}],
"clones": [ {
"_average_read_length": [
76.0
],
"_coverage": [
0.973684191703796
],
"_coverage_info": [
"74 bp (97% of 76.0 bp)"
],
"germline": "IGK",
"id": "TACTGTCAACAGAGTTACAGTACCCCGTACACTTTTGGCCAGGGGACCAA",
"name": "IGKV1-39*01 3//1 IGKJ2*01",
"reads": [
5653
],
"seg": {
"3": {
"delLeft": 1,
"name": "IGKJ2*01",
"start": 37
},
"5": {
"delRight": 3,
"name": "IGKV1-39*01",
"stop": 36
},
"N": 0,
"affectSigns": {
"seq": "---------------------------- --------------------------",
"start": 1,
"stop": 74
},
"affectValues": {
"seq": "kkkkkkkkkkkkkkkkkkkkkkkkkkkk__________KKKKKKKKKKKKKKKKKKKKKKKKKK",
"start": 1,
"stop": 74
},
"cdr3": {
"aa": "QQSYSTPYT",
"start": 17,
"stop": 43
},
"evalue": {
"val": "1.180765e-43"
},
"evalue_left": {
"val": "1.296443e-47"
},
"evalue_right": {
"val": "1.180635e-43"
},
"junction": {
"aa": "CQQSYSTPYTF",
"productive": true,
"start": 14,
"stop": 46
},
"quality": {
"seq": "!!!!!!!!!!IGIHIIIIIIIIIIIIIIIIIIIIIIIJIIIIIIIJJJJJJJJJJJJIII!!!!!!!!!!!!!!",
"start": 1,
"stop": 74
}
},
"seg_stat": {
"3": 5653
},
"sequence": "TGCAACTTACTACTGTCAACAGAGTTACAGTACCCCGTACACTTTTGGCCAGGGGACCAAGCTGGAGATCAAAC",
"top": 1
},
{
"_average_read_length": [
76.0
],
"_coverage": [
1.0
],
"_coverage_info": [
"76 bp (100% of 76.0 bp)"
],
"germline": "IGK",
"id": "TACTGTCAGCAGTATGGTAGCTCACCGTACACTTTTGGCCAGGGGACCAA",
"name": "IGKV3-20*01 3//1 IGKJ2*01",
"reads": [
3898
],
"seg": {
"3": {
"delLeft": 1,
"name": "IGKJ2*01",
"start": 35
},
"5": {
"delRight": 3,
"name": "IGKV3-20*01",
"stop": 34
},
"N": 0,
"affectSigns": {
"seq": " ---------------------------- ------------------------",
"start": 1,
"stop": 76
},
"affectValues": {
"seq": "____kkkkkkkkkkkkkkkkkkkkkkkkkkkk__________KKKKKKKKKKKKKKKKKKKKKKKK",
"start": 1,
"stop": 76
},
"cdr3": {
"aa": "QQYGSSPYT",
"start": 15,
"stop": 41
},
"evalue": {
"val": "1.075621e-39"
},
"evalue_left": {
"val": "4.476181e-43"
},
"evalue_right": {
"val": "1.075173e-39"
},
"junction": {
"aa": "CQQYGSSPYTF",
"productive": true,
"start": 12,
"stop": 44
},
"quality": {
"seq": "!!!!!!!!IGIIIHGIHHIIIIIIIIJIIIIIIJIIJIJIIIIJJJIJJIJJIJJJII!!!!!!!!!!!!!!!!!!",
"start": 1,
"stop": 76
}
},
"seg_stat": {
"3": 3898
},
"sequence": "CAGTGTATTACTGTCAGCAGTATGGTAGCTCACCGTACACTTTTGGCCAGGGGACCAAGCTGGAGATCAAACGAAC",
"top": 2
},
{
"_average_read_length": [
76.0
],
"_coverage": [
1.0
],
"_coverage_info": [
"76 bp (100% of 76.0 bp)"
],
"germline": "IGK",
"id": "TACTGTCAACAGTATGATAATCTCCCGTACACTTTTGGCCAGGGGACCAA",
"name": "IGKV1-33*01 3//1 IGKJ2*01",
"reads": [
2597
],
"seg": {
"3": {
"delLeft": 1,
"name": "IGKJ2*01",
"start": 39
},
"5": {
"delRight": 3,
"name": "IGKV1-33*01",
"stop": 38
},
"N": 0,
"affectSigns": {
"seq": "---------------------------- ----------------------------",
"start": 1,
"stop": 76
},
"affectValues": {
"seq": "kkkkkkkkkkkkkkkkkkkkkkkkkkkk__________KKKKKKKKKKKKKKKKKKKKKKKKKKKK",
"start": 1,
"stop": 76
},
"cdr3": {
"aa": "QQYDNLPYT",
"start": 19,
"stop": 45
},
"evalue": {
"val": "2.592885e-47"
},
"evalue_left": {
"val": "1.296443e-47"
},
"evalue_right": {
"val": "1.296443e-47"
},
"junction": {
"aa": "CQQYDNLPYTF",
"productive": true,
"start": 16,
"stop": 48
},
"quality": {
"seq": "!!!!!!!!!!!!IIIIIIHHHIHIIJIIIIIIGHHIIIJJJJIJIIJIIJIJJJJJJJIJJJ!!!!!!!!!!!!!!",
"start": 1,
"stop": 76
}
},
"seg_stat": {
"3": 2597
},
"sequence": "ATTGCAACATATTACTGTCAACAGTATGATAATCTCCCGTACACTTTTGGCCAGGGGACCAAGCTGGAGATCAAAC",
"top": 3
},
{
"_average_read_length": [
76.0
],
"_coverage": [
0.973684191703796
],
"_coverage_info": [
"74 bp (97% of 76.0 bp)"
],
"germline": "IGK",
"id": "TACTGTCAGCAATATTATAGTACTCCGTACACTTTTGGCCAGGGGACCAA",
"name": "IGKV4-1*01 3//1 IGKJ2*01",
"reads": [
2520
],
"seg": {
"3": {
"delLeft": 1,
"name": "IGKJ2*01",
"start": 35
},
"5": {
"delRight": 3,
"name": "IGKV4-1*01",
"stop": 34
},
"N": 0,
"affectSigns": {
"seq": " ---------------------------- ------------------------",
"start": 1,
"stop": 74
},
"affectValues": {
"seq": "__kkkkkkkkkkkkkkkkkkkkkkkkkkkk__________KKKKKKKKKKKKKKKKKKKKKKKK",
"start": 1,
"stop": 74
},
"cdr3": {
"aa": "QQYYSTPYT",
"start": 15,
"stop": 41
},
"evalue": {
"val": "1.075178e-39"
},
"evalue_left": {
"val": "5.525986e-45"
},
"evalue_right": {
"val": "1.075173e-39"
},
"junction": {
"aa": "CQQYYSTPYTF",
"productive": true,
"start": 12,
"stop": 44
},
"quality": {
"seq": "!!!!!!!!IHIIIIHIIIJIIIIIIIIIHGIGIJJJIJJJIIJJJJIIJJIJIJJJII!!!!!!!!!!!!!!!!",
"start": 1,
"stop": 74
}
},
"seg_stat": {
"3": 2520
},
"sequence": "CAGTTTATTACTGTCAGCAATATTATAGTACTCCGTACACTTTTGGCCAGGGGACCAAGCTGGAGATCAAACGA",
"top": 4
},
{
"_average_read_length": [
76.0
],
"_coverage": [
1.0
],
"_coverage_info": [
"76 bp (100% of 76.0 bp)"
],
"germline": "IGK",
"id": "TGTCAGCAGTATAATAACTGGCCTCCGCTCACTTTCGGCGGAGGGACCAA",
"name": "IGKV3-15*01 0//0 IGKJ4*01",
"reads": [
2502
],
"seg": {
"3": {
"delLeft": 0,
"name": "IGKJ4*01",
"start": 46
},
"5": {
"delRight": 0,
"name": "IGKV3-15*01",
"stop": 45
},
"N": 0,
"affectSigns": {
"seq": "--------------------- -----------------------------------",
"start": 1,
"stop": 76
},
"affectValues": {
"seq": "kkkkkkkkkkkkkkkkkkkkk__________KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK",
"start": 1,
"stop": 76
},
"cdr3": {
"aa": "QQYNNWPPLT",
"start": 23,
"stop": 52
},
"evalue": {
"val": "9.343769e-34"
},
"evalue_left": {
"val": "9.343769e-34"
},
"evalue_right": {
"val": "1.798806e-61"
},
"junction": {
"aa": "CQQYNNWPPLTF",
"productive": true,
"start": 20,
"stop": 55
},
"quality": {
"seq": "!!!!!!!!!!!!!!!!!!!HIHIIIIIIIJJIIHIIIIIIIIIIIJIIIIIIIIIJIIJJIIIIIIHIH!!!!!!!",
"start": 1,
"stop": 76
}
},
"seg_stat": {
"3": 2502
},
"sequence": "AGATTTTGCAGTTTATTACTGTCAGCAGTATAATAACTGGCCTCCGCTCACTTTCGGCGGAGGGACCAAGGTGGAG",
"top": 5
}
],
"diversity": {
"index_Ds_diversity": 0.999757289886475,
"index_E_equitability": 0.710613250732422,
"index_H_entropy": 9.92569569544503
},
"germlines": {
"custom": {
"3": [],
"4": [],
"5": [],
"shortcut": "X"
},
"ref": "http://www.vidjil.org/germlines/germline-49.tar.gz",
"species": "Homo sapiens",
"species_taxon_id": 9606
},
"reads": {
"germline": {
"IGH": [
190947
],
"IGH+": [
78925
],
"IGK": [
657338
],
"IGK+": [
0
],
"IGL": [
235104
],
"TRA": [
0
],
"TRA+D": [
0
],
"TRB": [
2
],
"TRB+": [
0
],
"TRD": [
0
],
"TRD+": [
0
],
"TRG": [
0
],
"unexpected": [
2168
]
},
"segmented": [
1164484
],
"total": [
105515154
]
},
"samples": {
"commandline": [
"vidjil-algo -o result/ -c clones -3 -z 100 -r 1 -ggermline/homo-sapiens.g -e 1 -2 -d -w 50 /mnt/data/sequence_file.fastq.gz "
],
"log": [
" ==> junction detected in 1164484 reads (1.1%)\n ==> found 94208 windows in 1164484 reads (1.1% of 105515154 reads)\n ! There are not so many CDR3 windows found in this set of reads.\n ! Please check the unsegmentation causes below and refer to the documentation.\n reads av. len clones clo/rds\n IGH -> 190947 76.0 20065 0.105\n IGH+ -> 78925 76.0 10363 0.131\n IGK -> 657338 76.0 43921 0.067\n IGK+ -> 0 - 0 -\n IGL -> 235104 76.0 19478 0.083\n TRA -> 0 - 0 -\n TRA+D -> 0 - 0 -\n TRB -> 2 76.0 1 0.500\n TRB+ -> 0 - 0 -\n TRD -> 0 - 0 -\n TRD+ -> 0 - 0 -\n TRG -> 0 - 0 -\n unexpected -> 2168 76.0 380 0.175\n\n SEG -> 1164484 76.0\n SEG_+ -> 2858 76.0\n SEG_- -> 1161626 76.0\n SEG changed w -> 412847 76.0\n\n UNSEG too short -> 0 -\n UNSEG strand -> 395256 76.0\n UNSEG too few V/J -> 91046588 76.0\n UNSEG only V/5' -> 11147607 76.0\n UNSEG only J/3' -> 1760542 76.0\n UNSEG < delta_min -> 0 -\n UNSEG ambiguous -> 677 76.0\n UNSEG too short w -> 0 -\n"
],
"number": 1,
"original_names": [
"sequence_file"
],
"producer": [
"vidjil-algo 2018.02"
],
"run_timestamp": [
"2019-01-01 01:01:01"
]
},
"similarity": [
],
"vidjil_json_version": "2016b",
"warn": [
{
"code": "W20",
"msg": "Very few V(D)J recombinations found: 1.10%"
}
]
}
......@@ -49,7 +49,17 @@ SIMILARITY_LIMIT = 100
GERMLINES_ORDER = ['TRA', 'TRB', 'TRG', 'TRD', 'DD', 'IGH', 'DHJH', 'IJK', 'IJL']
####
AVAILABLE_AXES = [
"top", "germline", "name",
"seg5", "seg4", "seg4a", "seg4b", "seg3",
"evalue", "lenSeqAverage", "lenCDR3", "lenJunction",
"seg5_delRight", "seg3_delLeft", "seg4_delRight", "seg4_delLeft",
"seg5_stop", "seg3_start", "seg4_stop", "seg4_start", "cdr3_stop", "cdr3_start",
"junction_stop", "junction_start", "productive",
"insert_53", "insert_54", "insert_43",
"evalue", "evalue_left", "evalue_right",
]
class Window:
# Should be renamed "Clone"
......@@ -71,8 +81,73 @@ class Window:
>>> (w2 + w4).d["test"]
[0, 'plop']
'''
>>> w1.get_values("name")
'?'
>>> w1.get_values("top")
3
>>> w7 = Window(1)
>>> w7.d = seg_w7
>>> w7.d["seg"]["5"]["name"]
'TRAV17*01'
>>> w7.get_values("reads")
16
>>> w7.get_values("top")
26
>>> w7.get_values("germline")
'TRA'
>>> w7.get_values("name")
'TRAV17*01 0/CCGGGG/5 TRAJ40*01'
>>> w7.get_values("seg5")
'TRAV17*01'
>>> w7.get_values("seg4a")
'?'
>>> w7.get_values("seg4")
'?'
>>> w7.get_values("seg4b")
'?'
>>> w7.get_values("seg3")
'TRAJ40*01'
>>> w7.get_values("evalue")
'1.180765e-43'
>>> w7.get_values("evalue_left")
'1.296443e-47'
>>> w7.get_values("evalue_right")
'1.180635e-43'
>>> w7.get_values("seg5_delRight")
0
>>> w7.get_values("seg3_delLeft")
5
>>> w7.get_values("seg4_delRight")
'?'
>>> w7.get_values("seg4_delLeft")
'?'
>>> w7.get_values("seg5_stop")
105
>>> w7.get_values("seg3_start")
112
>>> w7.get_values("seg4_stop")
'?'
>>> w7.get_values("seg4_start")
'?'
>>> w7.get_values("cdr3_start")
96
>>> w7.get_values("cdr3_stop")
133
>>> w7.get_values("junction_start")
93
>>> w7.get_values("junction_stop")
136
>>> w7.get_values("productive")
True
>>> w7.get_values("unavailable_axis")
'unknow_axis'
'''
### init Window with the minimum data required
def __init__(self, size):
self.d={}
......@@ -123,6 +198,88 @@ class Window:
def get_nb_reads(self, cid, point=0):
return self[cid]["reads"][point]
def get_reads(self, t=-1):
if t== -1:
return self.d["reads"]
else:
return self.d["reads"][t]
def is_segmented(self):
if not "seg" in self.d.keys():
return False
return True
def get_axes_values(self, axes, timepoint):
"""Return the values of given axes for this clone at a gvien timepoint"""
values = []
for axis in axes:
values.append(self.get_values(axis, timepoint))
return values
def get_values(self, axis, timepoint=0):
"""Return the value of an axis for this clone at a given timepoint"""
axes = {
"top": ["top"],
"germline": ["germline"],
"name": ["name"],
"reads": ["reads"],
"seg5": ["seg", "5", "name"],
"seg4": ["seg", "4", "name"],
"seg4a": ["seg", "4a", "name"],
"seg4b": ["seg", "4b", "name"],
"seg3": ["seg", "3", "name"],
"evalue": ["seg", "evalue","val"],
"evalue_left": ["seg", "evalue_left","val"],
"evalue_right": ["seg", "evalue_right","val"],
"seg5_delRight": ["seg","5","delRight"],
"seg3_delLeft": ["seg","3","delLeft"],
"seg4_delRight": ["seg","4","delRight"],
"seg4_delLeft": ["seg","4","delLeft"],
### Positions
"seg5_stop": ["seg","5","stop"],
"seg3_start": ["seg","3","start"],
"seg4_stop": ["seg","4","stop"],
"seg4_start": ["seg","4","start"],
"cdr3_stop": ["seg","cdr3","stop"],
"cdr3_start": ["seg","cdr3","start"],
"junction_stop": ["seg","junction","stop"],
"junction_start": ["seg","junction","start"],
"productive": ["seg","junction","productive"],
"lenSeqAverage" : ["_average_read_length"]
}
### length
axes_length = {
"insert_53": ["seg3_start", "seg5_stop", -1],
"insert_54": ["seg4_start", "seg5_stop", -1],
"insert_43": ["seg3_start", "seg4_stop", -1],
"lenCDR3": ["cdr3_stop", "cdr3_start", 0],
"lenJunction": ["junction_stop", "junction_start", 0]
}
try:
if axis in axes.keys():
path = axes[axis]
depth = 0
value = self.d
if path[0] == "seg" and not self.is_segmented():
return "?"
while depth != len(path):
value = value[path[depth]]
depth += 1
if type(value) is list:
# In these cases, should be a list of value at different timepoint
return value[timepoint]
return value
return "unknow_axis"
except:
return "?"
def latex(self, point=0, base_germline=10000, base=10000, tag=''):
reads = self.d["reads"][point]
ratio_germline = float(reads)/base_germline
......@@ -322,7 +479,7 @@ class ListWindows(VidjilJson):
if s*1. / self.d['reads'].d['segmented'][i] >= ranges[r]:
break
result[r][i] += s
for r in range(len(ranges)):
ratio_in_string = '{0:.10f}'.format(ranges[r]).rstrip('0')
self.d['reads'].d['distribution'][ratio_in_string] = result[r]
......@@ -442,6 +599,27 @@ class ListWindows(VidjilJson):
obj.d["reads"] = self.d["reads"] + other.d["reads"]
obj.d["diversity"] = self.d["diversity"] + other.d["diversity"]
try:
### Verify that same file is not present twice
filename_jlist1 = list(self.d["distributions"]["repertoires"].keys())
filename_jlist2 = list(other.d["distributions"]["repertoires"].keys())
for filename in filename_jlist1:
if filename in filename_jlist2:
raise( "Error, duplicate file name (in distributions) ")
### Append distributions of each files
obj.d["distributions"] = {}
obj.d["distributions"]["repertoires"] = {}
obj.d["distributions"]["keys"] = ["clones", "reads"]
obj.d["distributions"]["filters"] = {}
obj.d["distributions"]["categories"] = {}
for filename in filename_jlist1:
obj.d["distributions"]["repertoires"][filename] = self.d["distributions"]["repertoires"][filename]
for filename in filename_jlist2:
obj.d["distributions"]["repertoires"][filename] = other.d["distributions"]["repertoires"][filename]
except:
pass
return obj
###
......@@ -638,6 +816,71 @@ class ListWindows(VidjilJson):
return obj_dict
def get_filename_pos(self, filename):
""" filename is the key of distributions repertoires """
return self.d["samples"].d["original_names"].index(filename)
# ========================= #
# Distributions computing #
# ========================= #
def init_distrib(self, list_distrib):
""" Create distributions structures """
self.d["distributions"] = {}
self.d["distributions"]["repertoires"] = defaultdict(lambda:[])
self.d["distributions"]["keys"] = ["clones", "reads"]
self.d["distributions"]["filters"] = []
# Warning, will fail of there are same original_names; fixme
nb_sample = self.d["samples"].d["number"]
# prefill distribution for each file of jlist
for filename in self.d["samples"].d["original_names"]:
for distrib in list_distrib:
self.d["distributions"]["repertoires"][filename].append({"axes":distrib,"values":defaultdict(lambda: False)})
return
def compute_distribution(self, list_distrib):
""" Compute the distributions given in list_distrib """
for filename in self.d["samples"].d["original_names"]:
timepoint = self.get_filename_pos(filename)
for clone in self.d["clones"]:
if clone.d["reads"][timepoint] != 0:
for distrib_pos in range( len(self.d["distributions"]["repertoires"][filename]) ):
distrib = list_distrib[distrib_pos]
clone_values = clone.get_axes_values( distrib, timepoint )
nb_reads = clone.get_reads(self.get_filename_pos(filename))
self.add_clone_values( filename, distrib_pos, clone_values, nb_reads)
return
def add_clone_values(self, filename, distrib_pos, values, nb_reads):
""" Add value of a clone to a specific distribution """
obj = self.d["distributions"]["repertoires"][filename][distrib_pos]["values"]
obj = self.recursive_add(obj, values, nb_reads) ## Add by recursion
self.d["distributions"]["repertoires"][filename][distrib_pos]["values"] = obj
return
def recursive_add( self, obj, values, nb_reads):
"""
Add (1, nb_reads) by recursively iterating over values.
For example self.recursive_add(obj, [val1, val2], nb_reads) add (1, nb_reads) to obj[val1][val2]