Commit 4f23358a authored by flothoni's avatar flothoni

Morisita index; update function from fuse.

* update tests
* Move results into 'overlaps' key in vidjil export
Link to #3830
parent a8b270c7
Pipeline #159811 failed with stages
in 30 seconds
......@@ -1239,21 +1239,29 @@ class ListWindows(VidjilJson):
##########################
### Morisita's index ###
##########################
def computeMorisita(self):
morisita = defaultdict( lambda: defaultdict( lambda: False ) )
jaccard = defaultdict( lambda: defaultdict( lambda: False ) )
def computeOverlaps(self):
'''
Compute, for each combination of samples, various overlap indices
'''
nb_sample = self.d["samples"].d["number"]
self.d["overlaps"] = {}
self.d["overlaps"]["morisita"] = []
self.d["overlaps"]["jaccard"] = []
for pos_0 in range(0, nb_sample):
morisita = []
jaccard = []
for pos_1 in range(0, nb_sample):
morisita[pos_0][pos_1] = self.compute_one_morisita(pos_0, pos_1)
jaccard[pos_0][pos_1] = self.compute_one_Jaccard_index(pos_0, pos_1)
self.d["morisita"] = morisita
self.d["jaccard"] = jaccard
print( "Overlap: %s vs %s" % (pos_0, pos_1) )
morisita.append( self.computeOverlapMorisita(pos_0, pos_1) )
jaccard.append( self.computeOverlapJaccard(pos_0, pos_1) )
self.d["overlaps"]["morisita"].append( morisita)
self.d["overlaps"]["jaccard"].append( jaccard )
return
def compute_one_morisita(self, pos_0, pos_1):
def computeOverlapMorisita(self, pos_0, pos_1):
"""
Morisita-Horn similarity index
This index applies to quantitative data.
......@@ -1268,6 +1276,7 @@ class ListWindows(VidjilJson):
# Values between 0 (completely different communities) and 1 (maximal similarity).
# Two groups are similar (poor diversity) if the CMH value is above 0.5
# and dissimilar if the value is below 0.5 (high diversity).
!!! Computed only on present clones (so should be run with `-Y all`)
"""
index_div = "index_Ds_diversity"
clones = self.d["clones"]
......@@ -1283,17 +1292,19 @@ class ListWindows(VidjilJson):
ai = clone.d["reads"][pos_0]
bi = clone.d["reads"][pos_1]
m += (ai * bi)
da += ( (ai*ai) / Na )
db += ( (bi*bi) / Nb )
da += (ai*ai)
db += (bi*bi)
m *= 2
d = ( (da/Na)+(db/Nb))*(Na*Nb)
m = m/d
d = (da+db)/2
if m == 0: #if really no shared clones
res = 0
else:
res = round( (m/d), 3)
return m
return res
def compute_one_Jaccard_index(self, pos_0, pos_1):
def computeOverlapJaccard(self, pos_0, pos_1):
"""
Jaccard similarity index
Formula :
......@@ -1315,8 +1326,10 @@ class ListWindows(VidjilJson):
bi = clone.d["reads"][pos_1]
Nc += bool(ai * bi)
# print( "Nc: %s" % Nc)
if Nc == 0: # if really no shared clones
return 0
I = Nc / (N1 + N2 - Nc)
I = round( (Nc / (N1 + N2 - Nc)), 3)
return I
......@@ -1652,7 +1665,7 @@ def main():
print("### Morisita")
jlist_fused.computeMorisita()
jlist_fused.computeOverlaps()
if args.no_clones:
# TODO: do not generate the list of clones in this case
......
This diff is collapsed.
This diff is collapsed.
{
"notes":"Sample with clone 1-5; default reads to 1000",
"clones": [
{
"germline": "IGK",
"id": "clone_1",
"reads": [
1000
],
"top": 1
}, {
"germline": "IGK",
"id": "clone_2",
"reads": [
1000
],
"top": 2
}, {
"germline": "IGK",
"id": "clone_3",
"reads": [
1000
],
"top": 3
}, {
"germline": "IGK",
"id": "clone_4",
"reads": [
1000
],
"top": 4
}, {
"germline": "IGK",
"id": "clone_5",
"reads": [
1000
],
"top": 5
}, {
"germline": "IGK",
"id": "clone_6",
"reads": [
0
],
"top": 6
}, {
"germline": "IGK",
"id": "clone_7",
"reads": [
0
],
"top": 7
}, {
"germline": "IGK",
"id": "clone_8",
"reads": [
0
],
"top": 8
}
],
"clusters": [],
"diversity": {
"index_Ds_diversity": [
0.999773621559143
],
"index_E_equitability": [
0.718931078910828
],
"index_H_entropy": [
10.0674523124544
]
},
"germlines": {},
"producer": "vidjil fuse",
"reads": {
"germline": {
"IGK": [
9500
]
},
"segmented": [
9500
],
"total": [
9500
]
},
"samples": {
"commandline": [
"cmdline_1"
],
"log": [
"log_1"
],
"number": 1,
"original_names": [
"seq_1"
],
"producer": [
"vidjil-algo 2018.02"
],
"run_timestamp": [
"2018-03-28 10:26:11"
],
"timestamp": [
"2018-04-06 13:48:48"
]
},
"timestamp": "2019-04-08 15:18:00",
"vidjil_json_version": "2016b",
"warn": [
]
}
\ No newline at end of file
{
"notes":"Sample with clone 1-5; default reads to 500 (half sample 1)",
"clones": [
{
"germline": "IGK",
"id": "clone_1",
"reads": [
500
],
"top": 1
}, {
"germline": "IGK",
"id": "clone_2",
"reads": [
500
],
"top": 2
}, {
"germline": "IGK",
"id": "clone_3",
"reads": [
500
],
"top": 3
}, {
"germline": "IGK",
"id": "clone_4",
"reads": [
500
],
"top": 4
}, {
"germline": "IGK",
"id": "clone_5",
"reads": [
500
],
"top": 5
}, {
"germline": "IGK",
"id": "clone_6",
"reads": [
0
],
"top": 6
}, {
"germline": "IGK",
"id": "clone_7",
"reads": [
0
],
"top": 7
}, {
"germline": "IGK",
"id": "clone_8",
"reads": [
0
],
"top": 8
}
],
"clusters": [],
"diversity": {
"index_Ds_diversity": [
0.999773621559143
],
"index_E_equitability": [
0.718931078910828
],
"index_H_entropy": [
10.0674523124544
]
},
"germlines": {},
"producer": "vidjil fuse",
"reads": {
"germline": {
"IGK": [
9500
]
},
"segmented": [
9500
],
"total": [
9500
]
},
"samples": {
"commandline": [
"cmdline_2"
],
"log": [
"log_2"
],
"number": 1,
"original_names": [
"seq_2"
],
"producer": [
"vidjil-algo 2018.02"
],
"run_timestamp": [
"2018-03-28 10:26:11"
],
"timestamp": [
"2018-04-06 13:48:48"
]
},
"timestamp": "2019-04-08 15:18:00",
"vidjil_json_version": "2016b",
"warn": [
]
}
\ No newline at end of file
{
"notes":"Sample with clone 1-4; default reads to 1000 (differs on clone 5 compared to sample 1)",
"clones": [
{
"germline": "IGK",
"id": "clone_1",
"reads": [
1000
],
"top": 1
}, {
"germline": "IGK",
"id": "clone_2",
"reads": [
1000
],
"top": 2
}, {
"germline": "IGK",
"id": "clone_3",
"reads": [
1000
],
"top": 3
}, {
"germline": "IGK",
"id": "clone_4",
"reads": [
1000
],
"top": 4
}, {
"germline": "IGK",
"id": "clone_5",
"reads": [
0
],
"top": 5
}, {
"germline": "IGK",
"id": "clone_6",
"reads": [
0
],
"top": 6
}, {
"germline": "IGK",
"id": "clone_7",
"reads": [
0
],
"top": 7
}, {
"germline": "IGK",
"id": "clone_8",
"reads": [
0
],
"top": 8
}
],
"clusters": [],
"diversity": {
"index_Ds_diversity": [
0.999773621559143
],
"index_E_equitability": [
0.718931078910828
],
"index_H_entropy": [
10.0674523124544
]
},
"germlines": {},
"producer": "vidjil fuse",
"reads": {
"germline": {
"IGK": [
9500
]
},
"segmented": [
9500
],
"total": [
9500
]
},
"samples": {
"commandline": [
"cmdline_3"
],
"log": [
"log_3"
],
"number": 1,
"original_names": [
"seq_3"
],
"producer": [
"vidjil-algo 2018.02"
],
"run_timestamp": [
"2018-03-28 10:26:11"
],
"timestamp": [
"2018-04-06 13:48:48"
]
},
"timestamp": "2019-04-08 15:18:00",
"vidjil_json_version": "2016b",
"warn": [
]
}
\ No newline at end of file
{
"notes":"Sample with clone 5-8; default reads to 1000",
"clones": [
{
"germline": "IGK",
"id": "clone_1",
"reads": [
0
],
"top": 1
}, {
"germline": "IGK",
"id": "clone_2",
"reads": [
0
],
"top": 2
}, {
"germline": "IGK",
"id": "clone_3",
"reads": [
0
],
"top": 3
}, {
"germline": "IGK",
"id": "clone_4",
"reads": [
0
],
"top": 4
}, {
"germline": "IGK",
"id": "clone_5",
"reads": [
1000
],
"top": 5
}, {
"germline": "IGK",
"id": "clone_6",
"reads": [
1000
],
"top": 6
}, {
"germline": "IGK",
"id": "clone_7",
"reads": [
1000
],
"top": 7
}, {
"germline": "IGK",
"id": "clone_8",
"reads": [
1000
],
"top": 8
}
],
"clusters": [],
"diversity": {
"index_Ds_diversity": [
0.999773621559143
],
"index_E_equitability": [
0.718931078910828
],
"index_H_entropy": [
10.0674523124544
]
},
"germlines": {},
"producer": "vidjil fuse",
"reads": {
"germline": {
"IGK": [
9500
]
},
"segmented": [
9500
],
"total": [
9500
]
},
"samples": {
"commandline": [
"cmdline_4"
],
"log": [
"log_4"
],
"number": 1,
"original_names": [
"seq_4"
],
"producer": [
"vidjil-algo 2018.02"
],
"run_timestamp": [
"2018-03-28 10:26:11"
],
"timestamp": [
"2018-04-06 13:48:48"
]
},
"timestamp": "2019-04-08 15:18:00",
"vidjil_json_version": "2016b",
"warn": [
]
}
\ No newline at end of file
{
"notes":"Sample with clone 1-10; reads decreasing from 1000 to 100 in steps of 100",
"clones": [
{
"germline": "IGK",
"id": "clone_1",
"reads": [
1000
],
"top": 1
}, {
"germline": "IGK",
"id": "clone_2",
"reads": [
900
],
"top": 2
}, {
"germline": "IGK",
"id": "clone_3",
"reads": [
800
],
"top": 3
}, {
"germline": "IGK",
"id": "clone_4",
"reads": [
700
],
"top": 4
}, {
"germline": "IGK",
"id": "clone_5",
"reads": [
600
],
"top": 5
}, {
"germline": "IGK",
"id": "clone_6",
"reads": [
500
],
"top": 6
}, {
"germline": "IGK",
"id": "clone_7",
"reads": [
400
],
"top": 7
}, {
"germline": "IGK",
"id": "clone_8",
"reads": [
300
],
"top": 8
}, {
"germline": "IGK",
"id": "clone_9",
"reads": [
200
],
"top": 9
}, {
"germline": "IGK",
"id": "clone_10",
"reads": [
100
],
"top": 1
}
],
"clusters": [],
"diversity": {
"index_Ds_diversity": [
0.999773621559143
],
"index_E_equitability": [
0.718931078910828
],
"index_H_entropy": [
10.0674523124544
]
},
"germlines": {},
"producer": "vidjil fuse",
"reads": {
"germline": {
"IGK": [
9500
]
},
"segmented": [
9500
],
"total": [
9500
]
},
"samples": {
"commandline": [
"cmdline_gradient_dec"
],
"log": [
"log_gradient_dec"
],
"number": 1,
"original_names": [
"seq_gradient_dec"
],
"producer": [
"vidjil-algo 2018.02"
],
"run_timestamp": [
"2018-03-28 10:26:11"
],
"timestamp": [
"2018-04-06 13:48:48"
]
},
"timestamp": "2019-04-08 15:18:00",
"vidjil_json_version": "2016b",
"warn": [
]
}
\ No newline at end of file
{
"notes":"Sample with increasing reads gradient, starting at 100 (clone_1: 100, clone_2: 200, ...)",
"clones": [
{
"germline": "IGK",
"id": "clone_1",
"reads": [
100
],
"top": 1
}, {
"germline": "IGK",
"id": "clone_2",
"reads": [
200
],