diff --git a/algo/core/segment.h b/algo/core/segment.h index 4641e0baa3064c5670bc40a9c3396a338f64de3e..a9e0e4bc5777375fc5ec2830f3c7b2d65d4501b6 100644 --- a/algo/core/segment.h +++ b/algo/core/segment.h @@ -89,6 +89,8 @@ const char* const segmented_mesg[] = { "?", "UNSEG too short w", } ; +#define ALL_LOCI "all" + // Unproductivity causes #define UNPROD_TOO_SHORT "too-short" #define UNPROD_OUT_OF_FRAME "out-of-frame" diff --git a/algo/core/windows.cpp b/algo/core/windows.cpp index 5366dfa4e06393651627921f8f886a321b471ca8..d1c6730653756639215f006dc4ff28a3b2ca9896 100644 --- a/algo/core/windows.cpp +++ b/algo/core/windows.cpp @@ -233,35 +233,66 @@ ostream &WindowsStorage::printSortedWindows(ostream &os) { -json WindowsStorage::computeDiversity(int nb_segmented) { +json WindowsStorage::computeDiversity(map nb_segmented) { - double index_H_entropy = 0.0 ; - double index_1_minus_Ds_diversity = 0.0 ; + map index_H_entropy; + map index_1_minus_Ds_diversity; - double nb_seg_nb_seg_m1 = (double) nb_segmented * ((double) nb_segmented - 1); - - for (auto it = seqs_by_window.begin(); it != seqs_by_window.end(); ++it) { + for (auto it = seqs_by_window.begin(); it != seqs_by_window.end(); ++it) + { size_t clone_nb_reads = it->second.getNbInserted(); + string code = getGermline(it->first)->code; - float ratio = (float) clone_nb_reads / nb_segmented ; - index_H_entropy -= ratio * log(ratio) ; + float ratio_all = (float) clone_nb_reads / nb_segmented[ALL_LOCI] ; + float ratio_code = (float) clone_nb_reads / nb_segmented[code] ; + index_H_entropy[ALL_LOCI] -= ratio_all * log(ratio_all) ; + index_H_entropy[code] -= ratio_code * log(ratio_code) ; - index_1_minus_Ds_diversity += ((double) clone_nb_reads * ((double) clone_nb_reads - 1)) / nb_seg_nb_seg_m1 ; + double inc_diversity = ((double) clone_nb_reads * ((double) clone_nb_reads - 1)); + index_1_minus_Ds_diversity[ALL_LOCI] += inc_diversity; + index_1_minus_Ds_diversity[code] += inc_diversity; } - float index_E_equitability = index_H_entropy / log(nb_segmented) ; - float index_Ds_diversity = 1 - index_1_minus_Ds_diversity ; + json jsonDiversity; - cout << "Diversity measures" << endl - << " H = " << index_H_entropy << endl // Shannon's diversity - << " E = " << index_E_equitability << endl // Shannon's equitability - << " Ds = " << index_Ds_diversity << endl // Simpson's diversity - << endl; + for (const auto& kv: index_H_entropy) + { + string code = kv.first ; - json jsonDiversity; - jsonDiversity["index_H_entropy"] = index_H_entropy ; - jsonDiversity["index_E_equitability"] = index_E_equitability ; - jsonDiversity["index_Ds_diversity"] = index_Ds_diversity ; + // Only one read + if (nb_segmented[code] <= 1) + continue ; + + // Shannon's diversity + jsonDiversity["index_H_entropy"][code] = kv.second; + + // Shannon's equitability + double nb_seg_nb_seg_m1 = nb_segmented[code] * (nb_segmented[code] - 1); + jsonDiversity["index_E_equitability"][code] = index_H_entropy[code] / log(nb_segmented[code]) ; + + // Simpson's diversity + jsonDiversity["index_Ds_diversity"][code] = 1 - index_1_minus_Ds_diversity[code] / nb_seg_nb_seg_m1 ; + } + + // Pretty-print + cout << setw(24) << "Diversity measures" ; + for (const auto& kv: index_H_entropy) + cout << setw(6) << kv.first ; + cout << endl; + + for (const string index: {"index_H_entropy", "index_E_equitability", "index_Ds_diversity"}) + { + cout << " " << setw(22) << index ; + for (const auto& kv: index_H_entropy) + { + cout << setw(6) ; + if (jsonDiversity[index].contains(kv.first)) + cout << fixed << setprecision(3) << (float) jsonDiversity[index][kv.first]; + else + cout << "-" ; + } + cout << endl; + } return jsonDiversity; } diff --git a/algo/core/windows.h b/algo/core/windows.h index e4f65a89263ce70961a5c45bdb0fa1ecb3657478..9c73eeeb0dc845f65d7093b121206d286c173fcf 100644 --- a/algo/core/windows.h +++ b/algo/core/windows.h @@ -192,7 +192,7 @@ class WindowsStorage { * @pre should be called before keepInterestingWindows() * Compute, display, and return some diversity measures */ - json computeDiversity(int nb_segmented); + json computeDiversity(map nb_segmented_by_germline); /** * @pre sort() must have been called. diff --git a/algo/tests/data/diversity.fa b/algo/tests/data/diversity.fa new file mode 100644 index 0000000000000000000000000000000000000000..0a8ee1ad69d8ccb8f61f7367134ad7072f648c56 --- /dev/null +++ b/algo/tests/data/diversity.fa @@ -0,0 +1,61 @@ + +# TRA: no diversity, 3 reads / 1 clone + +# 1 - Ds = 1 +# H = 0 +# E = 0 + +>TRA--V1-1*01--J1*01--1 +cctccttctacaggagctccagatgaaagactctgcctcttacttctgcgctgtgagaga +gtatgaaagtattacctcccagttgcaatttggcaaaggaaccagagtttccacttctcc +>TRA--V1-1*01--J1*01--2 +cctccttctacaggagctccagatgaaagactctgcctcttacttctgcgctgtgagaga +gtatgaaagtattacctcccagttgcaatttggcaaaggaaccagagtttccacttctcc +>TRA--V1-1*01--J1*01--3 +cctccttctacaggagctccagatgaaagactctgcctcttacttctgcgctgtgagaga +gtatgaaagtattacctcccagttgcaatttggcaaaggaaccagagtttccacttctcc + +# TRG: full diversity, 3 reads / 3 clones + +# 1 - Ds = 0 +# H = 3 * (- 1/3 ln 1/3) = ln 3 = 1.09861... +# E = H / ln 3 = 1 + +>TRG--V1*01--J1*01--1 +agactgcaaaatctaattaaaaatgattctgggttctattactgtgccacctgggacagg +gaattattataagaaactctttggcagtggaacaacactggttgtcacag +>TRG--V1*01--ACT--J1*01--2 +agactgcaaaatctaattaaaaatgattctgggttctattactgtgccacctgggacagg +ACT +gaattattataagaaactctttggcagtggaacaacactggttgtcacag +>TRG--V1*01--GG--J1*01--3 +agactgcaaaatctaattaaaaatgattctgggttctattactgtgccacctgggacagg +GG +gaattattataagaaactctttggcagtggaacaacactggttgtcacag + +# IGK: 1 clone with 2 reads, 1 clone with 1 read + +# 1 - Ds = (2*1 + 1*0) / (3*2) = 1/3 +# H = - 2/3 ln 2/3 - 1/3 ln 1/3 = 0.63651... +# E = H / ln 3 = 0.57938... + +>IGK--V1-12*01--J1*01--1 +cagcctgcagcctgaagattttgcaacttactattgtcaacaggctaacagtttccctcc +gtggacgttcggccaagggaccaaggtggaaatcaaac +>IGK--V1-12*01--J1*01--2 +cagcctgcagcctgaagattttgcaacttactattgtcaacaggctaacagtttccctcc +gtggacgttcggccaagggaccaaggtggaaatcaaac +>IGK--V1-12*01--CCG--J1*01--3 +cagcctgcagcctgaagattttgcaacttactattgtcaacaggctaacagtttccctcc +CCG +gtggacgttcggccaagggaccaaggtggaaatcaaac + +# IGL: only 1 read, some indices are not defined + +# 1 - Ds = 0 +# H and E not defined + +>IGLV2-14*01 12/4/0 IGLJ3*02 +TAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGG +GCTCCAGGCGAGGACGAGGCTGATTATTACTGCAGCTCATATACAAGCTTTCTT +GGGTGTTCGGCGGAGGGACCAAGCTG \ No newline at end of file diff --git a/algo/tests/should-get-tests/diversity-index.should b/algo/tests/should-get-tests/diversity-index.should new file mode 100644 index 0000000000000000000000000000000000000000..d8f802a2f16a0a7075a5d2668456c98bea39b0bf --- /dev/null +++ b/algo/tests/should-get-tests/diversity-index.should @@ -0,0 +1,35 @@ + +!LAUNCH: $VIDJIL_DIR/$EXEC -c clones -g $VIDJIL_DIR/germline/homo-sapiens.g:IGK ../data/diversity.fa + +$ Focusing on IGK recombination, three reads are detected +1: junction detected in 3 reads + +$ Diversity measures are correct +1:index_Ds_diversity 0.667 0.667 +1:index_H_entropy 0.637 0.637 +1:index_E_equitability 0.579 0.579 + + +!LAUNCH: $VIDJIL_DIR/$EXEC -c clones -g $VIDJIL_DIR/germline/homo-sapiens.g:TRA,TRG,IGK,IGL ../data/diversity.fa + +$ Reads are all detected +1: junction detected in 10 reads + +$ Diversity measures are reported by germline +1:Diversity measures IGK IGL TRA TRG + +$ Diversity measures are correct +1:index_Ds_diversity 0.667 - 0.000 1.000 +1:index_H_entropy 0.637 - 0.000 1.099 +1:index_E_equitability 0.579 - 0.000 1.000 + + +!NO_LAUNCHER: +!LAUNCH: cat out/diversity.vidjil +!OPTIONS: --mod jR + +$ Diversity measures are reported in the json, except when there are not defined +:diversity.index_Ds_diversity.all: 0\.911 +:diversity.index_H_entropy.IGK: 0\.63[67] +:diversity.index_E_equitability.TRA: 0\.0 +0:diversity.index_E_equitability.IGL: diff --git a/algo/tests/should-get-tests/large-r.should b/algo/tests/should-get-tests/large-r.should index 31a50de500912b694afab68b7ae699a5c7a3763d..f9edcf20943133b03aae9fbd51325953fc1f714a 100644 --- a/algo/tests/should-get-tests/large-r.should +++ b/algo/tests/should-get-tests/large-r.should @@ -8,5 +8,5 @@ $ Find a representative $ Compute the diversity. No diversity here. 1: 1 0.000 -1: E = 0.000 -1: Ds = 0.000 +1: E_.* 0.000 +1: Ds_.* 0.000 diff --git a/algo/tests/should-get-tests/multi-detect.should b/algo/tests/should-get-tests/multi-detect.should index c7bf952849d720c472418c27d60b24953ac1bd50..b77f295344d44711c91fb71dc9644e6bff4a8700 100644 --- a/algo/tests/should-get-tests/multi-detect.should +++ b/algo/tests/should-get-tests/multi-detect.should @@ -26,8 +26,8 @@ $ Detect one read on IGL $ Compute the diversity. All windows have only one read, full diversity. 7: 1 1.000 -1: E = 1.000 -1: Ds = 1.000 +1: E.* 1.000 +1: Ds.* 1.000 ### Focusing on Ig recombinations with -g:IGH,IGK,IGL !LAUNCH: $VIDJIL_DIR/$EXEC -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH,IGK,IGL $VIDJIL_DATA/multi-complete.fa diff --git a/algo/vidjil.cpp b/algo/vidjil.cpp index ca2f6f43c22d5441fcd4d016dc944e4bf8a220b9..f757a15f75b5ef6c73c38e2675ab3b616fbbb9f7 100644 --- a/algo/vidjil.cpp +++ b/algo/vidjil.cpp @@ -1298,7 +1298,17 @@ int main (int argc, char **argv) cout << endl; //$$ compute, display and store diversity measures - json jsonDiversity = windowsStorage->computeDiversity(nb_segmented); + json reads_germline; + map nb_segmented_by_germline; + for (list::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it){ + Germline *germline = *it ; + size_t nb = we.getNbReadsGermline(germline->code); + nb_segmented_by_germline[germline->code] = nb; + reads_germline[germline->code] = {nb}; + } + + nb_segmented_by_germline[ALL_LOCI] = nb_segmented; + json jsonDiversity = windowsStorage->computeDiversity(nb_segmented_by_germline); ////////////////////////////////// //$$ min_reads_clone (ou label) @@ -1764,13 +1774,6 @@ int main (int argc, char **argv) windowsStorage->clearSequences(); windowsStorage->sortedWindowsToOutput(&output, max_clones_id); - - json reads_germline; - for (list::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it){ - Germline *germline = *it ; - reads_germline[germline->code] = {we.getNbReadsGermline(germline->code)}; - } - // Complete main output output.set("config", j_config); diff --git a/doc/vidjil-algo.md b/doc/vidjil-algo.md index 3e798a9efecad2b88e68e6a1c8d5c2a555369491..a58123b64bfe5a105118adc525858b92573e62c3 100644 --- a/doc/vidjil-algo.md +++ b/doc/vidjil-algo.md @@ -675,7 +675,8 @@ The `--out-reads` option produces large files, and is not recommended in general ## Diversity measures -Several [diversity indices](https://en.wikipedia.org/wiki/Diversity_index) are reported, both on the standard output and in the `.vidjil` file: +Several [diversity indices](https://en.wikipedia.org/wiki/Diversity_index) are reported, both on the standard output and in the `.vidjil` file, +for each germline/locus as well as for the entire data: - H (`index_H_entropy`): Shannon's diversity - E (`index_E_equitability`): Shannon's equitability @@ -683,7 +684,7 @@ Several [diversity indices](https://en.wikipedia.org/wiki/Diversity_index) are r E ans Ds values are between 0 (no diversity, one clone clusters all analyzed reads) and 1 (full diversity, each analyzed read belongs to a different clone). -These values are now computed on the windows, before any further clustering. +These values are computed on the full list of clones, before any further clustering. PCR and sequencing errors can thus lead to slightly over-estimate the diversity. ## Reads without detected recombinations