From 56996345779a11c7915a4147ba0642a26b232379 Mon Sep 17 00:00:00 2001 From: Mikael Salson Date: Tue, 2 Oct 2018 16:57:38 +0200 Subject: [PATCH] algo/core/math: Add file for probability computation Compute the minimal number of k-mers of an uncertainty range whose mean is given by the empirical number of k-mers found. The probability is provided as an integer as we don't have Z-scores for every possible value and float comparison can be a bit tricky. --- algo/core/math.cpp | 26 ++++++++++++++++++++++++++ algo/core/math.hpp | 19 +++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 algo/core/math.cpp create mode 100644 algo/core/math.hpp diff --git a/algo/core/math.cpp b/algo/core/math.cpp new file mode 100644 index 000000000..f987a26b7 --- /dev/null +++ b/algo/core/math.cpp @@ -0,0 +1,26 @@ +#include "math.hpp" +#include +#include +#include +#include +#include "tools.h" + +using namespace std; + +const static map Z_SCORES = {{90, 1.2816}, {95, 1.6449}, {99, 2.3263}, {999, 3.0902}}; + +int compute_nb_kmers_limit(int kmer_size, int nb_occ, int sequence_length, int p_value) { + if (Z_SCORES.count(p_value) == 0) { + throw invalid_argument("You can't use a p_value of "+to_string(p_value)); + } + float empirical_proba = nb_occ * 1. / (sequence_length - kmer_size + 1); + if (empirical_proba > 1) + return nb_occ; + + float min_proba = empirical_proba - Z_SCORES.at(p_value) * sqrt(empirical_proba * (1 - empirical_proba) / (sequence_length - kmer_size + 1)); + if (min_proba <= 0) + return 0; + + // “Converts” the probability to a number of k-mers + return min_proba * (sequence_length - kmer_size + 1); +} diff --git a/algo/core/math.hpp b/algo/core/math.hpp new file mode 100644 index 000000000..975e33ba7 --- /dev/null +++ b/algo/core/math.hpp @@ -0,0 +1,19 @@ +#ifndef MATH_HPP +#define MATH_HPP + +/** + * Compute the minimal number of k-mers that is not significantly different from nb_occ + * Given that the k-mers used are those used for `affect` on the sequence `sequence`. + * @param kmer_size: size of the kmer + * @param nb_occ: Number of occurrences found on this sequence + * @param sequence_length: The sequence length + * @param p_value: The p_value to be used. The lower the p_value, the lower the value will be returned + * Beware! The p_value must be given as an integer to prevent issues with float equalities. + * Therefore a pvalue of .95 should be given as 95 and a p-value of .999 should be given as 999. + * Not all p-values are possible. Please refer to the keys of Z_SCORES. + * @return The lower bound of the confidence interval assumuing that the center of the confidence + * interval is given by `nb_occ`. In all cases the returned value will be >= 0. + */ +int compute_nb_kmers_limit(int kmer_size, int nb_occ, int sequence_length, int p_value=95); + +#endif -- 2.24.1