Commit ce0e3d9b authored by Mathieu Giraud's avatar Mathieu Giraud

Merge branch 'feature-a/optimize-proba-computations' into 'dev'

optimize proba computations

See merge request !679
parents 582926c8 3111983f
Pipeline #149634 failed with stages
in 25 minutes and 2 seconds
#ifndef AUTOMATON_H
#define AUTOMATON_H
#include <list>
#include <vector>
#include <cctype>
#include "kmerstore.h"
#include <cstdlib>
......@@ -53,7 +53,7 @@ public:
/**
* @return the information stored for this state
*/
virtual list<Info> &getInfo(void *state) = 0;
virtual vector<Info> &getInfo(void *state) = 0;
/**
* @param starting_state: the starting state for traversing the automate.
......@@ -104,7 +104,7 @@ class pointer_state {
public:
pointer_state<Info> *transitions[NB_TRANSITIONS]; /* Transitions to the 5 nt */
bool is_final;
list<Info> informations; /* != NULL when is_final */
vector<Info> informations; /* != NULL when is_final */
pointer_state():is_final(false),informations() {
for (size_t i = 0; i < NB_TRANSITIONS; i++)
......@@ -164,7 +164,7 @@ public:
/**
* @return the information stored for this state
*/
list<Info> &getInfo(void *state);
vector<Info> &getInfo(void *state);
/**
* @return the automaton initial state
......
......@@ -4,6 +4,7 @@
#include "automaton.h"
#include <stack>
#include <set>
#include <list>
//////////////////// IMPLEMENTATIONS ////////////////////
template <class Info>
......@@ -160,7 +161,7 @@ void PointerACAutomaton<Info>::build_failure_functions() {
}
template <class Info>
list<Info> &PointerACAutomaton<Info>::getInfo(void *state) {
vector<Info> &PointerACAutomaton<Info>::getInfo(void *state) {
return ((pointer_state<Info> *)state)->informations;
}
......
......@@ -178,7 +178,7 @@ namespace std {
template <>
struct hash<KmerAffect> {
size_t operator()(const KmerAffect &affect) const {
return (affect.getLabel()[0] << 8) | affect.getLength();
return (((unsigned char) affect.affect.c << 8) | (affect.getLength()));
}
};
}
......
......@@ -46,6 +46,10 @@ string Kmer::getLabel() const{
return "";
}
int Kmer::getStrand() const{
return 1;
}
size_t Kmer::getLength() const{
return 10;
}
......
......@@ -9,13 +9,13 @@
#include <math.h>
#include "bioreader.hpp"
#include "tools.h"
#include "proba.h"
using namespace std;
typedef
enum { KMER_INDEX, AC_AUTOMATON } IndexTypes;
#define MAX_PRECOMPUTED_PROBA 500 /* Precompute 500 probabilities for each index load */
class Kmer {
public:
unsigned int count;
......@@ -57,6 +57,11 @@ public:
* it only there for a reason of compatibility with KmerAffect)
*/
bool isAmbiguous() const;
/**
* @return 1 (only there for compatibility reasons with KmerAffect)
*/
int getStrand() const;
} ;
ostream &operator<<(ostream &os, const Kmer &kmer);
bool operator==(const Kmer &k1, const Kmer &k2);
......@@ -80,9 +85,8 @@ protected:
size_t nb_kmers_inserted;
size_t max_size_indexing;
bool finished_building;
map<float, vector<double> > precomputed_proba_with_system,
precomputed_proba_without_system;
ProbaPrecomputer proba;
public:
virtual ~IKmerStore();
......@@ -377,61 +381,20 @@ int IKmerStore<T>::atMostMaxSizeIndexing(int n) const {
return max_size_indexing ;
}
template<class T>
void IKmerStore<T>::precompute_proba(float index_load) {
precomputed_proba_with_system[index_load] = vector<double>(MAX_PRECOMPUTED_PROBA);
precomputed_proba_without_system[index_load] = vector<double>(MAX_PRECOMPUTED_PROBA);
vector<double> &pproba_with = precomputed_proba_with_system[index_load];
vector<double> &pproba_without = precomputed_proba_without_system[index_load];
pproba_with[0] = 1;
pproba_without[0] = 1;
for (int i = 1; i < MAX_PRECOMPUTED_PROBA; i++) {
pproba_with[i] = pproba_with[i - 1] * index_load;
pproba_without[i] = pproba_without[i - 1] * (1 - index_load);
}
}
template<class T>
double IKmerStore<T>::getProbabilityAtLeastOrAbove(const T kmer, int at_least, int length) {
if (at_least == 0) return 1.0; // even if 'length' is very small
// n: number of kmers in the sequence
int n = length - getS() + 1;
float index_load = getIndexLoad(kmer) ;
if (! precomputed_proba_without_system.count(index_load)) {
precompute_proba(index_load);
}
double proba = 0;
double probability_not_having_system;
double probability_having_system;
if (precomputed_proba_with_system.at(index_load).size() > (size_t)at_least)
probability_having_system = precomputed_proba_with_system.at(index_load)[at_least];
else
probability_having_system = pow(index_load, at_least);
if (precomputed_proba_without_system.at(index_load).size() > (size_t)n - at_least)
probability_not_having_system = precomputed_proba_without_system.at(index_load)[n-at_least];
else
probability_not_having_system = pow(1 - index_load, n - at_least);
for (int i=at_least; i<=n; i++) {
proba += nChoosek(n, i) * probability_having_system * probability_not_having_system;
probability_having_system *= index_load;
probability_not_having_system /= (1 - index_load);
}
#ifdef DEBUG_KMS_EVALUE
cerr << "e-value:\tindex_load=" << index_load << ",\tat_least=" << at_least << ",\tlength=" << length <<",\tp-value=" << proba << endl;
#endif
return proba;
int kmer_length = kmer.getLength();
if (kmer.isUnknown() || kmer.isAmbiguous() || kmer == AFFECT_NOT_UNKNOWN
|| kmer.getLength() == ~0)
kmer_length = getS();
int n = max(0, length - kmer_length + 1);
at_least = min(at_least, n);
return proba.getProba(index_load, at_least, n);
}
template<class T>
int IKmerStore<T>::getK() const {
return k;
......
#include "proba.h"
#include <math.h>
#include "tools.h"
// ProbaPrecomputer::ProbaPrecomputer() {}
void ProbaPrecomputer::precomputeSystemProba(float index_load) {
precomputed_proba_with_system[index_load] = std::vector<double>(MAX_PRECOMPUTED_PROBA);
precomputed_proba_without_system[index_load] = std::vector<double>(MAX_PRECOMPUTED_PROBA);
std::vector<double> &pproba_with = precomputed_proba_with_system[index_load];
std::vector<double> &pproba_without = precomputed_proba_without_system[index_load];
pproba_with[0] = 1;
pproba_without[0] = 1;
for (int i = 1; i < MAX_PRECOMPUTED_PROBA; i++) {
pproba_with[i] = pproba_with[i - 1] * index_load;
pproba_without[i] = pproba_without[i - 1] * (1 - index_load);
}
precomputed_proba[index_load] = std::vector<std::vector<double> >(MAX_PRECOMPUTED_PROBA);
}
void ProbaPrecomputer::precomputeProba(float index_load, int length) {
if (length < MAX_PRECOMPUTED_PROBA) {
// By definition of MAX_PRECOMPUTED_PROBA they exist
double probability_having_system = precomputed_proba_with_system.at(index_load)[length];
double probability_not_having_system = 1;
precomputed_proba[index_load][length] = std::vector<double>(length+1);
std::vector<double> &precomp_proba = precomputed_proba[index_load][length];
double proba = 0;
precomp_proba[0] = proba;
double Cnk = 1; // nChoosek(length, length);
for (int i=length; i>=1; i--) {
proba += probabilityPreviousIteration(i, length, Cnk, probability_having_system,
probability_not_having_system, index_load);
precomp_proba[i] = proba;
}
}
}
double ProbaPrecomputer::probabilityPreviousIteration(int iteration, int length, double &Cnk, double &proba_with, double &proba_without, float index_load) {
double proba = Cnk * proba_with * proba_without;
proba_with = getProbaWith(index_load, iteration-1); // Otherwise when probability are too low they are stored as 0 and we are f*****
proba_without *= (1 - index_load);
Cnk *= iteration*1. / (length-iteration+1);
return proba;
}
double ProbaPrecomputer::getProbaWith(double p, int nb) {
if (nb < MAX_PRECOMPUTED_PROBA)
return precomputed_proba_with_system.at(p)[nb];
return pow(p, nb);
}
double ProbaPrecomputer::getProba(float index_load, int at_least, int length) {
if (at_least == 0) return 1.0; // even if 'length' is very small
if (! precomputed_proba_without_system.count(index_load)) {
precomputeSystemProba(index_load);
}
if (length < MAX_PRECOMPUTED_PROBA) {
if (! precomputed_proba[index_load][length].size())
precomputeProba(index_load, length);
if (precomputed_proba[index_load][length].size() > 0) {
return precomputed_proba[index_load][length][at_least];
}
}
double proba = 0;
double probability_not_having_system;
double probability_having_system;
probability_having_system = pow(index_load, length);
probability_not_having_system = 1;
double Cnk = 1;
for (int i=length; i >= at_least; i--) {
proba += probabilityPreviousIteration(i, length, Cnk, probability_having_system,
probability_not_having_system, index_load);
}
#ifdef DEBUG_KMS_EVALUE
cerr << "e-value:\tindex_load=" << index_load << ",\tat_least=" << at_least << ",\tlength=" << length <<",\tp-value=" << proba << endl;
#endif
return proba;
}
#ifndef PROBA_H
#define PROBA_H
#include <map>
#include <vector>
#define MAX_PRECOMPUTED_PROBA 500 /* Precompute 500 probabilities for each index load */
/**
* Store probabilities for sequences whose length is < MAX_PRECOMPUTED_PROBA
*/
class ProbaPrecomputer {
private:
// Store for each index load, the probability that have been computed for
// each possiible at_least and length values. To be used as
// precomputed_proba[index_load][length][at_least]
std::map<float, std::vector<std::vector<double> > > precomputed_proba;
std::map<float, std::vector<double> > precomputed_proba_with_system, precomputed_proba_without_system;
/**
* Precompute index_load^n and (1-index_load)^n (for n=0...MAX_PRECOMPUTED_PROBA)
*/
void precomputeSystemProba(float index_load);
/**
* Precompute the results of getProba (if length < MAX_PRECOMPUTED_PROBA)
* for every possible values of at_least between 1 and length
*/
void precomputeProba(float index_load, int length);
/**
* Commpute the previous iteration for the probability (and updates the values Cnk, proba_with,
* proba_without.
*/
inline double probabilityPreviousIteration(int iteration, int length, double &Cnk, double &proba_with, double &proba_without, float index_load);
public:
/**
* @return probability that the number of kmers (with index load 'index_load')
* is 'at_least' or more in a sequence of length 'length'
*/
double getProba(float index_load, int at_least, int length);
/**
* @return p^nb (potentially pre-computed)
*/
double getProbaWith(double p, int nb);
friend void testProba1();
};
#endif
#include "tests.h"
#include <core/proba.h>
#include <chrono>
void testProba1() {
ProbaPrecomputer p;
// Very simple tests (and test precomputation)
TAP_TEST_APPROX(p.getProba(0.5, 1, 1), 0.5, 0.01, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_APPROX(p.getProba(0.3, 2, 3), 0.216, 0.001, TEST_PROBA_PRECOMPUTER, ""); // See https://www.wolframalpha.com/input/?i=prob+x%3E%3D2+for+x+binomial+with+n%3D3+and+p%3D0.3
// Test precomputation
TAP_TEST_EQUAL(p.precomputed_proba[0.5].size(), MAX_PRECOMPUTED_PROBA, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_EQUAL(p.precomputed_proba_with_system[0.5].size(), MAX_PRECOMPUTED_PROBA, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_EQUAL(p.precomputed_proba_without_system[0.5].size(), MAX_PRECOMPUTED_PROBA, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_EQUAL(p.precomputed_proba[0.3].size(), MAX_PRECOMPUTED_PROBA, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_EQUAL(p.precomputed_proba_with_system[0.3].size(), MAX_PRECOMPUTED_PROBA, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_EQUAL(p.precomputed_proba_without_system[0.3].size(), MAX_PRECOMPUTED_PROBA, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_EQUAL(p.precomputed_proba.count(0.4), 0, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_EQUAL(p.precomputed_proba_with_system.count(0.4), 0, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_EQUAL(p.precomputed_proba_without_system.count(0.4), 0, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_EQUAL(p.precomputed_proba[0.5][1].size(), 2, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_EQUAL(p.precomputed_proba[0.5][100].size(), 0, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_APPROX(p.getProba(0.5, 1, 100), 1-pow(0.5,100), 1e-15, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_EQUAL(p.precomputed_proba[0.5][100].size(), 101, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_EQUAL(p.precomputed_proba.count(0.4), 0, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_APPROX(p.getProba(0.4, 99, 100), 2.43e-38, 1e-38, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_EQUAL(p.precomputed_proba[0.4][100].size(), 101, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_APPROX(p.getProba(0.4, 99, 100), 2.43e-38, 1e-38, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_APPROX(p.getProba(0.4, 90, 100), 1.730e-25, 1e-28, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_APPROX(p.getProba(0.4, 70, 100), 1.25e-9, 1e-11, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_APPROX(p.getProba(0.4, 50, 100), 0.027, 1e-3, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_APPROX(p.getProba(0.4, 30, 100), 0.985, 1e-3, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_APPROX(p.getProba(0.4, 10, 100), 1, 1e-3, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_APPROX(p.getProba(0.4, 1, 100), 1, 1e-3, TEST_PROBA_PRECOMPUTER, "");
// Test not pre-computed (too large for MAX_PRECOMPUTED_PROBA)
TAP_TEST(MAX_PRECOMPUTED_PROBA < 1000, TEST_PROBA_PRECOMPUTER, "If I'm failing change me AND the following test");
TAP_TEST_APPROX(p.getProba(0.4, 412, 1000), 0.229, 1e-3, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_APPROX(p.getProba(0.4, 912, 1000), 4e-255, 1e-100, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_APPROX(p.getProba(0.4, 800, 1000), 1.5e-147, 1e-148, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_APPROX(p.getProba(0.4, 600, 1000), 2.8e-37, 1e-38, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_APPROX(p.getProba(0.4, 370, 1000), .976, 0.001, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_APPROX(p.getProba(0.4, 300, 1000), 1, 1e-3, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_APPROX(p.getProba(0.4, 100, 1000), 1, 1e-3, TEST_PROBA_PRECOMPUTER, "");
TAP_TEST_APPROX(p.getProba(0.4, 10, 1000), 1, 1e-3, TEST_PROBA_PRECOMPUTER, "");
}
void testProba() {
testProba1();
}
......@@ -433,12 +433,14 @@ void testProbability(IndexTypes index) {
KmerAffectAnalyser *kaa = kseg.getKmerAffectAnalyser();
TAP_TEST_EQUAL(kaa->getProbabilityAtLeastOrAbove(AFFECT_NOT_UNKNOWN, 3), 0, TEST_PROBABILITY_SEGMENTATION, "");
// We have only 2 k-mers, thus we can't have 3 "not unknowns". Thus, it is the probability of having 2 among 2 (with p=.75), same as the test below
TAP_TEST_EQUAL(kaa->getProbabilityAtLeastOrAbove(AFFECT_NOT_UNKNOWN, 3), .75 * .75 , TEST_PROBABILITY_SEGMENTATION, "");
TAP_TEST_EQUAL(kaa->getProbabilityAtLeastOrAbove(AFFECT_NOT_UNKNOWN, 2), .75 * .75, TEST_PROBABILITY_SEGMENTATION, "");
TAP_TEST(kaa->getProbabilityAtLeastOrAbove(AFFECT_NOT_UNKNOWN, 1) == .75 * 2 * .25 + kaa->getProbabilityAtLeastOrAbove(AFFECT_NOT_UNKNOWN, 2), TEST_PROBABILITY_SEGMENTATION, "");
TAP_TEST_EQUAL(kaa->getProbabilityAtLeastOrAbove(AFFECT_NOT_UNKNOWN, 0), 1, TEST_PROBABILITY_SEGMENTATION, "");
TAP_TEST_EQUAL(kaa->getProbabilityAtLeastOrAbove(AFFECT_UNKNOWN, 3), 0, TEST_PROBABILITY_SEGMENTATION, ".getProbabilityAtLeastOrAbove() with AFFECT_UNKOWN");
// Same: we only have 2
TAP_TEST_EQUAL(kaa->getProbabilityAtLeastOrAbove(AFFECT_UNKNOWN, 2), .25 * .25, TEST_PROBABILITY_SEGMENTATION, ".getProbabilityAtLeastOrAbove() with AFFECT_UNKOWN");
TAP_TEST_EQUAL(kaa->getProbabilityAtLeastOrAbove(AFFECT_UNKNOWN, 2), .25 * .25, TEST_PROBABILITY_SEGMENTATION, ".getProbabilityAtLeastOrAbove() with AFFECT_UNKOWN");
TAP_TEST_EQUAL(kaa->getProbabilityAtLeastOrAbove(AFFECT_UNKNOWN, 0), 1, TEST_PROBABILITY_SEGMENTATION, ".getProbabilityAtLeastOrAbove() with AFFECT_UNKOWN");
}
......
......@@ -18,6 +18,7 @@
#include "testReadStorage.cpp"
#include "testAutomaton.cpp"
#include "testMath.cpp"
#include "testProbability.cpp"
int main(void) {
TAP_START(NB_TESTS);
......@@ -40,6 +41,6 @@ int main(void) {
testReadStorage();
testAutomaton();
testMath();
testProba();
TAP_END_TEST_EXIT
}
......@@ -197,6 +197,9 @@ enum {
/* Bugs */
TEST_BUG2224,
// ProbaPrecomputer
TEST_PROBA_PRECOMPUTER,
NB_TESTS
};
......@@ -367,6 +370,7 @@ inline void declare_tests() {
RECORD_TAP_TEST(TEST_FINE_SEGMENT_OVERLAP, "Test fine segmentation with an overlap");
RECORD_TAP_TEST(TEST_SEGMENT_REVCOMP, "Test segmentation on a sequence and its revcomp");
RECORD_TAP_TEST(TEST_BUG2224, "Test issue #2224 (seed longer than sequence)");
RECORD_TAP_TEST(TEST_PROBA_PRECOMPUTER, "Test ProbaPrecomputer::getProba");
}
TAP_DECLARATIONS
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment