Commit 1973f7e8 authored by Mathieu Giraud's avatar Mathieu Giraud

Merge branch 'feature-a/some-optim' into 'dev'

Optimizations, including  e-value computations, max12

See merge request !468
parents 2ddbea56 eb8a268e
Pipeline #75869 failed with stages
in 2 minutes and 31 seconds
#include "affectanalyser.h"
#include <algorithm>
#include <unordered_map>
bool operator==(const affect_infos &ai1, const affect_infos &ai2) {
return ai1.first_pos_max == ai2.first_pos_max
......@@ -312,6 +314,33 @@ int KmerAffectAnalyser::last(const KmerAffect &affect) const{
}
pair <KmerAffect, KmerAffect> KmerAffectAnalyser::max12(const set<KmerAffect> forbidden) const {
pair<KmerAffect, int> max_counts[2] = {make_pair(KmerAffect::getUnknown(), -1),
make_pair(KmerAffect::getUnknown(), -1)};
std::unordered_map<KmerAffect, int> counts;
for (KmerAffect affect: affectations) {
if (forbidden.count(affect) == 0) {
if (counts.count(affect) > 0)
counts[affect]++;
else
counts[affect] = 1;
}
}
for (auto it: counts) {
if (it.second > max_counts[1].second) {
if (it.second > max_counts[0].second) {
max_counts[1] = max_counts[0];
max_counts[0] = it;
} else {
max_counts[1] = it;
}
}
}
return make_pair(max_counts[0].first, max_counts[1].first);
}
string KmerAffectAnalyser::toString() const{
string kmer;
for (size_t i = 0; i < affectations.size(); i++) {
......@@ -349,12 +378,8 @@ CountKmerAffectAnalyser::CountKmerAffectAnalyser(IKmerStore<KmerAffect> &kms, co
CountKmerAffectAnalyser::~CountKmerAffectAnalyser() {
set<KmerAffect> affects = this->getDistinctAffectations();
/* Initialize each key with a 0-integer array */
for (set<KmerAffect>::iterator it = affects.begin();
it != affects.end(); it++) {
delete [] counts[*it];
for (auto it : counts) {
delete [] it.second;
}
}
......@@ -390,38 +415,6 @@ KmerAffect CountKmerAffectAnalyser::max(const set<KmerAffect> forbidden) const {
return max_affect;
}
pair <KmerAffect, KmerAffect> CountKmerAffectAnalyser::max12(const set<KmerAffect> forbidden) const {
map<KmerAffect, int* >::const_iterator it = counts.begin();
KmerAffect max1_affect = KmerAffect::getUnknown();
KmerAffect max2_affect = KmerAffect::getUnknown();
int max1_count = -1;
int max2_count = -1;
for (; it != counts.end(); it++) {
if (forbidden.count(it->first) == 0) {
int current_count = count(it->first);
if (current_count > max1_count)
{
max2_affect = max1_affect ;
max2_count = max1_count ;
max1_affect = it->first ;
max1_count = current_count ;
}
else if (current_count > max2_count)
{
max2_affect = it->first;
max2_count = current_count;
}
}
}
return make_pair(max1_affect, max2_affect);
}
int CountKmerAffectAnalyser::countBefore(const KmerAffect&affect, int pos) const {
if (pos == 0 || counts.count(affect) == 0)
return 0;
......
......@@ -121,6 +121,14 @@ class AffectAnalyser {
*/
virtual int last(const KmerAffect &affect) const = 0;
/*
* @return the two affectations that are seen the most frequently in the sequence
* taken apart the forbidden ones.
* @complexity n + m log m where n is the input sequence length and m the number
* of affectations
*/
virtual pair <KmerAffect, KmerAffect> max12(const set<KmerAffect> forbidden) const = 0;
/**
* @return a string representation of the object
*/
......@@ -222,6 +230,8 @@ class KmerAffectAnalyser: public AffectAnalyser {
int last(const KmerAffect &affect) const ;
pair <KmerAffect, KmerAffect> max12(const set<KmerAffect> forbidden) const;
string toString() const;
string toStringValues() const;
......@@ -302,12 +312,6 @@ class CountKmerAffectAnalyser: public KmerAffectAnalyser {
*/
KmerAffect max(const set<KmerAffect> forbidden = set<KmerAffect>()) const;
/*
* @return the two affectations that are seen the most frequently in the sequence
* taken apart the forbidden ones.
*/
pair <KmerAffect, KmerAffect> max12(const set<KmerAffect> forbidden) const;
/**
* Set the overlap allowed between two k-mers with two different affectations,
* when looking for the maximum.
......
......@@ -24,6 +24,7 @@ class AbstractACAutomaton: public IKmerStore<Info> {
protected:
void *initialState;
float all_index_load;
map<Info, size_t> kmers_inserted;
public:
AbstractACAutomaton();
......
......@@ -15,16 +15,16 @@ void AbstractACAutomaton<Info>::finish_building() {
IKmerStore<Info>::finish_building();
build_failure_functions();
}
all_index_load = 0;
for(auto iter: kmers_inserted) {
all_index_load += getIndexLoad(iter.first);
}
}
template<class Info>
float AbstractACAutomaton<Info>::getIndexLoad(Info kmer) const {
float load = 0;
if (kmers_inserted.count(kmer) == 0) {
for(auto iter: kmers_inserted) {
load += getIndexLoad(iter.first);
}
return (kmer.isUnknown()) ? 1 - load : load;
return (kmer.isUnknown()) ? 1 - all_index_load : all_index_load;
} else {
return kmers_inserted.at(kmer) / pow(4.0, kmer.getLength());
}
......
......@@ -126,6 +126,7 @@ BioReader::BioReader(bool virtualfasta, string name)
init(0, "");
this -> name = name;
basename = extract_basename(name);
filenames.push_back(this->name);
}
BioReader::BioReader(int extract_field, string extract_separator, int mark_pos)
......@@ -157,6 +158,7 @@ void BioReader::add(const string &filename, bool verbose) {
name += filename;
basename += extract_basename(filename);
filenames.push_back(name);
if (verbose)
cout << " <== " << filename ;
......
......@@ -189,6 +189,7 @@ public:
string name;
string basename;
list<string> filenames;
int size() const;
size_t totalSize() const;
......
......@@ -174,6 +174,17 @@ bool operator>=(const KmerAffect &a1, const KmerAffect &a2);
bool operator!=(const KmerAffect &a1, const KmerAffect &a2);
ostream &operator<<(ostream &os, const KmerAffect &kmer);
namespace std {
template <>
struct hash<KmerAffect> {
size_t operator()(const KmerAffect &affect) const {
return (affect.getLabel()[0] << 8) | affect.getLength();
}
};
}
#define AFFECT_NOT_UNKNOWN_SYMBOL "*"
#define AFFECT_AMBIGUOUS_SYMBOL "\0"
#define AFFECT_UNKNOWN_SYMBOL "\1"
......
......@@ -15,6 +15,7 @@ using namespace std;
typedef
enum { KMER_INDEX, AC_AUTOMATON } IndexTypes;
#define MAX_PRECOMPUTED_PROBA 500 /* Precompute 500 probabilities for each index load */
class Kmer {
public:
unsigned int count;
......@@ -74,6 +75,8 @@ protected:
size_t nb_kmers_inserted;
size_t max_size_indexing;
bool finished_building;
map<float, vector<double> > precomputed_proba_with_system,
precomputed_proba_without_system;
public:
......@@ -149,7 +152,7 @@ public:
/**
* @return probability that the number of kmers is 'at_least' or more in a sequence of length 'length'
*/
double getProbabilityAtLeastOrAbove(T kmer, int at_least, int length) const;
double getProbabilityAtLeastOrAbove(T kmer, int at_least, int length);
/**
* @return the value of k
......@@ -199,6 +202,10 @@ public:
* @pre word.length() == this->k
*/
virtual T& operator[](seqtype& word) = 0;
protected:
void precompute_proba(float index_load);
};
template<class T>
......@@ -366,17 +373,45 @@ int IKmerStore<T>::atMostMaxSizeIndexing(int n) const {
}
template<class T>
double IKmerStore<T>::getProbabilityAtLeastOrAbove(const T kmer, int at_least, int length) const {
void IKmerStore<T>::precompute_proba(float index_load) {
precomputed_proba_with_system[index_load] = vector<double>(MAX_PRECOMPUTED_PROBA);
precomputed_proba_without_system[index_load] = vector<double>(MAX_PRECOMPUTED_PROBA);
vector<double> &pproba_with = precomputed_proba_with_system[index_load];
vector<double> &pproba_without = precomputed_proba_without_system[index_load];
pproba_with[0] = 1;
pproba_without[0] = 1;
for (int i = 1; i < MAX_PRECOMPUTED_PROBA; i++) {
pproba_with[i] = pproba_with[i - 1] * index_load;
pproba_without[i] = pproba_without[i - 1] * (1 - index_load);
}
}
template<class T>
double IKmerStore<T>::getProbabilityAtLeastOrAbove(const T kmer, int at_least, int length) {
if (at_least == 0) return 1.0; // even if 'length' is very small
// n: number of kmers in the sequence
int n = length - getS() + 1;
float index_load = getIndexLoad(kmer) ;
if (! precomputed_proba_without_system.count(index_load)) {
precompute_proba(index_load);
}
double proba = 0;
double probability_having_system = pow(index_load, at_least);
double probability_not_having_system = pow(1 - index_load, n - at_least);
double probability_not_having_system;
double probability_having_system;
if (precomputed_proba_with_system.at(index_load).size() > (size_t)at_least)
probability_having_system = precomputed_proba_with_system.at(index_load)[at_least];
else
probability_having_system = pow(index_load, at_least);
if (precomputed_proba_without_system.at(index_load).size() > (size_t)n - at_least)
probability_not_having_system = precomputed_proba_without_system.at(index_load)[n-at_least];
else
probability_not_having_system = pow(1 - index_load, n - at_least);
for (int i=at_least; i<=n; i++) {
proba += nChoosek(n, i) * probability_having_system * probability_not_having_system;
probability_having_system *= index_load;
......
......@@ -442,7 +442,6 @@ KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline, double threshold,
info_extra = "seed";
segmented = false;
segmented_germline = germline ;
system = germline->code; // useful ?
reversed = false;
because = NOT_PROCESSED ; // Cause of unsegmentation
score = 0 ;
......@@ -524,7 +523,7 @@ KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline, double threshold,
|| (germline->seg_method == SEG_METHOD_MAX1U))
{ // Pseudo-germline, MAX12 and MAX1U
pair <KmerAffect, KmerAffect> max12 ;
CountKmerAffectAnalyser ckaa(*(germline->index), sequence);
KmerAffectAnalyser ckaa = *kaa;
set<KmerAffect> forbidden;
......
......@@ -82,7 +82,7 @@ WindowsStorage *WindowExtractor::extract(OnlineBioReader *reads,
stats[TOTAL_SEG_AND_WINDOW].insert(read_length) ;
if (seg->isJunctionChanged())
stats[SEG_CHANGED_WINDOW].insert(read_length);
stats_reads[seg->system].addScore(read_length);
stats_reads[seg->segmented_germline->code].addScore(read_length);
if (out_segmented) {
*out_segmented << *seg ; // KmerSegmenter output (V/N/J)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment