Commit 30571c97 authored by Mikaël Salson's avatar Mikaël Salson

algo/core/filter: Don't have a hard limit on number of sequences.

Rather we use an uncertainty range (see #3225 for why and how).

It is necessary to get the length of the longest transferred sequence as it
will be used as a reference to compute the empirical probablity that a k-mer
has matched in the longest possible sequence. Therefore we make the assumption
that k-mers matched in the longest sequence but we have no way of knowing as
we don't keep track of where the k-mers matched.

Fixes #3225
parent 56996345
#include "filter.h"
#include "math.hpp"
FilterWithACAutomaton::FilterWithACAutomaton(BioReader &origin, string seed) : originalBioReader(origin){
this->filtered_sequences_nb = 0;
......@@ -106,19 +107,31 @@ BioReader FilterWithACAutomaton::filterBioReaderWithACAutomaton(
set<pair<KmerAffect, int>, Comparator> setOfWords(mapAho.begin(), mapAho.end(), compFunctor);
// Iterate over the pair and not the map
int nbKmers = 0, previousOccurences = 0;
int nb_kmers_limit = -1; // Limit number of kmers, defined when the last gene of interest is reached
for(pair<KmerAffect, int> element : setOfWords){
// Add corresponding sequences to the BioReader
if(!element.first.isGeneric()){
continue;
}
if(nbKmers == kmer_threshold && previousOccurences == element.second){
//Keep the same amount of genes
if(nbKmers == kmer_threshold && nb_kmers_limit == element.second){
// We have reached our limit of number of genes recovered but we
// continue taking sequences are they have a similar number of
// matching k-mers.
}else if(nbKmers < kmer_threshold){
nbKmers++;
}else{
break;
}
transferBioReaderSequences(originalBioReader, result, element.first);
if (nbKmers == kmer_threshold && nb_kmers_limit == -1) {
int maxlen = getSizeLongestTransferredSequence(result, element.first);
nb_kmers_limit = compute_nb_kmers_limit(element.first.getLength(), element.second, maxlen);
if (nb_kmers_limit == 0) {
this->filtered_sequences_nb += originalBioReader.size();
return originalBioReader;
}
}
previousOccurences = element.second;
}
}
......@@ -138,6 +151,22 @@ void FilterWithACAutomaton::transferBioReaderSequences(const BioReader &src, Bio
}
}
int FilterWithACAutomaton::getSizeLongestTransferredSequence(const BioReader &reader, KmerAffect k) const{
char asciiChar = k.getLabel().at(0);
unsigned int asciiNum = int(asciiChar);
if(asciiNum > indexes->size() || !k.isGeneric()){
throw invalid_argument("Incorrect K-mer transmitted.");
}
size_t longest = 0;
for(int i = 0; i < indexes->at(asciiNum - SPECIFIC_KMERS_NUMBER + 1) - indexes->at(asciiNum - SPECIFIC_KMERS_NUMBER); ++i){
if (longest < reader.sequence(reader.size() - i - 1).length())
longest = reader.sequence(reader.size() - i - 1).length();
}
return longest;
}
vector<int>* FilterWithACAutomaton::getIndexes() const{
return this->indexes;
}
......
......@@ -114,5 +114,12 @@ class FilterWithACAutomaton {
void transferBioReaderSequences(const BioReader &src, BioReader &dst, const KmerAffect k) const;
friend ostream &operator<<(ostream&, const FilterWithACAutomaton&);
private:
/**
* Get the size of the longest sequence among the sequences that were just
* transferred to the BioReader reader.
*/
int getSizeLongestTransferredSequence(const BioReader &reader, KmerAffect k) const;
};
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment