Commit 59ce922d authored by Mathieu Giraud

Merge branch 'feature-a/3259-output-filtered-bioreader-informations' into 'dev'

Feature a/3259 output filtered bioreader informations

Closes #3259

See merge request !231
parents 7cf92edc 84b59d7a
Pipeline #31676 passed with stages
in 6 minutes and 31 seconds
#include "filter.h"
FilterWithACAutomaton::FilterWithACAutomaton(BioReader &origin, string seed){
buildACAutomatonToFilterBioReader(origin, seed);
/*
  Create a filter working on the BioReader `origin`.
  originalBioReader is bound to `origin` (it is a reference member, so `origin`
  has to stay alive as long as the filter is used), both statistics counters
  start at zero, and the automaton is then built with the given seed.
*/
FilterWithACAutomaton::FilterWithACAutomaton(BioReader &origin, string seed)
  : originalBioReader(origin),
    filtered_sequences_calls(0),
    filtered_sequences_nb(0)
{
  buildACAutomatonToFilterBioReader(seed);
}
FilterWithACAutomaton::~FilterWithACAutomaton(){
......@@ -13,14 +15,13 @@ FilterWithACAutomaton::~FilterWithACAutomaton(){
}
}
void FilterWithACAutomaton::buildACAutomatonToFilterBioReader
(BioReader &origin, string seed){
void FilterWithACAutomaton::buildACAutomatonToFilterBioReader(string seed){
char asciiChar;
int asciiNumber;
string currentLabel;
string previousLabel;
if(origin.size() < 1){
if(originalBioReader.size() < 1){
automaton = nullptr;
indexes = nullptr;
return;
......@@ -28,11 +29,11 @@ void FilterWithACAutomaton::buildACAutomatonToFilterBioReader
automaton = new PointerACAutomaton<KmerAffect>(seed, false, true);
indexes = new vector<int>();
asciiNumber = SPECIFIC_KMERS_NUMBER;
automaton->insert(origin.sequence(0),std::string("") + char(asciiNumber), true, 0, seed);
automaton->insert(originalBioReader.sequence(0),std::string("") + char(asciiNumber), true, 0, seed);
indexes->push_back(0);
previousLabel = extractGeneName(origin.label(0));
for(int i = 1;i < origin.size(); ++i){
currentLabel = extractGeneName(origin.label(i));
previousLabel = extractGeneName(originalBioReader.label(0));
for(int i = 1;i < originalBioReader.size(); ++i){
currentLabel = extractGeneName(originalBioReader.label(i));
if(currentLabel != previousLabel){
indexes->push_back(i);
asciiNumber++;
......@@ -44,10 +45,10 @@ void FilterWithACAutomaton::buildACAutomatonToFilterBioReader
return;
}
asciiChar = char(asciiNumber);
automaton->insert(origin.sequence(i),std::string("") + asciiChar, true, 0, seed);
automaton->insert(originalBioReader.sequence(i),std::string("") + asciiChar, true, 0, seed);
previousLabel = currentLabel;
}
indexes->push_back(origin.size());
indexes->push_back(originalBioReader.size());
automaton->build_failure_functions();
}
......@@ -56,13 +57,14 @@ void FilterWithACAutomaton::buildACAutomatonToFilterBioReader
based on it.
*/
BioReader FilterWithACAutomaton::filterBioReaderWithACAutomaton(
BioReader &origin, seqtype &seq,
int kmer_threshold){
seqtype &seq, int kmer_threshold){
BioReader result;
map<KmerAffect, int> mapAho;
this->filtered_sequences_calls += 1;
if(!automaton || !indexes || kmer_threshold < 0){
return origin;
this->filtered_sequences_nb += originalBioReader.size();
return originalBioReader;
}
mapAho = automaton->getMultiResults(seq);
......@@ -70,7 +72,7 @@ BioReader FilterWithACAutomaton::filterBioReaderWithACAutomaton(
if(kmer_threshold == ALL_KMERS_VALUE || kmer_threshold > (int)mapAho.size()){
for(auto const mx: mapAho){
if(mx.first.isGeneric()){
transferBioReaderSequences(origin, result, mx.first);
transferBioReaderSequences(originalBioReader, result, mx.first);
}
}
/* The most significant k-mers selected : iterate over a portion of the
......@@ -97,11 +99,12 @@ BioReader FilterWithACAutomaton::filterBioReaderWithACAutomaton(
}else{
break;
}
transferBioReaderSequences(origin, result, element.first);
transferBioReaderSequences(originalBioReader, result, element.first);
previousOccurences = element.second;
}
}
return (result.size() == 0) ? origin : result;
this->filtered_sequences_nb += (result.size () == 0) ? originalBioReader.size() : result.size();
return (result.size() == 0) ? originalBioReader : result;
}
void FilterWithACAutomaton::transferBioReaderSequences(const BioReader &src, BioReader &dst, KmerAffect k) const{
......@@ -123,3 +126,18 @@ vector<int>* FilterWithACAutomaton::getIndexes() const{
/*
  Return the automaton pointer stored in this filter.
  The pointer is nullptr when buildACAutomatonToFilterBioReader was run on an
  empty BioReader.
*/
AbstractACAutomaton<KmerAffect>* FilterWithACAutomaton::getAutomaton() const{
  return automaton;
}
/*
  Print the filtering statistics of `obj` on one line, followed by a newline:

      <kept>/<total> <rate>%

  - <kept>  is obj.filtered_sequences_nb, the number of sequences returned by
            the filter, accumulated over all calls;
  - <total> is obj.filtered_sequences_calls multiplied by the size of
            obj.originalBioReader, i.e. the number of sequences that would
            have been returned without any filtering;
  - <rate>  is <kept>/<total> as a percentage, printed with one decimal.

  When <total> is zero (no call yet, or empty BioReader) the rate is printed
  as 0.0 instead of the NaN a 0/0 division would give.
  The formatting flags and precision of `out` are restored before returning,
  so `fixed` and `setprecision(1)` do not leak into the caller's later output.
*/
ostream &operator<<(ostream &out, const FilterWithACAutomaton& obj){
  const long long origin_bioreader_size = obj.originalBioReader.size();
  const long long total_sequences_filtered = obj.filtered_sequences_nb;
  const long long total_filtered_calls = obj.filtered_sequences_calls;

  // 64-bit product: number of calls times BioReader size can exceed the range of an int.
  const long long total_sequences_origin = total_filtered_calls * origin_bioreader_size;

  // Guard the division: with a zero denominator the rate is reported as 0.
  float aligned_rate = 0;
  if (total_sequences_origin != 0) {
    aligned_rate = ((float)total_sequences_filtered / (float)total_sequences_origin) * 100;
  }

  // fixed and setprecision are sticky on the stream: remember the caller's settings.
  const std::ios_base::fmtflags saved_flags = out.flags();
  const std::streamsize saved_precision = out.precision();

  out << fixed << setw(8) << total_sequences_filtered << "/"
      << fixed << setw(8) << total_sequences_origin << " "
      << fixed << setprecision(1) << setw(6) << aligned_rate << "%"
      << endl ;

  out.flags(saved_flags);
  out.precision(saved_precision);

  return out ;
}
......@@ -4,12 +4,22 @@
#include "automaton.hpp"
class FilterWithACAutomaton {
private:
vector<int>* indexes;
AbstractACAutomaton<KmerAffect>* automaton;
public:
BioReader &originalBioReader;
/* The number of times filterBioReaderWithACAutomaton is called. */
int filtered_sequences_calls;
/* The total number of sequences returned by filterBioReaderWithACAutomaton, summed over all its calls. */
int filtered_sequences_nb;
FilterWithACAutomaton(BioReader &origin, string seed);
~FilterWithACAutomaton();
/*
......@@ -31,8 +41,7 @@ class FilterWithACAutomaton {
significant K-mers returned by getMultiResults.
*/
BioReader filterBioReaderWithACAutomaton(
BioReader &origin, seqtype &seq,
int kmer_threshold = NO_LIMIT_VALUE);
seqtype &seq, int kmer_threshold = NO_LIMIT_VALUE);
/*
This function takes a BioReader as a parameter and returns
a couple containing an int vector pointer and an automaton
......@@ -81,7 +90,7 @@ class FilterWithACAutomaton {
The param "seed" is used while inserting sequences in the automaton. By default
the seed has a size of 10.
*/
void buildACAutomatonToFilterBioReader(BioReader &origin, string seed);
void buildACAutomatonToFilterBioReader(string seed);
/**
* Return the vector of indexes used while building the automaton.
......@@ -103,5 +112,7 @@ class FilterWithACAutomaton {
* about how the label is used, see buildACAutomatonToFilterBioReader's doc.
*/
void transferBioReaderSequences(const BioReader &src, BioReader &dst, const KmerAffect k) const;
friend ostream &operator<<(ostream&, const FilterWithACAutomaton&);
};
#endif
......@@ -1051,7 +1051,7 @@ FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c,
/* Regular 53 Segmentation */
if(kmer_threshold != NO_LIMIT_VALUE){
FilterWithACAutomaton* f = germline->getFilter_5();
BioReader filtered = f->filterBioReaderWithACAutomaton(germline->rep_5, sequence_or_rc, kmer_threshold);
BioReader filtered = f->filterBioReaderWithACAutomaton(sequence_or_rc, kmer_threshold);
align_against_collection(sequence_or_rc, filtered, NO_FORBIDDEN_ID, reverse_V, reverse_V, false,
box_V, segment_cost);
}else{
......
......@@ -288,9 +288,9 @@ void testFilterBioReaderWithACAutomaton(){
a2 = f2->getAutomaton();
a3 = f3->getAutomaton();
filteredBioReader1 = f1->filterBioReaderWithACAutomaton(testedBioReader1, sequence1);
filteredBioReader2 = f2->filterBioReaderWithACAutomaton(testedBioReader2, sequence2);
filteredBioReader3 = f3->filterBioReaderWithACAutomaton(testedBioReader3, sequence3);
filteredBioReader1 = f1->filterBioReaderWithACAutomaton(sequence1);
filteredBioReader2 = f2->filterBioReaderWithACAutomaton(sequence2);
filteredBioReader3 = f3->filterBioReaderWithACAutomaton(sequence3);
//check filteredBioReader size
TAP_TEST(filteredBioReader1.size() <= testedBioReader1.size(),
......@@ -353,7 +353,7 @@ void testGetNSignicativeKmers(){
for(int i = 0; i < seqV.size(); ++i){
Sequence seq = seqV.read(i);
FilterWithACAutomaton *f = new FilterWithACAutomaton(seqV, "########");
filtered = f->filterBioReaderWithACAutomaton(seqV, seq.sequence, 1);
filtered = f->filterBioReaderWithACAutomaton(seq.sequence, 1);
delete f;
int j = 0;
while(j < filtered.size()){
......@@ -406,7 +406,7 @@ void testExAequoKmersWhenSignificantParameter(){
/* Filter using the 2 most significant K-mers, the first one is belonging to
sequence n°11 (with more than 60 occurences) and second one is sequence n°5
and n°10 appearing 29 times both. */
filtered = f->filterBioReaderWithACAutomaton(testedBioReader, seq, 2);
filtered = f->filterBioReaderWithACAutomaton(seq, 2);
/* Check that filtered BioReader contains sequence n°5 and sequence n°10 which are ex-aequo. */
int i = 0;
while(i < filtered.size() && extractGeneName(filtered.label(i)) != extractGeneName(testedBioReader.label(5))){
......@@ -431,7 +431,7 @@ void testExAequoKmersWhenSignificantParameter(){
seq += "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC";
delete f;
f = new FilterWithACAutomaton(testedBioReader, "####");
filtered = f->filterBioReaderWithACAutomaton(testedBioReader, seq, 2);
filtered = f->filterBioReaderWithACAutomaton(seq, 2);
k = 0;
while(k < filtered.size() && extractGeneName(filtered.label(k)) != extractGeneName(testedBioReader.label(12))){
++k;
......
......@@ -1622,7 +1622,19 @@ int main (int argc, char **argv)
cerr << "Ooops... unknown command. I don't know what to do apart from exiting!" << endl;
return 1;
}
//$ Output statistics on filter()
if(kmer_threshold != NO_LIMIT_VALUE){
cout << "Statistics on clone analysis (-Z):" << endl;
for(list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it){
FilterWithACAutomaton *f = (*it)->getFilter_5();
if (f)
if (f->filtered_sequences_nb)
cout << "\t" << (*it)->code << "\t" << *f;
}
cout << endl;
}
//$ Output json
cout << " ==> " << f_json << "\t(data file for the web application)" << endl ;
ofstream out_json(f_json.c_str()) ;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment