diff --git a/algo/core/filter.cpp b/algo/core/filter.cpp index 41227348fc0d31133de528bce6991bbf08485531..3ecf76ff46fde07f6c5a0743ce91a8f1d56a8d9d 100644 --- a/algo/core/filter.cpp +++ b/algo/core/filter.cpp @@ -90,19 +90,15 @@ BioReader filterBioReaderWithACAutomaton( set, Comparator> setOfWords(mapAho.begin(), mapAho.end(), compFunctor); set, Comparator>::iterator setIt = setOfWords.begin(); // Iterate over the pair and not the map - int nbKmers = 0; + int nbKmers = 0, previousOccurences = 0; for(pair element : setOfWords){ // Add corresponding sequences to the BioReader - if(nbKmers < kmer_threshold){ - nbKmers++; - /* Check if next K-mer has same occurence */ - if(nbKmers == kmer_threshold){ - std::advance(setIt, nbKmers); - pair pNext = *setIt; - int nextKmerOccurs = pNext.second; - if(nextKmerOccurs == element.second){ - nbKmers--; - } + if(nbKmers == kmer_threshold && previousOccurences == element.second){ + //Keep the same amount of genes + }else if(nbKmers < kmer_threshold){ + nbKmers++; + }else{ + break; } tmpKmer = element.first; asciiChar = tmpKmer.getLabel().at(0); @@ -113,11 +109,7 @@ BioReader filterBioReaderWithACAutomaton( for(int i = indexes->at(asciiNum - 1); i < indexes->at(asciiNum); ++i){ result.add(origin.read(i)); } - } - else{ - /* Enough K-mers used for filtering, no need to go further */ - break; - } + previousOccurences = element.second; } } return (result.size() == 0) ? origin : result; diff --git a/algo/tests/unit-tests/testFilter.cpp b/algo/tests/unit-tests/testFilter.cpp index 90645ae315339c2d96eef8fec756c3a11aeb0ce4..fc3ba609e1870bbf2bcb2572727bdc6e171e75b0 100644 --- a/algo/tests/unit-tests/testFilter.cpp +++ b/algo/tests/unit-tests/testFilter.cpp @@ -385,6 +385,7 @@ void testExAequoKmersWhenSignificantParameter(){ seqtype seq; pair*, AbstractACAutomaton*>* p; string BIOREADER_EXAEQUO = "BioReader doesn't have ex-aequo"; + string SIZE_BIOREADER = "BioReader doesn't contain the good amount of sequences"; Sequence sequences[13]; sequences[0] = {"seq1-full_name", "seq-01*01", "AGCTAGCTA","", NULL, 0}; sequences[1] = {"seq1-full_name", "seq-01*02", "AGCTAGCTT", "", NULL, 0}; @@ -418,18 +419,45 @@ void testExAequoKmersWhenSignificantParameter(){ while(j < filtered.size() && extractGeneName(filtered.label(j)) != extractGeneName(testedBioReader.label(10))){ ++j; } + /* Check that filtered BioReader contains sequence n°11 which is the most present in the sequence. */ + int k = 0; + while(k < filtered.size() && extractGeneName(filtered.label(k)) != extractGeneName(testedBioReader.label(11))){ + ++k; + } + + /* Even though the filtered function got 2 as a parameter, since there are two ex-aequo the size is 3 */ + TAP_TEST(filtered.size() == 3, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, SIZE_BIOREADER); TAP_TEST(i < filtered.size(), TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, BIOREADER_EXAEQUO); TAP_TEST(j < filtered.size(), TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, BIOREADER_EXAEQUO); + TAP_TEST(k < filtered.size(), TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, BIOREADER_EXAEQUO); /* Add a third ex-aequo: k-mer belonging to sequence n°12 appearing 29 times */ seq += "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"; delete p->first; delete p->second; delete p; p = buildACAutomatonToFilterBioReader(testedBioReader, "####"); filtered = filterBioReaderWithACAutomaton(p, testedBioReader, seq, 2); - int k = 0; - while(k < filtered.size() && extractGeneName(filtered.label(k)) != extractGeneName(testedBioReader.label(12))){ + /* Check that filtered BioReader contains previous sequences and n°12 */ + i = 0; + while(i < filtered.size() && extractGeneName(filtered.label(i)) != extractGeneName(testedBioReader.label(5))){ + ++i; + } + j = 0; + while(j < filtered.size() && extractGeneName(filtered.label(j)) != extractGeneName(testedBioReader.label(10))){ + ++j; + } + k = 0; + while(k < filtered.size() && extractGeneName(filtered.label(k)) != extractGeneName(testedBioReader.label(11))){ ++k; } + int l = 0; + while(l < filtered.size() && extractGeneName(filtered.label(l)) != extractGeneName(testedBioReader.label(12))){ + ++l; + } + /* Even though the filtered function got 2 as a parameter, since there are three ex-aequo the size is 4 */ + TAP_TEST(filtered.size() == 4, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, SIZE_BIOREADER); + TAP_TEST(i < filtered.size(), TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, BIOREADER_EXAEQUO); + TAP_TEST(j < filtered.size(), TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, BIOREADER_EXAEQUO); TAP_TEST(k < filtered.size(), TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, BIOREADER_EXAEQUO); + TAP_TEST(l < filtered.size(), TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, BIOREADER_EXAEQUO); delete p->first; delete p->second; delete p; }