Une MAJ de sécurité est nécessaire sur notre version actuelle. Elle sera effectuée lundi 02/08 entre 12h30 et 13h. L'interruption de service devrait durer quelques minutes (probablement moins de 5 minutes).

Commit 7cf92edc authored by Mathieu Giraud's avatar Mathieu Giraud
Browse files

Merge branch...

Merge branch 'feature-a/3282-repetitive-filterbioreaderwithacautomaton-parts-to-reusable-function' into 'dev'

Feature a/3282 repetitive filterbioreaderwithacautomaton parts to reusable function

Closes #3282, #3312, and #3299

See merge request !228
parents df17a08c 1719e236
Pipeline #31554 failed with stages
in 6 minutes and 14 seconds
......@@ -27,12 +27,11 @@ void FilterWithACAutomaton::buildACAutomatonToFilterBioReader
}
automaton = new PointerACAutomaton<KmerAffect>(seed, false, true);
indexes = new vector<int>();
automaton->insert(origin.sequence(0),std::string("") + char(1), true, 0, seed);
asciiNumber = 1;
asciiNumber = SPECIFIC_KMERS_NUMBER;
automaton->insert(origin.sequence(0),std::string("") + char(asciiNumber), true, 0, seed);
indexes->push_back(0);
previousLabel = extractGeneName(origin.label(0));
int i;
for(i = 1;i < origin.size(); ++i){
for(int i = 1;i < origin.size(); ++i){
currentLabel = extractGeneName(origin.label(i));
if(currentLabel != previousLabel){
indexes->push_back(i);
......@@ -62,9 +61,6 @@ BioReader FilterWithACAutomaton::filterBioReaderWithACAutomaton(
BioReader result;
map<KmerAffect, int> mapAho;
KmerAffect tmpKmer;
unsigned int asciiNum;
char asciiChar;
if(!automaton || !indexes || kmer_threshold < 0){
return origin;
}
......@@ -73,14 +69,8 @@ BioReader FilterWithACAutomaton::filterBioReaderWithACAutomaton(
//All k-mers selected : iterate over all map
if(kmer_threshold == ALL_KMERS_VALUE || kmer_threshold > (int)mapAho.size()){
for(auto const mx: mapAho){
tmpKmer = mx.first;
asciiChar = tmpKmer.getLabel().at(0);
asciiNum = int(asciiChar);
if(asciiNum > indexes->size() - 1){
break;
}
for(int i = indexes->at(asciiNum - 1); i < indexes->at(asciiNum); ++i){
result.add(origin.read(i));
if(mx.first.isGeneric()){
transferBioReaderSequences(origin, result, mx.first);
}
}
/* The most significant k-mers selected : iterate over a portion of the
......@@ -97,6 +87,9 @@ BioReader FilterWithACAutomaton::filterBioReaderWithACAutomaton(
int nbKmers = 0, previousOccurences = 0;
for(pair<KmerAffect, int> element : setOfWords){
// Add corresponding sequences to the BioReader
if(!element.first.isGeneric()){
continue;
}
if(nbKmers == kmer_threshold && previousOccurences == element.second){
//Keep the same amount of genes
}else if(nbKmers < kmer_threshold){
......@@ -104,21 +97,25 @@ BioReader FilterWithACAutomaton::filterBioReaderWithACAutomaton(
}else{
break;
}
tmpKmer = element.first;
asciiChar = tmpKmer.getLabel().at(0);
asciiNum = int(asciiChar);
if(asciiNum > indexes->size() - 1){
break;
}
for(int i = indexes->at(asciiNum - 1); i < indexes->at(asciiNum); ++i){
result.add(origin.read(i));
}
transferBioReaderSequences(origin, result, element.first);
previousOccurences = element.second;
}
}
return (result.size() == 0) ? origin : result;
}
/**
 * Append to dst every sequence of src that belongs to the gene designated by k.
 *
 * The first character of k's label is read as a number; subtracting
 * SPECIFIC_KMERS_NUMBER from it gives a rank r, and the sequences copied are
 * those at positions [indexes->at(r), indexes->at(r + 1)) in src.
 *
 * @param src The BioReader the sequences are read from.
 * @param dst The BioReader the sequences are added to.
 * @param k   The K-mer whose label selects the sequences.
 * @throws invalid_argument if no index table is available, if k is not
 *         generic (unknown or ambiguous), or if its label does not designate
 *         a range stored in the index table.
 */
void FilterWithACAutomaton::transferBioReaderSequences(const BioReader &src, BioReader &dst, KmerAffect k) const{
  const char asciiChar = k.getLabel().at(0);
  /* Same conversion as before: where char is signed, a negative label
     becomes a very large unsigned value and is rejected by the range check. */
  const unsigned int asciiNum = int(asciiChar);

  /* Without an index table no label can be resolved; labels below
     SPECIFIC_KMERS_NUMBER do not designate any stored range. */
  if(!indexes || !k.isGeneric() || asciiNum < SPECIFIC_KMERS_NUMBER){
    throw invalid_argument("Incorrect K-mer transmitted.");
  }

  /* Both indexes->at(geneRank) and indexes->at(geneRank + 1) are read below,
     so geneRank + 1 must be a valid position in the table. */
  const unsigned int geneRank = asciiNum - SPECIFIC_KMERS_NUMBER;
  if(geneRank + 1 >= indexes->size()){
    throw invalid_argument("Incorrect K-mer transmitted.");
  }

  const int firstSequence = indexes->at(geneRank);
  const int pastLastSequence = indexes->at(geneRank + 1);
  for(int i = firstSequence; i < pastLastSequence; ++i){
    dst.add(src.read(i));
  }
}
/**
 * Return the pointer to the index vector stored in this filter.
 * The pointer is returned as stored, without any copy.
 */
vector<int>* FilterWithACAutomaton::getIndexes() const{
  vector<int>* storedIndexes = indexes;
  return storedIndexes;
}
......
......@@ -92,5 +92,16 @@ class FilterWithACAutomaton {
* Return the automaton stored.
*/
AbstractACAutomaton<KmerAffect>* getAutomaton() const;
/**
* Transfer sequences from a BioReader to another.
* @param src The BioReader from where the transfer will operate.
* @param dst The BioReader that will receive the new sequences.
* @param k The K-mer that indicates which sequences will be transferred.
* The label stored in the K-mer is used to select sequences. For more information
* about how the label is used, see buildACAutomatonToFilterBioReader's doc.
*/
void transferBioReaderSequences(const BioReader &src, BioReader &dst, const KmerAffect k) const;
};
#endif
......@@ -178,6 +178,10 @@ bool KmerAffect::isUnknown() const {
return affect.c == (int) AFFECT_UNKNOWN_CHAR;
}
/* A K-mer is generic when it is neither unknown nor ambiguous. */
bool KmerAffect::isGeneric() const {
  return !isUnknown() && !isAmbiguous();
}
/* Return the result of the global toString() applied to the stored affect. */
string KmerAffect::toString() const {
  string representation = ::toString(affect);
  return representation;
}
......
......@@ -152,6 +152,11 @@ public:
* @return true iff the value is the same as the one given by default constructor
*/
bool isNull() const;
/**
* @return true if the K-mer is not odd (ambiguous or unknown)
*/
bool isGeneric() const;
string toString() const;
string toStringValues()const;
......@@ -184,6 +189,9 @@ ostream &operator<<(ostream &os, const KmerAffect &kmer);
/* Define how an unknown kmeraffect looks like in a string */
#define AFFECT_UNKNOWN_TO_STRING "_"
/* Define how many specific k-mers exist. For now there are only ambiguous and unknown. */
#define SPECIFIC_KMERS_NUMBER 2
/**
* Constant defining any not-unknown affectation
* Could be used by .getIndexLoad(), but now any non-AFFECT_UNKNOWN kmer will work.
......
......@@ -221,33 +221,33 @@ void testAutomatonBuilderFilteringBioReader(){
}
/* test automaton KmerAffect label */
for(unsigned int i = 0;i < expectedIndexes1.size() - 1; ++i){
for(unsigned int i = 0, l = SPECIFIC_KMERS_NUMBER;i < expectedIndexes1.size() - 1; ++i, ++l){
for(int j = expectedIndexes1[i]; j < expectedIndexes1[i + 1]; ++j){
seq = testedBioReader1.sequence(j);
k = a1->get(seq);
asciiChar = k.getLabel().at(0);
asciiNum = int(asciiChar);
TAP_TEST_EQUAL(asciiNum, i + 1, TEST_AUTOMATON_BUILDER_TO_FILTER_BIOREADER,
TAP_TEST_EQUAL(asciiNum, l, TEST_AUTOMATON_BUILDER_TO_FILTER_BIOREADER,
TEST_LABEL_ERROR);
}
}
for(unsigned int i = 0;i < expectedIndexes2.size() - 1; ++i){
for(unsigned int i = 0, l = SPECIFIC_KMERS_NUMBER;i < expectedIndexes2.size() - 1; ++i, ++l){
for(int j = expectedIndexes2[i]; j < expectedIndexes2[i + 1]; ++j){
seq = testedBioReader2.sequence(j);
k = a2->get(seq);
asciiChar = k.getLabel().at(0);
asciiNum = int(asciiChar);
TAP_TEST_EQUAL(asciiNum, i + 1, TEST_AUTOMATON_BUILDER_TO_FILTER_BIOREADER,
TAP_TEST_EQUAL(asciiNum, l, TEST_AUTOMATON_BUILDER_TO_FILTER_BIOREADER,
TEST_LABEL_ERROR);
}
}
for(unsigned int i = 0;i < expectedIndexes3.size() - 1; ++i){
for(unsigned int i = 0, l = SPECIFIC_KMERS_NUMBER; i < expectedIndexes3.size() - 1; ++i, ++l){
for(int j = expectedIndexes3[i]; j < expectedIndexes3[i + 1]; ++j){
seq = testedBioReader3.sequence(j);
k = a3->get(seq);
asciiChar = k.getLabel().at(0);
asciiNum = int(asciiChar);
TAP_TEST_EQUAL(asciiNum, i + 1, TEST_AUTOMATON_BUILDER_TO_FILTER_BIOREADER,
TAP_TEST_EQUAL(asciiNum, l, TEST_AUTOMATON_BUILDER_TO_FILTER_BIOREADER,
TEST_LABEL_ERROR);
}
}
......@@ -305,11 +305,11 @@ void testFilterBioReaderWithACAutomaton(){
list<Sequence> l1 = filteredBioReader1.getAll();
for(auto const m : m1){
KmerAffect tmpKmer = m.first;
if(tmpKmer.isNull() || tmpKmer.isUnknown() || tmpKmer.isAmbiguous()){
if(!tmpKmer.isGeneric()){
continue;
}
unsigned int asciiNumber = int(tmpKmer.getLabel().at(0));
for(int i = v1->at(asciiNumber-1); i < v1->at(asciiNumber); ++i){
for(int i = v1->at(asciiNumber-SPECIFIC_KMERS_NUMBER); i < v1->at(asciiNumber-SPECIFIC_KMERS_NUMBER + 1); ++i){
TAP_TEST(find(l1.begin(), l1.end(), testedBioReader1.read(i)) != l1.end(),
TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, GENES_ERROR);
}
......@@ -318,11 +318,11 @@ void testFilterBioReaderWithACAutomaton(){
list<Sequence> l2 = filteredBioReader2.getAll();
for(auto const m : m2){
KmerAffect tmpKmer = m.first;
if(tmpKmer.isNull() || tmpKmer.isUnknown() || tmpKmer.isAmbiguous()){
if(!tmpKmer.isGeneric()){
continue;
}
unsigned int asciiNumber = int(tmpKmer.getLabel().at(0));
for(int i = v2->at(asciiNumber-1); i < v2->at(asciiNumber); ++i){
for(int i = v2->at(asciiNumber-SPECIFIC_KMERS_NUMBER); i < v2->at(asciiNumber-SPECIFIC_KMERS_NUMBER + 1); ++i){
TAP_TEST(find(l2.begin(), l2.end(), testedBioReader2.read(i)) != l2.end(),
TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, GENES_ERROR);
}
......@@ -331,11 +331,11 @@ void testFilterBioReaderWithACAutomaton(){
list<Sequence> l3 = filteredBioReader3.getAll();
for(auto const m : m3){
KmerAffect tmpKmer = m.first;
if(tmpKmer.isNull() || tmpKmer.isUnknown() || tmpKmer.isAmbiguous()){
if(!tmpKmer.isGeneric()){
continue;
}
unsigned int asciiNumber = int(tmpKmer.getLabel().at(0));
for(int i = v3->at(asciiNumber-1); i < v3->at(asciiNumber); ++i){
for(int i = v3->at(asciiNumber-SPECIFIC_KMERS_NUMBER); i < v3->at(asciiNumber-SPECIFIC_KMERS_NUMBER + 1); ++i){
TAP_TEST(find(l3.begin(), l3.end(), testedBioReader3.read(i)) != l3.end(),
TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, GENES_ERROR);
}
......@@ -346,24 +346,15 @@ void testFilterBioReaderWithACAutomaton(){
void testGetNSignicativeKmers(){
BioReader filtered;
BioReader seqV("../../germline/homo-sapiens/IGHV.fa", 2);
BioReader seqD("../../germline/homo-sapiens/IGHD.fa", 2);
BioReader seqJ("../../germline/homo-sapiens/IGHJ.fa", 2);
OnlineFasta data("data/Stanford_S22.fasta", 1, " ");
data.next();
data.next();
Germline germline("IGH", 'G', seqV, seqD, seqJ, "########", 0, true);
germline.new_index(KMER_INDEX);
germline.finish();
string SIZE_ERROR = "Filtered size must be less than original one";
string GENE_NOT_FOUND = "Filtering sequence not found after filter";
for(int i = 0; i < germline.rep_5.size(); ++i){
Sequence seq = germline.rep_5.read(i);
FilterWithACAutomaton *f = germline.getFilter_5();
filtered = f->filterBioReaderWithACAutomaton(germline.rep_5, seq.sequence, 1);
for(int i = 0; i < seqV.size(); ++i){
Sequence seq = seqV.read(i);
FilterWithACAutomaton *f = new FilterWithACAutomaton(seqV, "########");
filtered = f->filterBioReaderWithACAutomaton(seqV, seq.sequence, 1);
delete f;
int j = 0;
while(j < filtered.size()){
if(extractGeneName(filtered.label(j)) == extractGeneName(seq.label)){
......@@ -372,7 +363,7 @@ void testGetNSignicativeKmers(){
++j;
}
TAP_TEST(j < filtered.size(), TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, GENE_NOT_FOUND);
TAP_TEST(filtered.size() < germline.rep_5.size(), TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, SIZE_ERROR);
TAP_TEST(filtered.size() < seqV.size(), TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, SIZE_ERROR);
}
}
......@@ -432,7 +423,7 @@ void testExAequoKmersWhenSignificantParameter(){
}
/* Even though the filtered function got 2 as a parameter, since there are two ex-aequo the size is 3 */
TAP_TEST(filtered.size() == 3, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, SIZE_BIOREADER);
TAP_TEST_EQUAL(filtered.size(), 3, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, SIZE_BIOREADER);
TAP_TEST(i < filtered.size(), TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, BIOREADER_EXAEQUO);
TAP_TEST(j < filtered.size(), TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, BIOREADER_EXAEQUO);
TAP_TEST(k < filtered.size(), TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, BIOREADER_EXAEQUO);
......@@ -450,7 +441,7 @@ void testExAequoKmersWhenSignificantParameter(){
++l;
}
/* Even though the filtered function got 2 as a parameter, since there are three ex-aequo the size is 4 */
TAP_TEST(filtered.size() == 4, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, SIZE_BIOREADER);
TAP_TEST_EQUAL(filtered.size(), 4, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, SIZE_BIOREADER);
TAP_TEST(i < filtered.size(), TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, BIOREADER_EXAEQUO);
TAP_TEST(j < filtered.size(), TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, BIOREADER_EXAEQUO);
TAP_TEST(k < filtered.size(), TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, BIOREADER_EXAEQUO);
......@@ -470,10 +461,156 @@ void testBehaviourWhenHugeBioReader(){
delete f;
}
/* Test the good behaviour of Filter's transferBioReaderSequences function. */
void testTransferBioReaderSequences(){
affect_t affect;
KmerAffect *kmer;
BioReader res, testedBioReader1;
FilterWithACAutomaton *f;
bool caught = false;
const string ERROR_NO_EXCEPTION_THROWN = "The function must throw an exception when invalid K-mer is transmitted.";
const string ERROR_NON_EMPTY_BIOREADER = "The BioReader shouldn't contain any sequences.";
const string ERROR_INCORRECT_BIOREADER = "The BioReader doesn't have the correct number of sequences.";
testedBioReader1 = getDebugBioReader1();
f = new FilterWithACAutomaton(testedBioReader1, "####");
affect.length = 1;
/* When k-mer's label has a n°ascii over 127, the transfer should not operate. */
affect.c = char(128);
kmer = new KmerAffect(affect);
try{
f->transferBioReaderSequences(testedBioReader1, res, *kmer);
}catch(...){
caught = true;
}
TAP_TEST(caught, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, ERROR_NO_EXCEPTION_THROWN);
TAP_TEST_EQUAL(res.size(), 0, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, ERROR_NON_EMPTY_BIOREADER);
delete kmer;
/* When k-mer's label has a n°ascii above the number of genes contained in the BioReader, the transfer should not operate. */
affect.c = char(8);
kmer = new KmerAffect(affect);
try{
f->transferBioReaderSequences(testedBioReader1, res, *kmer);
caught = false;
}catch(...){
caught = true;
}
TAP_TEST(caught, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, ERROR_NO_EXCEPTION_THROWN);
TAP_TEST_EQUAL(res.size(), 0, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, ERROR_NON_EMPTY_BIOREADER);
delete kmer;
/* When k-mer's label has the n°ascii 0, the transfer should not operate since it's an ambiguous k-mer. */
affect.c = AFFECT_AMBIGUOUS_CHAR;
kmer = new KmerAffect(affect);
try{
f->transferBioReaderSequences(testedBioReader1, res, *kmer);
caught = false;
}catch(...){
caught = true;
}
delete kmer;
TAP_TEST(caught, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, ERROR_NON_EMPTY_BIOREADER);
TAP_TEST_EQUAL(res.size(), 0, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, ERROR_NON_EMPTY_BIOREADER);
/* When k-mer's label has a n°ascii 1, the transfer should not operate since it's an unknown k-mer. */
affect.c = AFFECT_UNKNOWN_CHAR;
kmer = new KmerAffect(affect);
try{
f->transferBioReaderSequences(testedBioReader1, res, *kmer);
caught = false;
}catch(...){
caught = true;
}
TAP_TEST(caught, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, ERROR_NON_EMPTY_BIOREADER);
TAP_TEST_EQUAL(res.size(), 0, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, ERROR_NON_EMPTY_BIOREADER);
delete kmer;
/* With an ascii n°2, the functions should take only the 3 first sequences. */
affect.c = char(2);
kmer = new KmerAffect(affect);
try{
f->transferBioReaderSequences(testedBioReader1, res, *kmer);
caught = false;
}catch(...){
caught = true;
}
TAP_TEST(!caught, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, ERROR_NON_EMPTY_BIOREADER);
TAP_TEST_EQUAL(res.size(), 3, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, ERROR_INCORRECT_BIOREADER);
delete kmer;
/* With an ascii n°3, the functions should contain the 3 previous sequences and 2 more. */
affect.c = char(3);
kmer = new KmerAffect(affect);
try{
f->transferBioReaderSequences(testedBioReader1, res, *kmer);
caught = false;
}catch(...){
caught = true;
}
TAP_TEST(!caught, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, ERROR_NON_EMPTY_BIOREADER);
TAP_TEST_EQUAL(res.size(), 5, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, ERROR_INCORRECT_BIOREADER);
delete kmer;
/* With an ascii n°4, the functions should contain the 5 previous sequences and 1 more. */
affect.c = char(4);
kmer = new KmerAffect(affect);
try{
f->transferBioReaderSequences(testedBioReader1, res, *kmer);
caught = false;
}catch(...){
caught = true;
}
TAP_TEST(!caught, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, ERROR_NON_EMPTY_BIOREADER);
TAP_TEST_EQUAL(res.size(), 6, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, ERROR_INCORRECT_BIOREADER);
delete kmer;
/* With an ascii n°5, the functions should contain the 6 previous sequences and 4 more. */
affect.c = char(5);
kmer = new KmerAffect(affect);
try{
f->transferBioReaderSequences(testedBioReader1, res, *kmer);
caught = false;
}catch(...){
caught = true;
}
TAP_TEST(!caught, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, ERROR_NON_EMPTY_BIOREADER);
TAP_TEST_EQUAL(res.size(), 10, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, ERROR_INCORRECT_BIOREADER);
delete kmer;
/* With an ascii n°6, the functions should contain the 10 previous sequences and 1 more. */
affect.c = char(6);
kmer = new KmerAffect(affect);
try{
f->transferBioReaderSequences(testedBioReader1, res, *kmer);
caught = false;
}catch(...){
caught = true;
}
TAP_TEST(!caught, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, ERROR_NON_EMPTY_BIOREADER);
TAP_TEST_EQUAL(res.size(), 11, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, ERROR_INCORRECT_BIOREADER);
delete kmer;
/* With an ascii n°7, the functions should contain the 11 previous sequences and 2 more, wich is the same as the original BioReader. */
affect.c = char(7);
kmer = new KmerAffect(affect);
try{
f->transferBioReaderSequences(testedBioReader1, res, *kmer);
caught = false;
}catch(...){
caught = true;
}
TAP_TEST(!caught, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, ERROR_NON_EMPTY_BIOREADER);
TAP_TEST_EQUAL(res.size(), testedBioReader1.size(), TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, ERROR_INCORRECT_BIOREADER);
delete kmer;
delete f;
}
/* Entry point of the filter test suite: runs every filter test, in a fixed order. */
void testFilter(){
  void (*const filterTests[])() = {
    testAutomatonBuilderFilteringBioReader,
    testFilterBioReaderWithACAutomaton,
    testBehaviourWhenHugeBioReader,
    testGetNSignicativeKmers,
    testExAequoKmersWhenSignificantParameter,
    testTransferBioReaderSequences
  };
  for(auto runTest : filterTests){
    runTest();
  }
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment