filter.cpp 3.82 KB
Newer Older
1
2
#include "filter.h"

3
4
5
FilterWithACAutomaton::FilterWithACAutomaton(BioReader &origin, string seed){
  buildACAutomatonToFilterBioReader(origin, seed);
}
6
7
8
9
10

FilterWithACAutomaton::~FilterWithACAutomaton(){

}

11
12
13
14
15
16
17
18
19
20
21
22
23
  (BioReader &origin, string seed){
  pair<vector<int>*, AbstractACAutomaton<KmerAffect>*>* result;
  vector<int>* indexes;
  PointerACAutomaton<KmerAffect>* aho;
  char asciiChar;
  int asciiNumber;
  string currentLabel;
  string previousLabel;

  if(origin.size() < 1){
    return nullptr;
  }
  result = new pair<vector<int>*,AbstractACAutomaton<KmerAffect>*>();
Cyprien Borée's avatar
Cyprien Borée committed
24
  aho = new PointerACAutomaton<KmerAffect>(seed, false, true);
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
  indexes = new vector<int>();
  aho->insert(origin.sequence(0),std::string("") + char(1), true, 0, seed);
  asciiNumber = 1;
  indexes->push_back(0);
  previousLabel = extractGeneName(origin.label(0));
  int i;
  for(i = 1;i < origin.size(); ++i){
    currentLabel = extractGeneName(origin.label(i));
    if(currentLabel != previousLabel){
      indexes->push_back(i);
      asciiNumber++;
    }
    if(asciiNumber > 127){
      delete result; delete aho; delete indexes;
      return nullptr;
    }
    asciiChar = char(asciiNumber);
    aho->insert(origin.sequence(i),std::string("") + asciiChar, true, 0, seed);
    previousLabel = currentLabel;
  }
  indexes->push_back(origin.size());
  aho->build_failure_functions();
  result->first = indexes;
  result->second = aho;
  return result;
}

/*
  Takes a built automaton and a vector of indexes and build a BioReader
  based on it.
*/
56
BioReader FilterWithACAutomaton::filterBioReaderWithACAutomaton(
57
58
    pair<vector<int>*, AbstractACAutomaton<KmerAffect>*>* idxAho,
    BioReader &origin, seqtype &seq,
59
    int kmer_threshold){
60
61
62
63
64
65
66
67

  BioReader result;
  AbstractACAutomaton<KmerAffect>* aho;
  vector<int>* indexes;
  map<KmerAffect, int> mapAho;
  KmerAffect tmpKmer;
  unsigned int asciiNum;
  char asciiChar;
68
  if(!idxAho || kmer_threshold < 0){
69
70
71
72
73
74
75
    return origin;
  }
  indexes =  idxAho->first;
  aho = idxAho->second;
  mapAho = aho->getMultiResults(seq);

  //All k-mers selected : iterate over all map
76
  if(kmer_threshold == ALL_KMERS_VALUE || kmer_threshold > (int)mapAho.size()){
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
    for(auto const mx: mapAho){
      tmpKmer = mx.first;
      asciiChar = tmpKmer.getLabel().at(0);
      asciiNum = int(asciiChar);
      if(asciiNum > indexes->size() - 1){
        break;
      }
      for(int i = indexes->at(asciiNum - 1); i < indexes->at(asciiNum); ++i){
        result.add(origin.read(i));
      }
    }
  /* The most significant k-mers selected : iterate over a portion of the
    sorted map */
  }else{
    /* sort map */
    typedef function<bool(pair<KmerAffect, int>, pair<KmerAffect, int>)> Comparator;
    Comparator compFunctor = [](pair<KmerAffect, int> elem1 ,pair<KmerAffect, int> elem2){
94
      return (elem1.second == elem2.second) ? elem1.first > elem2.first : elem1.second > elem2.second;
95
96
97
98
    };
    // Use a set to use the comparator and sort function
    set<pair<KmerAffect, int>, Comparator> setOfWords(mapAho.begin(), mapAho.end(), compFunctor);
    // Iterate over the pair and not the map
99
    int nbKmers = 0, previousOccurences = 0;
100
101
    for(pair<KmerAffect, int> element : setOfWords){
      // Add corresponding sequences to the BioReader
102
        if(nbKmers == kmer_threshold && previousOccurences == element.second){
103
104
105
106
107
          //Keep the same amount of genes
        }else if(nbKmers < kmer_threshold){
          nbKmers++;
        }else{
          break;
108
        }
109
110
111
112
113
114
115
116
117
        tmpKmer = element.first;
        asciiChar = tmpKmer.getLabel().at(0);
        asciiNum = int(asciiChar);
        if(asciiNum > indexes->size() - 1){
          break;
        }
        for(int i = indexes->at(asciiNum - 1); i < indexes->at(asciiNum); ++i){
          result.add(origin.read(i));
        }
118
        previousOccurences = element.second;
119
120
    }
  }
121
  return (result.size() == 0) ? origin : result;
122
}