filter.cpp 7.16 KB
Newer Older
1
#include "filter.h"
2
#include "math.hpp"
3

4
FilterWithACAutomaton::FilterWithACAutomaton(BioReader &origin, string seed, float keys_compress) : originalBioReader(origin){
5 6
  this->filtered_sequences_nb = 0;
  this->filtered_sequences_calls = 0;
7
  buildACAutomatonToFilterBioReader(seed, keys_compress);
8
}
9 10

FilterWithACAutomaton::~FilterWithACAutomaton(){
11 12
    if(automaton){
      delete automaton;
13
    }
14 15
    if(indexes){
      delete indexes;
16
    }
17 18
}

19
void FilterWithACAutomaton::buildACAutomatonToFilterBioReader(string seed, float keys_compress){
20 21 22 23 24
  char asciiChar;
  int asciiNumber;
  string currentLabel;
  string previousLabel;

25
  if(originalBioReader.size() < 1){
26 27
    automaton = nullptr;
    indexes = nullptr;
28
    return;
29
  }
30
  automaton = new PointerACAutomaton<KmerAffect>(seed, false, true);
31
  indexes = new vector<int>();
32
  asciiNumber = SPECIFIC_KMERS_NUMBER;
33
  automaton->insert(originalBioReader.sequence(0),std::string("") + char(asciiNumber), true, 0, seed);
34
  indexes->push_back(0);
35 36 37 38

  int previousAsciiNumber = asciiNumber;
  int rawNumber = 0;

39 40 41
  previousLabel = extractGeneName(originalBioReader.label(0));
  for(int i = 1;i < originalBioReader.size(); ++i){
    currentLabel = extractGeneName(originalBioReader.label(i));
42
    if(currentLabel != previousLabel){
43 44 45 46 47 48
      asciiNumber = SPECIFIC_KMERS_NUMBER + 1 + (int) rawNumber / keys_compress;
      rawNumber++;
    }

    if (asciiNumber > previousAsciiNumber)
    {
49
      indexes->push_back(i);
50
      previousAsciiNumber = asciiNumber;
51 52
    }
    if(asciiNumber > 127){
53
      cerr << WARNING_STRING << "Pre-filtering disabled" << endl;
54 55 56
      delete automaton; delete indexes;
      automaton = nullptr;
      indexes = nullptr;
57
      return;
58 59
    }
    asciiChar = char(asciiNumber);
60
    automaton->insert(originalBioReader.sequence(i),std::string("") + asciiChar, true, 0, seed);
61 62
    previousLabel = currentLabel;
  }
63
  indexes->push_back(originalBioReader.size());
64
  automaton->build_failure_functions();
65 66 67 68 69 70
}

/*
  Takes a built automaton and a vector of indexes and build a BioReader
  based on it.
*/
71
BioReader FilterWithACAutomaton::filterBioReaderWithACAutomaton(
72
    seqtype &seq, int kmer_threshold, int pvalue){
73 74 75

  BioReader result;
  map<KmerAffect, int> mapAho;
76
  this->filtered_sequences_calls += 1;
77
  if(!automaton || !indexes || kmer_threshold < 0){
78 79
    this->filtered_sequences_nb += originalBioReader.size();
    return originalBioReader;
80
  }
81
  mapAho = automaton->getMultiResults(seq);
82

83 84
  #ifdef DEBUG_FILTER /* Display the number of k-mers found for each genes. */
  int currentAsciiNumber;
Cyprien Borée's avatar
Cyprien Borée committed
85 86 87 88 89
  for(auto const mx: mapAho){
    string previousLabel = "", currentLabel;
    currentAsciiNumber = SPECIFIC_KMERS_NUMBER;
    previousLabel = extractGeneName(originalBioReader.label(0));
    for(int i = 1;i < originalBioReader.size(); ++i){
90 91 92 93 94 95 96 97 98 99 100 101
      currentLabel = extractGeneName(originalBioReader.label(i));
      if(currentLabel != previousLabel){
        currentAsciiNumber++;
      }
      if(currentAsciiNumber == int(mx.first.getLabel().at(0))){
        cout << mx.second << " kmers found for " << originalBioReader.label(i) << endl;
      }
      previousLabel = currentLabel;
    }
  }
  #endif  

102
  //All k-mers selected : iterate over all map
103
  if(kmer_threshold == ALL_KMERS_VALUE || kmer_threshold > (int)mapAho.size()){
104
    for(auto const mx: mapAho){
105
      if(mx.first.isGeneric()){
106
        transferBioReaderSequences(originalBioReader, result, mx.first);
107 108 109 110 111 112
      }
    }
  /* The most significant k-mers selected : iterate over a portion of the
    sorted map */
  }else{
    /* sort map */
113
    using Comparator = bool (*) (pair<KmerAffect, int>, pair<KmerAffect, int>);
114
    Comparator compFunctor = [](pair<KmerAffect, int> elem1 ,pair<KmerAffect, int> elem2){
115
      return (elem1.second == elem2.second) ? elem1.first > elem2.first : elem1.second > elem2.second;
116 117 118 119
    };
    // Use a set to use the comparator and sort function
    set<pair<KmerAffect, int>, Comparator> setOfWords(mapAho.begin(), mapAho.end(), compFunctor);
    // Iterate over the pair and not the map
120
    int nbKmers = 0;
121 122
    int nb_kmers_limit = -1;    // Limit number of kmers, defined when the last gene of interest is reached
    
123 124
    for(pair<KmerAffect, int> element : setOfWords){
      // Add corresponding sequences to the BioReader
125 126 127
        if(!element.first.isGeneric()){
          continue;
        }
128
        if(nbKmers == kmer_threshold && nb_kmers_limit <= element.second){
129 130 131
          // We have reached our limit of number of genes recovered but we
          // continue taking sequences are they have a similar number of
          // matching k-mers.
132 133 134 135
        }else if(nbKmers < kmer_threshold){
          nbKmers++;
        }else{
          break;
136
        }
137
        transferBioReaderSequences(originalBioReader, result, element.first);
138 139
        if (nbKmers == kmer_threshold && nb_kmers_limit == -1) {
          int maxlen = getSizeLongestTransferredSequence(result, element.first);
140
          nb_kmers_limit = compute_nb_kmers_limit(element.first.getLength(), element.second, maxlen, pvalue);
141 142 143 144 145
          if (nb_kmers_limit == 0) {
            this->filtered_sequences_nb += originalBioReader.size();
            return originalBioReader;
          }
        }
146 147
    }
  }
148 149
  this->filtered_sequences_nb += (result.size () == 0) ? originalBioReader.size() : result.size();
  return (result.size() == 0) ? originalBioReader : result;
150
}
151

152 153 154 155 156 157 158 159 160 161 162 163
void FilterWithACAutomaton::transferBioReaderSequences(const BioReader &src, BioReader &dst, KmerAffect k) const{
  char asciiChar = k.getLabel().at(0);
  unsigned int asciiNum = int(asciiChar);

  if(asciiNum > indexes->size() || !k.isGeneric()){
    throw invalid_argument("Incorrect K-mer transmitted.");
  }
  for(int i = indexes->at(asciiNum - SPECIFIC_KMERS_NUMBER); i < indexes->at(asciiNum - SPECIFIC_KMERS_NUMBER + 1); ++i){
    dst.add(src.read(i));
  }
}

164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
int FilterWithACAutomaton::getSizeLongestTransferredSequence(const BioReader &reader, KmerAffect k) const{
  char asciiChar = k.getLabel().at(0);
  unsigned int asciiNum = int(asciiChar);

  if(asciiNum > indexes->size() || !k.isGeneric()){
    throw invalid_argument("Incorrect K-mer transmitted.");
  }

  size_t longest = 0;
  for(int i = 0; i < indexes->at(asciiNum - SPECIFIC_KMERS_NUMBER + 1) - indexes->at(asciiNum - SPECIFIC_KMERS_NUMBER); ++i){
    if (longest < reader.sequence(reader.size() - i - 1).length())
      longest = reader.sequence(reader.size() - i - 1).length();
  }
  return longest;
}

180 181 182 183 184 185
vector<int>* FilterWithACAutomaton::getIndexes() const{
  return this->indexes;
}

AbstractACAutomaton<KmerAffect>* FilterWithACAutomaton::getAutomaton() const{
  return this->automaton;
186
}
187 188 189 190 191 192 193

ostream &operator<<(ostream &out, const FilterWithACAutomaton& obj){
  int origin_bioreader_size = obj.originalBioReader.size();
  int total_sequences_filtered = obj.filtered_sequences_nb;
  int total_filtered_calls = obj.filtered_sequences_calls;
  int total_sequences_origin = total_filtered_calls * origin_bioreader_size;
  float aligned_rate = ((float)total_sequences_filtered/(float)total_sequences_origin) * 100;
194

195 196
  out << right
      << fixed << setw(8) << total_sequences_filtered << "/"
197 198 199 200 201
      << fixed << setw(8) << total_sequences_origin << "    "
      << fixed << setprecision(1) << setw(6) << aligned_rate << "%"
      << endl ;

  return out ;
202
}