automaton.h 5.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10
#ifndef AUTOMATON_H
#define AUTOMATON_H

#include <list>
#include <cctype>
#include "kmerstore.h"
#include <cstdlib>
#include <queue>
#include <utility>
#include "tools.h"
11
#include <map>
12 13 14

using namespace std;

15 16 17
/* Max value for k (for storing the number of kmers of that size) */
#define MAX_KMER_SIZE 20

18 19 20 21 22 23 24 25 26
/**
 * This abstract class represents an Aho-Corasick automaton.
 * Each final state can store some information.
 */
template <class Info>
class AbstractACAutomaton: public IKmerStore<Info> {

protected:
  void *initialState;
27
  map<Info, size_t> kmers_inserted;
28
public:
29 30
  AbstractACAutomaton();

31 32 33 34 35 36
  /**
   * Builds the failure functions. This function must be called before any
   * query is made.
   */
  virtual void build_failure_functions() = 0;

37 38 39 40 41
  /**
   * @inherited from IKMerStore
   */
  void finish_building();

42 43 44 45 46
  /**
   * @inherited from IKMerStore
   */
  float getIndexLoad(Info kmer) const;

47 48 49 50 51
  /**
   * @inherited from IKMerStore
   */
  bool hasDifferentKmerTypes() const;

52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
  /**
   * @return the information stored for this state
   */
  virtual list<Info> &getInfo(void *state) = 0;

  /**
   * @param starting_state: the starting state for traversing the automate.
   *                        if NULL starts from the initial state.
   * @return return the arrival state when starting from starting_state and
   * reading the string seq.
   */
  void *goto_state(const string &seq, void *starting_state=NULL);

  /**
   * @return true iff states is the initial state
   */
  bool isInitialState(void *state);

  /**
   * @return true iff states points to a final state
   */
  virtual bool isFinalState(void *state) = 0;

  /**
   * @param state: a pointer to the current state
   * @param c: the character to follow
   * @return the state accessible from state with character c.
   *         It will always return a valid pointer, at least the initial state.
   */
  virtual void *next(void *state, char c) = 0;

Mikael Salson's avatar
Mikael Salson committed
83 84 85 86 87 88 89
  /**
   * This function returns the number of times every Info appears in the
   * given sequence.
   * It returns a map containing the number of occurences per Info.
   * @param seq: The sequence to be queried. It is passed through 
   *             the automaton to identify matching k-mers and extract 
   *             the corresponding Info.
90
   * @param false: unused.
Mikael Salson's avatar
Mikael Salson committed
91 92 93 94
   * @param seed: unused.
   */
  virtual map<Info,int> getMultiResults
    (const seqtype &seq, bool no_revcomp=false, string seed = "") = 0;
95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
};

#define DNA_ALPHABET_SIZE 4

typedef enum {A, C, G, T, N, NB_TRANSITIONS} nt_transition;

template <class Info>
class pointer_state {
public:
  pointer_state<Info> *transitions[NB_TRANSITIONS]; /* Transitions to the 5 nt */
  bool is_final;
  list<Info> informations;           /* != NULL when is_final */

  pointer_state():is_final(false),informations() {
    for (size_t i = 0; i < NB_TRANSITIONS; i++)
      transitions[i] = NULL;
    informations.push_back(Info());
  }

  pointer_state<Info> *transition(char c) {
    return transitions[nuc_to_int(c)];
  }
};

/**
 * PointerACAutomaton builds state which points to other states.
 * Each state stores at least one information (but possibly more).
 */
template <class Info>
class PointerACAutomaton: public AbstractACAutomaton<Info> {
private:
126 127
  bool multiple_info;

128
  void free_automaton(pointer_state<Info> *);
129
  void init(string seed, bool revcomp, bool multiple_info);
130
public:
131 132
  using IKmerStore<Info>::insert;

133 134
  /**
   * @param revcomp: should the revcomp of the sequences also be indexed
135
   * @param multiple_info: should all the Info be stored in the automaton or
136
   *                       only a single value summarizing them all.
137 138 139 140 141 142
   *
   * The default seed will be a contiguous seed of 10 letters.  But the seed
   * can be specified when inserting sequences. This should be the preferred
   * choice as one may want to have different seeds depending on the
   * sequences.
   */
143
  PointerACAutomaton(bool revcomp=false, bool multiple_info=false);
144 145 146 147

  /**
   * @param seed: the seed to be used for indexing
   * @param revcomp: indexing revcomp too ?
148
   * @param multiple_info: storing all info?
149
   */
150
  PointerACAutomaton(string seed, bool revcomp=false, bool multiple_info=false);
151 152 153 154

  /**
   * @param k: the size of the contiguous seed
   * @param revcomp: indexing revcomp too ?
155
   * @param multiple_info: storing all info?
156
   */
157
  PointerACAutomaton(int k, bool revcomp=false, bool multiple_info=false);
158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202

  ~PointerACAutomaton();

  void build_failure_functions();

  /**
   * @return the information stored for this state
   */
  list<Info> &getInfo(void *state);

  /**
   * @return the automaton initial state
   */
  pointer_state<Info> *getInitialState();

  pointer_state<Info> *goto_state(const string &seq, void *starting_state=NULL);

  /**
   * Insert the sequence in the automata
   */
  void insert(const seqtype &seq, Info info);

  /**
   * Insert all the possible kmers from the sequence, depending on the
   * provided seed (or the default seed if none is provided)
   */
  void insert(const seqtype &seq, const string &label,
              bool ignore_extended_nucleotides=true,
              int keep_only=0, string seed="");

  /**
   * @return true iff states points to a final state
   */
  bool isFinalState(void *state);

  /**
   * @param state: a pointer to the current state
   * @param c: the character to follow
   * @return the state accessible from state with character c.
   *         It will always return a valid pointer, at least the initial state.
   */
  void *next(void *state, char c);

  // From IKmerStore

203
  vector<Info> getResults(const seqtype &seq, bool no_revcomp=false, string seed = "");
204 205
 	 
  map<Info,int> getMultiResults(const seqtype &seq, bool no_revcomp=false, string seed = "");
206 207 208 209 210 211
  Info& get(seqtype &word) ;

   Info& operator[](seqtype& word);
};

#endif