automaton.h 4.75 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
#ifndef AUTOMATON_H
#define AUTOMATON_H

#include <list>
#include <cctype>
#include "kmerstore.h"
#include <cstdlib>
#include <queue>
#include <utility>
#include "tools.h"
#include <set>

using namespace std;

15
16
17
/* Max value for k (for storing the number of kmers of that size) */
#define MAX_KMER_SIZE 20

18
19
20
21
22
23
24
25
26
/**
 * This abstract class represents an Aho-Corasick automaton.
 * Each final state can store some information.
 */
template <class Info>
class AbstractACAutomaton: public IKmerStore<Info> {

protected:
  void *initialState;
27
  map<Info, size_t> kmers_inserted;
28
public:
29
30
  AbstractACAutomaton();

31
32
33
34
35
36
  /**
   * Builds the failure functions. This function must be called before any
   * query is made.
   */
  virtual void build_failure_functions() = 0;

37
38
39
40
41
  /**
   * @inherited from IKMerStore
   */
  void finish_building();

42
43
44
45
46
  /**
   * @inherited from IKMerStore
   */
  float getIndexLoad(Info kmer) const;

47
48
49
50
51
  /**
   * @inherited from IKMerStore
   */
  bool hasDifferentKmerTypes() const;

52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
  /**
   * @return the information stored for this state
   */
  virtual list<Info> &getInfo(void *state) = 0;

  /**
   * @param starting_state: the starting state for traversing the automate.
   *                        if NULL starts from the initial state.
   * @return return the arrival state when starting from starting_state and
   * reading the string seq.
   */
  void *goto_state(const string &seq, void *starting_state=NULL);

  /**
   * @return true iff states is the initial state
   */
  bool isInitialState(void *state);

  /**
   * @return true iff states points to a final state
   */
  virtual bool isFinalState(void *state) = 0;

  /**
   * @param state: a pointer to the current state
   * @param c: the character to follow
   * @return the state accessible from state with character c.
   *         It will always return a valid pointer, at least the initial state.
   */
  virtual void *next(void *state, char c) = 0;

};

#define DNA_ALPHABET_SIZE 4

typedef enum {A, C, G, T, N, NB_TRANSITIONS} nt_transition;

template <class Info>
class pointer_state {
public:
  pointer_state<Info> *transitions[NB_TRANSITIONS]; /* Transitions to the 5 nt */
  bool is_final;
  list<Info> informations;           /* != NULL when is_final */

  pointer_state():is_final(false),informations() {
    for (size_t i = 0; i < NB_TRANSITIONS; i++)
      transitions[i] = NULL;
    informations.push_back(Info());
  }

  pointer_state<Info> *transition(char c) {
    return transitions[nuc_to_int(c)];
  }
};

/**
 * PointerACAutomaton builds state which points to other states.
 * Each state stores at least one information (but possibly more).
 */
template <class Info>
class PointerACAutomaton: public AbstractACAutomaton<Info> {
private:
114
  void free_automaton(pointer_state<Info> *);
115
116
  void init(string seed, bool revcomp);
public:
117
118
  using IKmerStore<Info>::insert;

119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
  /**
   * @param revcomp: should the revcomp of the sequences also be indexed
   *
   * The default seed will be a contiguous seed of 10 letters.  But the seed
   * can be specified when inserting sequences. This should be the preferred
   * choice as one may want to have different seeds depending on the
   * sequences.
   */
  PointerACAutomaton(bool revcomp=false);

  /**
   * @param seed: the seed to be used for indexing
   * @param revcomp: indexing revcomp too ?
   */
  PointerACAutomaton(string seed, bool revcomp=false);

  /**
   * @param k: the size of the contiguous seed
   * @param revcomp: indexing revcomp too ?
   */
  PointerACAutomaton(int k, bool revcomp=false);

  ~PointerACAutomaton();

  void build_failure_functions();

  /**
   * @return the information stored for this state
   */
  list<Info> &getInfo(void *state);

  /**
   * @return the automaton initial state
   */
  pointer_state<Info> *getInitialState();

  pointer_state<Info> *goto_state(const string &seq, void *starting_state=NULL);

  /**
   * Insert the sequence in the automata
   */
  void insert(const seqtype &seq, Info info);

  /**
   * Insert all the possible kmers from the sequence, depending on the
   * provided seed (or the default seed if none is provided)
   */
  void insert(const seqtype &seq, const string &label,
              bool ignore_extended_nucleotides=true,
              int keep_only=0, string seed="");

  /**
   * @return true iff states points to a final state
   */
  bool isFinalState(void *state);

  /**
   * @param state: a pointer to the current state
   * @param c: the character to follow
   * @return the state accessible from state with character c.
   *         It will always return a valid pointer, at least the initial state.
   */
  void *next(void *state, char c);

  // From IKmerStore

185
  vector<Info> getResults(const seqtype &seq, bool no_revcomp=false, string seed = "");
186
187
188
189
190
191
  Info& get(seqtype &word) ;

   Info& operator[](seqtype& word);
};

#endif