affectanalyser.h 8.8 KB
Newer Older
1

Mikael Salson's avatar
Mikael Salson committed
2 3 4 5
#ifndef AFFECT_ANALYSER_H
#define AFFECT_ANALYSER_H

#include "kmerstore.h"
6
#include "kmeraffect.h"
Mikael Salson's avatar
Mikael Salson committed
7 8 9
#include <set>
#include <vector>
#include <cassert>
10
#include <map>
Mikael Salson's avatar
Mikael Salson committed
11 12 13 14 15 16 17 18 19

// Define two constant affectations: ambiguous and unknown.

/* Declaration of types */

typedef enum affect_options_e {
  AO_NONE, AO_NO_CONSECUTIVE, AO_NO_MULTIPLICITY
} affect_options_t;

20 21

/* Stores results during .getMaximum() computation */
22 23 24 25 26 27 28 29 30 31 32
typedef struct affect_infos_s {
  int first_pos_max;            /* First position of maximum */
  int last_pos_max;             /* Last position of maximum */
  int max_value;                /* Maximal value */
  int nb_before_right;          /* Nb of “before” right of the maximum */
  int nb_after_right;           /* Same with “after” */
  int nb_before_left;           /* Nb of “before” left of the maximum */
  int nb_after_left;            /* Same with “after” */
  bool max_found;               /* We have found a maximum */
} affect_infos;

33 34
bool operator==(const affect_infos &ai1, const affect_infos &ai2);

Mikael Salson's avatar
Mikael Salson committed
35 36 37 38 39 40 41 42 43 44 45
/**
 * Class that records for every k-mer of a given sequence
 * in which sequences this k-mer was also seen.
 * It can either record one affectation per kmer (the only sequence where it
 * occurs or ambiguous case if there are several possibilities, or 
 * unknown otherwise), or all the possible affectations (ie. there is no ambiguous
 * case, all the possibilities for one k-mer are stored).
 *
 * Input: Index that constitutes the k-mer sequence repertoire
 * Input: Sequence whose k-mers must be affected
 */
46

Mikael Salson's avatar
Mikael Salson committed
47 48 49
class AffectAnalyser {
 public:

50 51
  virtual ~AffectAnalyser() {}

Mikael Salson's avatar
Mikael Salson committed
52 53 54 55 56 57 58 59 60 61 62
  /* Queries */

  /**
   * @return the total number of affectations
   */
  virtual int count() const = 0;

  /**
   * @param affect: An affectation
   * @return the number of times this affectation has been given in the read.
   */
63
  virtual int count(const KmerAffect &affect) const = 0;
Mikael Salson's avatar
Mikael Salson committed
64 65 66 67 68 69

  /**
   * @param i: the position to consider
   * @pre i >= 0 && i < count()
   * @return the affectation of the k-mer at position i.
   */
70
  virtual const KmerAffect &getAffectation(int i)  const = 0;
Mikael Salson's avatar
Mikael Salson committed
71 72 73 74 75 76 77 78

  /**
   * @param options: options can either be AO_NONE or AO_NO_CONSECUTIVE 
   * @return all the affectations contained in the read from left to right.
   *         if AO_NO_CONSECUTIVE is given: two consecutive elements in the vector
   *                                     will be different (we remove consecutive
   *                                     duplicates)
   */
79
  virtual vector<KmerAffect> getAllAffectations(affect_options_t options) const = 0;
Mikael Salson's avatar
Mikael Salson committed
80 81 82 83

  /**
   * @return the distinct affectations
   */
84
  virtual set<KmerAffect> getDistinctAffectations() const = 0;
Mikael Salson's avatar
Mikael Salson committed
85 86 87 88 89 90 91 92 93 94 95 96 97

  /**
   * @return the sequence we are analysing
   */
  virtual const string &getSequence() const = 0;

  /**
   * @param affect: an affectation
   * @return the first occurrence of this affectation in the read
   *         or string::npos if the affectation was not found
   * @post getAffectation(first(affect)) == affect 
   * ==>  getAffectation(1...first(affect)-1) != affect
   */
98
  virtual int first(const KmerAffect &affect) const  = 0;
Mikael Salson's avatar
Mikael Salson committed
99 100 101 102 103 104 105 106

  /**
   * @param affect: an affectation
   * @return the last occurrence of this affectation in the read
   *         or string::npos if the affectation was not found
   * @post getAffectation(last(affect)) == affect 
   * ==> getAffectation(last(affect)+1 ... count() -1) != affect
   */
107
  virtual int last(const KmerAffect &affect) const  = 0;
Mikael Salson's avatar
Mikael Salson committed
108 109 110 111 112 113 114

  /**
   * @return a string representation of the object
   */
  virtual string toString() const  = 0;
};

115 116

class KmerAffectAnalyser: public AffectAnalyser {
117
 protected:
118
  IKmerStore<KmerAffect> &kms;
Mikael Salson's avatar
Mikael Salson committed
119
  const string &seq;
120
  vector<KmerAffect> affectations;
Mikael Salson's avatar
Mikael Salson committed
121 122 123 124 125 126
 public:
  /**
   * @param kms: the index storing the affectation for the k-mers
   *             (parameter is not copied)
   * @param seq: the sequence to analyse (parameter is not copied)
   */
127
  KmerAffectAnalyser(IKmerStore<KmerAffect> &kms, const string &seq);
128 129 130 131 132 133 134 135 136 137

  /**
   * This constructor must be seen as a “toy” constructor, used for 
   * testing.
   * It allows to directly provide the affectation and therefore avoids
   * the need to search a long time for good example that could be tested.
   * @param kms: mainly used for retrieving the seed or its size
   * @param seq: basically not used in the class
   * @param a: the affectation we must use.
   */
138
  KmerAffectAnalyser(IKmerStore<KmerAffect> &kms, const string &seq, vector<KmerAffect> a);
139

Mikael Salson's avatar
Mikael Salson committed
140 141 142 143
  ~KmerAffectAnalyser();

  int count() const;

144
  int count(const KmerAffect &affect) const;
Mikael Salson's avatar
Mikael Salson committed
145

146
  const KmerAffect &getAffectation(int i) const;
Mikael Salson's avatar
Mikael Salson committed
147

148
  vector<KmerAffect> getAllAffectations(affect_options_t options) const;
Mikael Salson's avatar
Mikael Salson committed
149

150
  set<KmerAffect> getDistinctAffectations() const;
Mikael Salson's avatar
Mikael Salson committed
151

152
  /**
153 154
   * @param maxOverlap: if greater than kms.getS(), it is automatically set
   *                    to that value.
155 156 157 158 159 160 161 162 163 164 165 166
   * @return A structure where the maximum is such that those positions
   *         maximise the number of affectations before, minus the number of
   *         affectations after the returned positions.
   *
   *         The maximum reached must be above max(0, total number of
   *         <before>) and such that the number of <before> after the
   *         rightmost max position is <ratioMin> times less than the number
   *         of <after> after that position. If no so much maximum is found,
   *         the boolean <max_found> is set to false in the structure.
   *
   * @complexity time: linear in count(), space: constant
   */
167
  affect_infos getMaximum(const KmerAffect &before, const KmerAffect &after, 
168
                          float ratioMin=2., 
169
                          int maxOverlap=1) const;
170

Mikael Salson's avatar
Mikael Salson committed
171 172
  const string &getSequence() const;

173
  int first(const KmerAffect &affect) const;
Mikael Salson's avatar
Mikael Salson committed
174

175
  int last(const KmerAffect &affect) const ;
Mikael Salson's avatar
Mikael Salson committed
176 177 178 179

  string toString() const;
};

180 181 182 183
/**
 * Class that allows to count in constant time the number of affectations
 * before or after a given point.
 */
184 185

class CountKmerAffectAnalyser: public KmerAffectAnalyser {
186
 private:
187
  map<KmerAffect, int* >counts;
188
  int overlap;
189 190
 public:

191
  CountKmerAffectAnalyser(IKmerStore<KmerAffect> &kms, const string &seq);
192 193
  ~CountKmerAffectAnalyser();

194 195
  int count() const;

196 197 198
  /**
   * @complexity constant time
   */
199
  int count(const KmerAffect &affect) const;
200 201 202 203 204

  /**
   * Count the number of an affectation before (strictly) than a position
   * @complexity constant time
   */
205
  int countBefore(const KmerAffect &affect, int pos) const;
206 207 208 209 210

  /**
   * Count the number of an affectation after (strictly) than a position)
   * @complexity constant time
   */
211
  int countAfter(const KmerAffect &affect, int pos) const;
212 213 214

  /**
   * @return the first position pos in the sequence such that 
215
   *         countBefore(before, pos - s) 
216
             + countAfter(after, pos) is maximal
217
   *         and pos >= start, and the maximum is greater than min; 
218
   *         or -1 if such a position doesn't exist.
219
   *         Where s is kms.getS() - getAllowedOverlap() - 1.
220 221
   * @complexity linear in getSequence().size() 
   */
222
  int firstMax(const KmerAffect &before, const KmerAffect &after, int start=0, int min=-1) const;
223 224 225

  /**
   * @return the last position pos in the sequence such that
226
   *         countBefore(before, pos - s) 
227
   *         + countAfter(after, pos) is maximal
228 229
   *         and pos <= end (if end == -1 considers end of sequence), and the 
   *         maximum is greater than min; or -1 if such a position doesn't exist.
230
   *         Where s is kms.getS() - getAllowedOverlap() - 1.
231 232
   * @complexity linear in getSequence().size()
   */
233
  int lastMax(const KmerAffect &before, const KmerAffect &after, int end=-1, int min=-1) const;
234

235 236 237 238 239 240
  /**
   * @return the allowed overlap between two k-mers with distinct affectations
   * (default is 0)
   */
  int getAllowedOverlap();

241 242 243 244 245 246 247 248 249
  /**
   * @parameter forbidden: a set of forbidden affectations that must not 
   *                       be taken into account for the max computation.
   * @pre There must have at least one affectation that is not forbidden otherwise
   *      the returned value is the unknown affectation.
   * @return the affectation that is seen the most frequently in the sequence
   *         taken apart the forbidden ones.
   * @complexity Time: linear on the number of distinct affectations
   */
250
  KmerAffect max(const set<KmerAffect> forbidden = set<KmerAffect>()) const;
251

252 253 254 255 256 257 258
  /**
   * Set the overlap allowed between two k-mers with two different affectations,
   * when looking for the maximum.
   * The overlap should not be greater than the span of the seed used.
   */
  void setAllowedOverlap(int overlap);

259 260 261 262 263 264 265 266
 private:
  /**
   * Build the counts map.
   */
  void buildCounts();

  /**
   * Search the maximum. Used by firstMax and lastMax.
267
   * @param iter: should be either 1 or -1
268
   */
269
  int searchMax(const KmerAffect &before, const KmerAffect &after,
270
                int start, int end, int iter, int min) const;
271
};
Mikael Salson's avatar
Mikael Salson committed
272 273

#endif