affectanalyser.h 10.7 KB
Newer Older
1

Mikael Salson's avatar
Mikael Salson committed
2 3 4 5
#ifndef AFFECT_ANALYSER_H
#define AFFECT_ANALYSER_H

#include "kmerstore.h"
6
#include "kmeraffect.h"
Mikael Salson's avatar
Mikael Salson committed
7 8 9
#include <set>
#include <vector>
#include <cassert>
10
#include <map>
11 12
#include <iostream>
#include <iomanip>
Mikael Salson's avatar
Mikael Salson committed
13

14 15
#define NO_MINIMIZING_POSITION -1

Mikael Salson's avatar
Mikael Salson committed
16 17 18 19 20 21 22 23
// Define two constant affectations: ambiguous and unknown.

/* Declaration of types */

typedef enum affect_options_e {
  AO_NONE, AO_NO_CONSECUTIVE, AO_NO_MULTIPLICITY
} affect_options_t;

24

25 26 27 28
/*
   Stores results during .getMaximum() computation
   See examples in tests/unit-tests/testAffectAnalyser.cpp:testGetMaximum()
 */
29
typedef struct affect_infos_s {
30 31

  // Position of the plateau
32 33
  int first_pos_max;            /* First position of maximum */
  int last_pos_max;             /* Last position of maximum */
34 35

  // Value on the plateau
36
  int max_value;                /* Maximal value */
37 38

  // Number of complete affectations at both sides of the plateau
39 40 41 42
  int nb_before_right;          /* Nb of “before” right of the maximum */
  int nb_after_right;           /* Same with “after” */
  int nb_before_left;           /* Nb of “before” left of the maximum */
  int nb_after_left;            /* Same with “after” */
43

44 45 46
  bool max_found;               /* We have found a maximum */
} affect_infos;

47
bool operator==(const affect_infos &ai1, const affect_infos &ai2);
48
ostream &operator<<(ostream &out, const affect_infos &a);
49

Mikael Salson's avatar
Mikael Salson committed
50 51 52 53 54 55 56 57 58 59 60
/**
 * Class that records for every k-mer of a given sequence
 * in which sequences this k-mer was also seen.
 * It can either record one affectation per kmer (the only sequence where it
 * occurs or ambiguous case if there are several possibilities, or 
 * unknown otherwise), or all the possible affectations (ie. there is no ambiguous
 * case, all the possibilities for one k-mer are stored).
 *
 * Input: Index that constitutes the k-mer sequence repertoire
 * Input: Sequence whose k-mers must be affected
 */
61

Mikael Salson's avatar
Mikael Salson committed
62 63 64
class AffectAnalyser {
 public:

65 66
  virtual ~AffectAnalyser() {}

Mikael Salson's avatar
Mikael Salson committed
67 68 69 70 71 72 73 74 75 76 77
  /* Queries */

  /**
   * @return the total number of affectations
   */
  virtual int count() const = 0;

  /**
   * @param affect: An affectation
   * @return the number of times this affectation has been given in the read.
   */
78
  virtual int count(const KmerAffect &affect) const = 0;
Mikael Salson's avatar
Mikael Salson committed
79 80 81 82 83 84

  /**
   * @param i: the position to consider
   * @pre i >= 0 && i < count()
   * @return the affectation of the k-mer at position i.
   */
85
  virtual const KmerAffect &getAffectation(int i)  const = 0;
Mikael Salson's avatar
Mikael Salson committed
86 87 88 89 90 91 92 93

  /**
   * @param options: options can either be AO_NONE or AO_NO_CONSECUTIVE 
   * @return all the affectations contained in the read from left to right.
   *         if AO_NO_CONSECUTIVE is given: two consecutive elements in the vector
   *                                     will be different (we remove consecutive
   *                                     duplicates)
   */
94
  virtual vector<KmerAffect> getAllAffectations(affect_options_t options) const = 0;
Mikael Salson's avatar
Mikael Salson committed
95 96 97 98

  /**
   * @return the distinct affectations
   */
99
  virtual set<KmerAffect> getDistinctAffectations() const = 0;
Mikael Salson's avatar
Mikael Salson committed
100 101 102 103 104 105 106 107 108 109 110 111 112

  /**
   * @return the sequence we are analysing
   */
  virtual const string &getSequence() const = 0;

  /**
   * @param affect: an affectation
   * @return the first occurrence of this affectation in the read
   *         or string::npos if the affectation was not found
   * @post getAffectation(first(affect)) == affect 
   * ==>  getAffectation(1...first(affect)-1) != affect
   */
113
  virtual int first(const KmerAffect &affect) const  = 0;
Mikael Salson's avatar
Mikael Salson committed
114 115 116 117 118 119 120 121

  /**
   * @param affect: an affectation
   * @return the last occurrence of this affectation in the read
   *         or string::npos if the affectation was not found
   * @post getAffectation(last(affect)) == affect 
   * ==> getAffectation(last(affect)+1 ... count() -1) != affect
   */
122
  virtual int last(const KmerAffect &affect) const  = 0;
Mikael Salson's avatar
Mikael Salson committed
123 124 125 126 127 128 129

  /**
   * @return a string representation of the object
   */
  virtual string toString() const  = 0;
};

130 131

class KmerAffectAnalyser: public AffectAnalyser {
132
 protected:
133
  IKmerStore<KmerAffect> &kms;
Mikael Salson's avatar
Mikael Salson committed
134
  const string &seq;
135
  vector<KmerAffect> affectations;
136
  double left_evalue, right_evalue;
137

Mikael Salson's avatar
Mikael Salson committed
138 139 140 141 142 143
 public:
  /**
   * @param kms: the index storing the affectation for the k-mers
   *             (parameter is not copied)
   * @param seq: the sequence to analyse (parameter is not copied)
   */
144
  KmerAffectAnalyser(IKmerStore<KmerAffect> &kms, const string &seq);
145 146 147 148 149 150 151 152 153 154

  /**
   * This constructor must be seen as a “toy” constructor, used for 
   * testing.
   * It allows to directly provide the affectation and therefore avoids
   * the need to search a long time for good example that could be tested.
   * @param kms: mainly used for retrieving the seed or its size
   * @param seq: basically not used in the class
   * @param a: the affectation we must use.
   */
155
  KmerAffectAnalyser(IKmerStore<KmerAffect> &kms, const string &seq, vector<KmerAffect> a);
156

Mikael Salson's avatar
Mikael Salson committed
157 158 159 160
  ~KmerAffectAnalyser();

  int count() const;

161
  int count(const KmerAffect &affect) const;
Mikael Salson's avatar
Mikael Salson committed
162

163
  /**
164 165
   * @return a minimizing position on the matching affectations (taking into account the next affectations),
   *         or NO_MINIMIZING_POSITION if there is no such position
166 167 168
   * @param affect: affectation to match
   * @param margin: number of positions at both ends in which one does not take into account the matching affectation
   * @param width: number of affectations to consider, starting from the matching affectation, for the minimization
169
   */
170
  int minimize(const KmerAffect &affect, int margin, int width) const;
171

172
  const KmerAffect &getAffectation(int i) const;
Mikael Salson's avatar
Mikael Salson committed
173

174
  vector<KmerAffect> getAllAffectations(affect_options_t options) const;
Mikael Salson's avatar
Mikael Salson committed
175

176
  set<KmerAffect> getDistinctAffectations() const;
Mikael Salson's avatar
Mikael Salson committed
177

178 179
  IKmerStore<KmerAffect> &getIndex() const;

180
  /**
181 182
   * @param maxOverlap: if greater than kms.getS(), it is automatically set
   *                    to that value.
183 184 185 186
   * @return A structure where the maximum is such that those positions
   *         maximise the number of affectations before, minus the number of
   *         affectations after the returned positions.
   *
187 188
   *         The maximum reached must be
   *         such that the numbers of <before>/<after> in "good" positions
189 190
   *         (at the left of the leftmost max position for <before>,
   *         and at the right of the rightmost max position for <after>)
191
   *         are more than <ratioMin> times than the numbers of <before>/<after>
192
   *         in "bad" positions. If no so much maximum is found,
193 194 195 196
   *         the boolean <max_found> is set to false in the structure.
   *
   * @complexity time: linear in count(), space: constant
   */
197
  affect_infos getMaximum(const KmerAffect &before, const KmerAffect &after, 
198
                          float ratioMin=1.9,
199
                          int maxOverlap=1);
200

201 202 203 204

  /**
   * @return probability that the number of kmers is 'at_least' or more
   */
205
  double getProbabilityAtLeastOrAbove(const KmerAffect &kmer, int at_least) const;
206

207 208 209 210 211
  /**
   * @return probabilities that the number of left/right kmers is 'at_least' or more
   */
  pair <double, double> getLeftRightProbabilityAtLeastOrAbove() const;

Mikael Salson's avatar
Mikael Salson committed
212 213
  const string &getSequence() const;

214 215 216 217 218 219 220
  /**
   * @param  A pair of KmerAffects
   * @return The same pair of KmerAffects, but sorted.
   *         The first one is 'more on the left' than the second one.
   */
  pair <KmerAffect, KmerAffect> sortLeftRight(const pair <KmerAffect, KmerAffect> ka12) const;

221
  int first(const KmerAffect &affect) const;
Mikael Salson's avatar
Mikael Salson committed
222

223
  int last(const KmerAffect &affect) const ;
Mikael Salson's avatar
Mikael Salson committed
224 225

  string toString() const;
226 227 228 229 230

  string toStringValues() const;

  string toStringSigns() const;

Mikael Salson's avatar
Mikael Salson committed
231 232
};

233 234 235 236
/**
 * Class that allows to count in constant time the number of affectations
 * before or after a given point.
 */
237 238

class CountKmerAffectAnalyser: public KmerAffectAnalyser {
239
 private:
240
  map<KmerAffect, int* >counts;
241
  int overlap;
242 243
 public:

244
  CountKmerAffectAnalyser(IKmerStore<KmerAffect> &kms, const string &seq);
245 246
  ~CountKmerAffectAnalyser();

247 248
  int count() const;

249 250 251
  /**
   * @complexity constant time
   */
252
  int count(const KmerAffect &affect) const;
253 254 255 256 257

  /**
   * Count the number of an affectation before (strictly) than a position
   * @complexity constant time
   */
258
  int countBefore(const KmerAffect &affect, int pos) const;
259 260 261 262 263

  /**
   * Count the number of an affectation after (strictly) than a position)
   * @complexity constant time
   */
264
  int countAfter(const KmerAffect &affect, int pos) const;
265 266 267

  /**
   * @return the first position pos in the sequence such that 
268
   *         countBefore(before, pos - s) 
269
             + countAfter(after, pos) is maximal
270
   *         and pos >= start, and the maximum is greater than min; 
271
   *         or -1 if such a position doesn't exist.
272
   *         Where s is kms.getS() - getAllowedOverlap() - 1.
273 274
   * @complexity linear in getSequence().size() 
   */
275
  int firstMax(const KmerAffect &before, const KmerAffect &after, int start=0, int min=-1) const;
276 277 278

  /**
   * @return the last position pos in the sequence such that
279
   *         countBefore(before, pos - s) 
280
   *         + countAfter(after, pos) is maximal
281 282
   *         and pos <= end (if end == -1 considers end of sequence), and the 
   *         maximum is greater than min; or -1 if such a position doesn't exist.
283
   *         Where s is kms.getS() - getAllowedOverlap() - 1.
284 285
   * @complexity linear in getSequence().size()
   */
286
  int lastMax(const KmerAffect &before, const KmerAffect &after, int end=-1, int min=-1) const;
287

288 289 290 291 292 293
  /**
   * @return the allowed overlap between two k-mers with distinct affectations
   * (default is 0)
   */
  int getAllowedOverlap();

294 295 296 297 298 299 300 301 302
  /**
   * @parameter forbidden: a set of forbidden affectations that must not 
   *                       be taken into account for the max computation.
   * @pre There must have at least one affectation that is not forbidden otherwise
   *      the returned value is the unknown affectation.
   * @return the affectation that is seen the most frequently in the sequence
   *         taken apart the forbidden ones.
   * @complexity Time: linear on the number of distinct affectations
   */
303
  KmerAffect max(const set<KmerAffect> forbidden = set<KmerAffect>()) const;
304

305 306 307 308 309 310
  /*
   * @return the two affectations that are seen the most frequently in the sequence
   *         taken apart the forbidden ones.
   */
  pair <KmerAffect, KmerAffect> max12(const set<KmerAffect> forbidden) const;
  
311 312 313 314 315 316 317
  /**
   * Set the overlap allowed between two k-mers with two different affectations,
   * when looking for the maximum.
   * The overlap should not be greater than the span of the seed used.
   */
  void setAllowedOverlap(int overlap);

318 319 320 321 322 323 324 325
 private:
  /**
   * Build the counts map.
   */
  void buildCounts();

  /**
   * Search the maximum. Used by firstMax and lastMax.
326
   * @param iter: should be either 1 or -1
327
   */
328
  int searchMax(const KmerAffect &before, const KmerAffect &after,
329
                int start, int end, int iter, int min) const;
330
};
Mikael Salson's avatar
Mikael Salson committed
331 332

#endif