kmerstore.h 13.4 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1 2 3 4 5 6 7 8
#ifndef KMERSTORE_H
#define KMERSTORE_H

#include <map>
#include <string>
#include <list>
#include <stdexcept>
#include <stdint.h>
9
#include <math.h>
Mikaël Salson's avatar
Mikaël Salson committed
10 11 12 13 14 15 16 17 18 19
#include "fasta.h"
#include "tools.h"

using namespace std;

class Kmer {
public:
  unsigned int count;

  Kmer();
20 21 22 23

  /**
   * This constructor is used via a IKmerStore<Kmer> index (hence the argument list)       
   */
24
  Kmer(const string &label, int strand=1);
25

Mikaël Salson's avatar
Mikaël Salson committed
26 27
  Kmer &operator+=(const Kmer &);
  static bool hasRevcompSymetry();
28 29 30 31 32

  /**
   * @return true if the element is the same as when initialised with default constructor.
   */
  bool isNull();
Mikaël Salson's avatar
Mikaël Salson committed
33 34 35 36 37 38 39 40 41 42 43 44 45
} ;
ostream &operator<<(ostream &os, const Kmer &kmer);


/* K-mer indexing */

template <class T> class IKmerStore
{
protected:
  bool revcomp_indexed;
  int k; // weight of the seed
  int s; // span of the seed (s >= k)
  string seed ;
46
  size_t nb_kmers_inserted;
47
  int max_size_indexing;
Mikaël Salson's avatar
Mikaël Salson committed
48 49 50 51 52

public:

  virtual ~IKmerStore();

53 54
  static int last_id;
  int id; // id of this index
55
  int refs; // number of germlines using this index
56

57
  list< pair <T, Fasta> > labels;
58

Mikaël Salson's avatar
Mikaël Salson committed
59 60 61
  /**
   * @param input: A single FASTA file
   * @param label: label that must be associated to the given files
62
   * @post All the sequences in the FASTA files have been indexed, and the label is stored in the list of labels
Mikaël Salson's avatar
Mikaël Salson committed
63
   */
64
  void insert(Fasta& input, const string& label="", int keep_only = 0);
Mikaël Salson's avatar
Mikaël Salson committed
65 66 67 68

  /**
   * @param input: A list of FASTA files
   * @param label: label that must be associated to the given files
69
   * @post All the sequences in the FASTA files have been indexed, and the label is stored in the list of labels
Mikaël Salson's avatar
Mikaël Salson committed
70
   */
71
  void insert(list<Fasta>& input, const string& label="", int keep_only = 0);
Mikaël Salson's avatar
Mikaël Salson committed
72 73 74 75
  
  /**
   * @param input: A sequence to be cut in k-mers
   * @param label: label that must be associated to the given files
76 77 78 79
   * @param keep_only: if > 0 will keep at most the last keep_only nucleotides
   *                   of the sequence. if < 0 will keep at most the first
   *                   keep_only nucleotides of the sequence. if == 0,
   *                   will keep all the sequence.
Mikaël Salson's avatar
Mikaël Salson committed
80 81
   * @post All the k-mers in the sequence have been indexed.
   */
82
  void insert(const seqtype &sequence,
83
              const string &label,
84
              bool ignore_extended_nucleotides=true, int keep_only = 0);
Mikaël Salson's avatar
Mikaël Salson committed
85 86 87 88 89

  /**
   * @param word: a k-mer
   * @return the value for the kmer as stored in the structure (don't do revcomp)
   */ 
90
  virtual T& get(seqtype &word) = 0;
Mikaël Salson's avatar
Mikaël Salson committed
91

92 93 94
  /**
   * @return the percentage of kmers that are set in the index
   */
95
  float getIndexLoad(T kmer) const;
96 97
  float getIndexLoad() const;

98 99 100 101 102 103
  /**
   * @return the given integer (size of a read),
   *         but limit this size to max_size_indexing if it was defined
   */
  int atMostMaxSizeIndexing(int n) const;

104 105 106
  /**
   * @return probability that the number of kmers is 'at_least' or more in a sequence of length 'length'
   */
107
  double getProbabilityAtLeastOrAbove(T kmer, int at_least, int length) const;
108

Mikaël Salson's avatar
Mikaël Salson committed
109 110 111 112 113 114 115 116 117 118
  /**
   * @return the value of k
   */
  int getK() const;

  /**
   * @return the value of s
   */
  int getS() const;

119 120 121 122 123
  /**
   * @return the seed used
   */
  string getSeed() const;

124 125 126 127
  /**
   * @param kmer: a kmer
   * @return one label associated with the kmer
   */
128
  Fasta getLabel(T kmer) const;
129

Mikaël Salson's avatar
Mikaël Salson committed
130 131 132 133 134 135 136
  /**
   * @param seq: a sequence
   * @param no_revcomp: force not to revcomp the sequence, even if
   *                    the index was built with revcomp.
   * @return a vector of length seq.length() - getK() + 1 containing
   * for each k-mer the corresponding value in the index.
   */
137
  virtual vector<T> getResults(const seqtype &seq, bool no_revcomp=false) = 0;
Mikaël Salson's avatar
Mikaël Salson committed
138 139 140 141 142 143 144 145 146 147 148 149

  /**
   * @return true iff the revcomp is indexed
   */
  bool isRevcomp() const;

  /**
   * @param string: a k-mer
   * @return the value in the index associated to the given k-mer.
   *         isRevcomp() ==> the revcomp of the k-mer may be considered
   * @pre word.length() == this->k
   */
150
  virtual T& operator[](seqtype& word) = 0;
Mikaël Salson's avatar
Mikaël Salson committed
151 152
};

153
template<class T> int IKmerStore<T>::last_id = 0;
Mikaël Salson's avatar
Mikaël Salson committed
154 155 156 157 158

template <class T> 
class MapKmerStore : public IKmerStore<T>
{
public:	
159
  map<seqtype, T> store;
Mikaël Salson's avatar
Mikaël Salson committed
160 161 162 163 164 165 166

  /**
   * @param bool: if true, will also index revcomp
   * @param seed: the seed
   */
  MapKmerStore(string seed, bool=false);
  MapKmerStore(int k, bool=false);
167 168

  vector<T> getResults(const seqtype &seq, bool no_revcomp=false);
169 170
  T& get(seqtype &word);
  T& operator[](seqtype & word);
171 172 173

 private:
  void init();
Mikaël Salson's avatar
Mikaël Salson committed
174 175 176 177 178 179 180
};

template <class T> 
class ArrayKmerStore : public IKmerStore<T>
{
  T* store;
	
181
  int index(const seqtype& word) const;
182 183 184 185 186 187 188 189

  /**
   * Allocates memory for store
   * @throws bad_alloc: when there is not enough memory or when the
   *                    the total number of kmers cannot be stored in an 
   *                    unsigned int
   */
  void init();
Mikaël Salson's avatar
Mikaël Salson committed
190 191 192 193 194 195 196 197
public:
  /**
   * @param bool: if true, will also index revcomp
   * @param seed: the seed
   */
  ArrayKmerStore(string seed, bool=false);
  ArrayKmerStore(int k, bool=false);
  ~ArrayKmerStore();
198 199

  vector<T> getResults(const seqtype &seq, bool no_revcomp=false);	
200 201
  T& get(seqtype &word);
  T& operator[](seqtype & word);
202
  T& operator[](int word);
Mikaël Salson's avatar
Mikaël Salson committed
203 204 205 206 207 208 209 210 211 212
};


// IKmerStore

template<class T>
IKmerStore<T>::~IKmerStore(){}

template<class T> 
void IKmerStore<T>::insert(list<Fasta>& input,
213 214
                           const string &label,
                           int keep_only){
Mikaël Salson's avatar
Mikaël Salson committed
215
  for(list<Fasta>::iterator it = input.begin() ; it != input.end() ; it++){
216
    insert(*it, label, keep_only);
Mikaël Salson's avatar
Mikaël Salson committed
217 218 219 220 221
  }
}

template<class T> 
void IKmerStore<T>::insert(Fasta& input,
222 223
                           const string &label,
                           int keep_only){
Mikaël Salson's avatar
Mikaël Salson committed
224
  for (int r = 0; r < input.size(); r++) {
225
    insert(input.sequence(r), label, true, keep_only);
Mikaël Salson's avatar
Mikaël Salson committed
226
  }
227

228
  labels.push_back(make_pair(T(label, 1), input)) ;
229 230

  if (revcomp_indexed  && ! T::hasRevcompSymetry()) {
231
    labels.push_back(make_pair(T(label, -1), input)) ;
232
  }
Mikaël Salson's avatar
Mikaël Salson committed
233 234 235
}

template<class T> 
236
void IKmerStore<T>::insert(const seqtype &sequence,
237
                           const string &label,
238 239 240 241 242 243 244 245 246
                           bool ignore_extended_nucleotides,
                           int keep_only){
  size_t start_indexing = 0;
  size_t end_indexing = sequence.length();
  if (keep_only > 0 && sequence.length() > (size_t)keep_only) {
    start_indexing = sequence.length() - keep_only;
  } else if (keep_only < 0 && sequence.length() > (size_t) -keep_only) {
    end_indexing = -keep_only;
  }
247 248 249 250 251 252

  size_t size_indexing = end_indexing - start_indexing;
  if (size_indexing > max_size_indexing) {
    max_size_indexing = size_indexing;
  }

253
  for(size_t i = start_indexing ; i + s < end_indexing + 1 ; i++) {
254 255
    seqtype substr = sequence.substr(i, s);
    seqtype kmer = spaced(substr, seed);
256

257
    if (ignore_extended_nucleotides && has_extended_nucleotides(kmer))
258 259
      continue;

Mikaël Salson's avatar
Mikaël Salson committed
260 261
    int strand = 1;
    if (revcomp_indexed && T::hasRevcompSymetry()) {
262
      seqtype rc_kmer = revcomp(kmer);
Mikaël Salson's avatar
Mikaël Salson committed
263 264 265 266 267
      if (rc_kmer.compare(kmer) < 0) {
        strand = -1;
        kmer = rc_kmer;
      }
    }
268 269 270 271 272
    T &this_kmer = this->get(kmer);
    if (this_kmer.isNull()) {
      nb_kmers_inserted++;
    }
    this_kmer += T(label, strand);
Mikaël Salson's avatar
Mikaël Salson committed
273
    if (revcomp_indexed && ! T::hasRevcompSymetry()) {
274
      seqtype rc_kmer = spaced(revcomp(substr), seed);
275 276 277 278
      T &this_rc_kmer = this->get(rc_kmer);
      if (this_rc_kmer.isNull())
        nb_kmers_inserted++;
      this_rc_kmer += T(label, -1);
Mikaël Salson's avatar
Mikaël Salson committed
279 280 281
    }
  }
}
282

283 284
template<class T>
float IKmerStore<T>::getIndexLoad(const T kmer) const {
285
  return kmer.isUnknown() ? 1 - getIndexLoad() : getIndexLoad();
286
}
287 288
template<class T>
float IKmerStore<T>::getIndexLoad() const {
289
  return nb_kmers_inserted / pow(4.0, k);
290
}
Mikaël Salson's avatar
Mikaël Salson committed
291

292 293 294 295 296 297 298
template<class T>
int IKmerStore<T>::atMostMaxSizeIndexing(int n) const {
  if (!max_size_indexing || n < max_size_indexing)
    return n ;

  return max_size_indexing ;
}
299 300

template<class T>
301
double IKmerStore<T>::getProbabilityAtLeastOrAbove(const T kmer, int at_least, int length) const {
302 303 304

  // n: number of kmers in the sequence
  int n = length - getS() + 1;
305
  float index_load = getIndexLoad(kmer) ;
306 307 308 309 310 311 312 313 314 315 316 317 318 319 320

  double proba = 0;
  double probability_having_system = pow(index_load, at_least);
  double probability_not_having_system = pow(1 - index_load, n - at_least);
  for (int i=at_least; i<=n; i++) {
    proba += nChoosek(n, i) * probability_having_system * probability_not_having_system;
    probability_having_system *= index_load;
    probability_not_having_system /= (1 - index_load);
  }

  return proba;
}



Mikaël Salson's avatar
Mikaël Salson committed
321 322 323 324 325 326 327 328 329 330
template<class T>
int IKmerStore<T>::getK() const {
  return k;
}

template<class T>
int IKmerStore<T>::getS() const {
  return s;
}

331 332 333 334 335
template<class T>
string IKmerStore<T>::getSeed() const {
  return seed;
}

336
template<class T>
337 338
Fasta IKmerStore<T>::getLabel(T kmer) const {
  for (typename list< pair<T, Fasta> >::const_iterator it = labels.begin(); it != labels.end(); ++it)
339 340
    if (it->first == kmer)
      return it->second ;
341
  return FASTA_AMBIGUOUS ;
342
}
343 344

// .getResults()
Mikaël Salson's avatar
Mikaël Salson committed
345
template<class T>
346 347 348 349
vector<T> MapKmerStore<T>::getResults(const seqtype &seq, bool no_revcomp) {

  int s = IKmerStore<T>::getS();

Mikaël Salson's avatar
Mikaël Salson committed
350 351 352 353 354
  if ((int)seq.length() < s - 1) {
    return vector<T>(0);
  }
  vector<T> result(seq.length() - s + 1);
  for (size_t i=0; i + s < seq.length() + 1; i++) {
355
    seqtype kmer = spaced(seq.substr(i, s), IKmerStore<T>::seed);
356
    //    seqtype kmer = seq.substr(i, s);
Mikaël Salson's avatar
Mikaël Salson committed
357
    // cout << kmer << endl << kmer0 << endl << endl ;
358
    if (IKmerStore<T>::revcomp_indexed && no_revcomp) {
Mikaël Salson's avatar
Mikaël Salson committed
359 360 361 362 363 364 365 366
      result[i] = get(kmer);
    } else {
      result[i] = (*this)[kmer];
    }
  }
  return result;
}

367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382
template<class T>
vector<T> ArrayKmerStore<T>::getResults(const seqtype &seq, bool no_revcomp) {
  
  int s = IKmerStore<T>::getS();

  int N = (int)seq.length();

  if (N < s - 1) {
    return vector<T>(0);
  }
  vector<T> result(N - s + 1);

  /* Read once the sequence, convert it to int* */
  int* intseq = new int[N];
  for (int i=0; i<N; i++)
    {
383
      intseq[i] = nuc_to_int(seq[i]);
384 385 386 387 388 389 390 391 392
    }

  /* Compute results */
  for (size_t i=0; (int) i+s < N+1; i++) {
    int kmer = spaced_int(intseq + i, IKmerStore<T>::seed);
    if (IKmerStore<T>::revcomp_indexed && no_revcomp) {
      result[i] = store[kmer]; // getfromint(kmer); // store[kmer];
      // cout << i << "/" << N << "  " << kmer << result[i] << endl ;
    } else {
393
      result[i] = (*this)[kmer]; // Deals with revcomp
394 395 396 397 398 399 400
    }
  }

  delete[] intseq ;
  return result;
}

Mikaël Salson's avatar
Mikaël Salson committed
401 402 403 404 405 406 407 408 409 410 411 412 413 414 415

template<class T>
bool IKmerStore<T>::isRevcomp() const {
  return revcomp_indexed;
}

// MapKmerStore

template <class T>
MapKmerStore<T>::MapKmerStore(string seed, bool revcomp){
  this->seed = seed;   
  int k = seed_weight(seed);
  this->k = k;
  this->s = seed.size();
  this->revcomp_indexed = revcomp;
416
  init();
Mikaël Salson's avatar
Mikaël Salson committed
417 418 419 420 421 422 423 424
}

template <class T>
MapKmerStore<T>::MapKmerStore(int k, bool revcomp){
  this->seed = seed_contiguous(k);
  this->k = k;
  this->s = k;
  this->revcomp_indexed = revcomp;
425 426 427 428 429 430
  init();
}

template <class T>
void MapKmerStore<T>::init() {
  this->nb_kmers_inserted = 0;
431
  this->max_size_indexing = 0;
Mikaël Salson's avatar
Mikaël Salson committed
432 433 434
}

template <class T> 
435
T& MapKmerStore<T>::get(seqtype& word){
Mikaël Salson's avatar
Mikaël Salson committed
436 437 438 439
  return store[word];
}

template <class T> 
440
T& MapKmerStore<T>::operator[](seqtype& word){
Mikaël Salson's avatar
Mikaël Salson committed
441
  if (this->isRevcomp() && T::hasRevcompSymetry()) {
442
    seqtype rc_kmer = revcomp(word);
Mikaël Salson's avatar
Mikaël Salson committed
443 444 445 446 447 448 449 450 451
    if (rc_kmer.compare(word) < 0)
      word = rc_kmer;
  }
  return store[word];
}

// ArrayKmerStore

template <class T> 
452
ArrayKmerStore<T>::ArrayKmerStore(int k, bool revcomp) {
Mikaël Salson's avatar
Mikaël Salson committed
453 454 455 456
  this->seed = seed_contiguous(k);
  this->k = k;
  this->s = k;
  this->revcomp_indexed = revcomp;
457
  init();
Mikaël Salson's avatar
Mikaël Salson committed
458 459 460 461 462 463 464 465 466 467
}


template <class T> 
ArrayKmerStore<T>::ArrayKmerStore(string seed, bool revcomp){
  this->seed = seed; 
  int k = seed_weight(seed);
  this->k = k;
  this->s = seed.size();
  this->revcomp_indexed = revcomp;
468 469 470 471 472
  init();
}

template <class T>
void ArrayKmerStore<T>::init() {
473
  this->nb_kmers_inserted = 0;
474
  this->max_size_indexing = 0;
475
  if ((size_t)(this->k << 1) >= sizeof(int) * 8)
Mikaël Salson's avatar
Mikaël Salson committed
476
    throw std::bad_alloc();
477
  store = new T[(unsigned int)1 << (this->k << 1)];
Mikaël Salson's avatar
Mikaël Salson committed
478 479 480 481 482 483 484 485 486 487 488 489 490 491
  if (! store)
    throw std::bad_alloc();
}

template <class T> 
ArrayKmerStore<T>::~ArrayKmerStore(){
  delete [] store;
}

/**	Returns index of word in an array of size 4^k
	Considering B(A) = 0, B(C) = 1, B(T) = 2, B(G) = 3
	index_word = \sum_{i=0}^{k-1}B(word[i])*4^(k-i-1)
**/
template <class T> 
492
int ArrayKmerStore<T>::index(const seqtype& word) const{
Mikaël Salson's avatar
Mikaël Salson committed
493
  return dna_to_int(word, this->k);
Mikaël Salson's avatar
Mikaël Salson committed
494 495 496
}

template <class T> 
497
T& ArrayKmerStore<T>::get(seqtype& word){
Mikaël Salson's avatar
Mikaël Salson committed
498 499 500 501
  return store[index(word)];
}

template <class T> 
502
T& ArrayKmerStore<T>::operator[](seqtype& word){
503 504 505 506 507
  return (*this)[index(word)];
}

template <class T> 
T& ArrayKmerStore<T>::operator[](int word){
Mikaël Salson's avatar
Mikaël Salson committed
508
  if (this->isRevcomp() && T::hasRevcompSymetry()) {
509 510
    int rc_kmer = revcomp_int(word, IKmerStore<T>::k);
    if (rc_kmer < word)
Mikaël Salson's avatar
Mikaël Salson committed
511 512
      word = rc_kmer;
  }
513
  return store[word];
Mikaël Salson's avatar
Mikaël Salson committed
514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534
}


/**
 * KmerStoreFactory is a factory that allows to create an index that best fits
 * your needs!
 */
class KmerStoreFactory {
 public:
  template<class T>
    static IKmerStore<T> *createIndex(string seed, bool revcomp=false);
  template<class T>
    static IKmerStore<T> *createIndex(int k, bool revcomp=false);
};

template<class T>
IKmerStore<T> *KmerStoreFactory::createIndex(string seed, bool revcomp) {
  IKmerStore<T> *index;
  try{
    index = new ArrayKmerStore<T>(seed, revcomp);
  }catch(exception e){
535
    cout << "  (using a MapKmer to fit into memory)" << endl;
Mikaël Salson's avatar
Mikaël Salson committed
536 537
    index = new MapKmerStore<T>(seed, revcomp);
  }
538 539 540

  index->id = ++IKmerStore<T>::last_id;

Mikaël Salson's avatar
Mikaël Salson committed
541 542 543 544 545 546 547 548 549
  return index;
}

template<class T>
IKmerStore<T> *KmerStoreFactory::createIndex(int k, bool revcomp) {
  return createIndex<T>(seed_contiguous(k), revcomp);
}

#endif