windows.h 7.06 KB
Newer Older
Mathieu Giraud's avatar
Mathieu Giraud committed
1 2 3 4 5 6 7 8 9 10 11
#ifndef WINDOWS_H
#define WINDOWS_H

/** 
 * A window is associated to a list of sequences. We deal with a list of
 * windows. We'd like to sort it, to output it, to remove windows not appearing
 * enough (apart from the ones that are labelled).
 */

#include <iostream>
#include <map>
12
#include <set>
Mathieu Giraud's avatar
Mathieu Giraud committed
13 14
#include <utility>
#include <string>
15
#include "bioreader.hpp"
16
#include "segment.h"
17
#include "germline.h"
18 19
#include "read_storage.h"
#include "read_score.h"
20
#include "representative.h"
21
#include "stats.h"
22
#include "../lib/json_fwd.hpp"
23

24
#define NB_BINS 30
25
#define MAX_VALUE_BINS 500
Mathieu Giraud's avatar
Mathieu Giraud committed
26 27

using namespace std;
Marc Duez's avatar
Marc Duez committed
28
using json = nlohmann::json;
Mathieu Giraud's avatar
Mathieu Giraud committed
29 30 31 32 33

typedef string junction ;

class WindowsStorage {
 private:
34
  map<junction, BinReadStorage > seqs_by_window;
35
  map<junction, vector<int> > status_by_window;
36
  map<junction, Germline* > germline_by_window;
Mathieu Giraud's avatar
Mathieu Giraud committed
37
  map<string, string> windows_labels;
38
  list<pair <junction, size_t> > sort_all_windows;
WebTogz's avatar
WebTogz committed
39
  map<junction, int> id_by_window;
40
  size_t max_reads_per_window;
41
  ReadQualityScore scorer;
42 43 44 45

  /* Parameters for the read storage */
  size_t nb_bins;
  size_t max_value_bins;
Mathieu Giraud's avatar
Mathieu Giraud committed
46 47 48 49 50 51 52 53
 public:
  /**
   * Build an empty storage, with the labels that correspond to specific
   * windows that must be kept. The map<string, string> keys are DNA sequences
   * corresponding to the window, while values are the name of the sequence.
   */
  WindowsStorage(map<string, string> &labels);

54 55 56 57
  /**
   * @return a pointer to the germline of the window
   *         or NULL if the window doesn't exist.
   */
58
  Germline *getGermline(junction window);
59 60 61

  map<junction, BinReadStorage>::iterator begin();
  map<junction, BinReadStorage>::iterator end();
62
  
Marc Duez's avatar
Marc Duez committed
63
  json statusToJson(junction window);
64

65 66 67 68 69
  /**
   * @return the average read length of the reads segmented with the given window
   */
  float getAverageLength(junction window);

70 71 72 73 74
  /**
   * @return the maximal number of reads that can be stored for a window.
   */
  size_t getMaximalNbReadsPerWindow();

75 76 77 78 79
  /**
   * @pre hasWindow(window)
   * @return the total number of reads supporting a window.
   */
  size_t getNbReads(junction window);
80
  
Mathieu Giraud's avatar
Mathieu Giraud committed
81 82 83
  /**
   * @return the list of reads supporting a given window
   */
84
  list<Sequence> getReads(junction window);
Mathieu Giraud's avatar
Mathieu Giraud committed
85 86 87 88 89 90 91 92 93 94

  /**
   * @param window: the window shared by all the sequences
   * @param seed: the seed used for the sequence similarity search
   * @param min_cover: see compute() in RepresentativeComputer
   * @param percent_cover: see compute() in RepresentativeComputer
   * @param nb_sampled: Number of sequences sampled to get the representatives. 
   *                    Sampling sequences allow to have a more time efficient 
   *                    algorithm.
   * @param nb_buckets: Number of buckets for sampling (see SequenceSampler)
95
   * @pre nb_sampled <= getMaximalNbReadsPerWindow() if hasLimitForReadsPerWindow()
96
   * @return the representative computer
Mathieu Giraud's avatar
Mathieu Giraud committed
97
   */
98 99

  KmerRepresentativeComputer getRepresentativeComputer(junction window, string seed, size_t min_cover,
100
                             float percent_cover, size_t nb_sampled);
Mathieu Giraud's avatar
Mathieu Giraud committed
101 102 103 104 105

  /**
   * @return a sample of nb_sampled sequences sharing the same window. The
   *         returned sequences are among the longest ones but are not sorted.
   */
106
  list<Sequence> getSample(junction window, size_t nb_sampled);
Mathieu Giraud's avatar
Mathieu Giraud committed
107

108

109 110 111 112 113 114
  /**
   * @return true iff a limit has been set for the maximal number of reads per
   * window
   */
  bool hasLimitForReadsPerWindow();

115 116 117 118
  /**
   * @return true iff the window has been reported.
   */
  bool hasWindow(junction window);
119

120 121 122 123 124
  /**
   * @return the related label iff the window is contained (or contains) a sequence of interest
   */
  string getLabel(junction window);

125 126 127 128 129
  /**
   * @return true iff the window is contained (or contains) a sequence of interest
   */
  bool isInterestingJunction(junction window);

Mathieu Giraud's avatar
Mathieu Giraud committed
130 131
  /**
   * @return a list of windows together with the number of reads they appear in.
WebTogz's avatar
WebTogz committed
132 133
   * @pre sort() must have been called at least once and must have been called
   *      again after calling keepInterestingWindows()
Mathieu Giraud's avatar
Mathieu Giraud committed
134
   */
135
  list<pair <junction, size_t> > &getSortedList();
Mathieu Giraud's avatar
Mathieu Giraud committed
136 137 138 139 140 141

  /**
   * The number of windows stored
   */
  size_t size();

WebTogz's avatar
WebTogz committed
142 143 144 145 146
  /**
   * @return the id of the window, by his sequence
   */
  int getId(junction window);

147 148 149 150 151 152 153 154 155
  /**
   * Sets the parameters of the bins used for storing the reads.
   * @param nb: Number of bins (>= 0)
   * @param max_value: maximal value to be stored (>= 0).
   *        Any value greater than max_value will be put
   *        in an additional bin.
   */
  void setBinParameters(size_t nb, size_t max_value);

WebTogz's avatar
WebTogz committed
156 157 158 159 160
  /**
   * Give an id to all the windows, in id_by_window map
   */
  void setIdToAll();

161 162 163 164 165 166 167 168 169 170 171 172 173 174 175
  /**
   * For each window the maximal number of reads actually stored is
   * max_reads. This applies only to future reads added. Not to reads that
   * have been previously added.  In other words if for some window w,
   * getReads(w).size() > max_reads, no reads will be removed. However no
   * reads will be added for that window.  getNbReads() still returns the real
   * number of reads for a given window, not the number of reads stored for a
   * window.
   * When the limit is reached the better reads are preferred over the less good
   * therefore reads may be replaced so that the list contains the best ones.
   * @param max_reads: Maximal number of reads stored for a window. 
   *                   ~0 for no limit.
   */
  void setMaximalNbReadsPerWindow(size_t max_reads);

Mathieu Giraud's avatar
Mathieu Giraud committed
176
  /**
177 178 179 180 181
   * Add a new window with its sequence.
   * @param window: the window to add
   * @param sequence: the corresponding Sequence
   * @param status: the segmentation status
   * @param germline: the germline where this sequence has been segmented
Mathieu Giraud's avatar
Mathieu Giraud committed
182
   */
183
  void add(junction window, Sequence sequence, int status, Germline *germline,
184
           list<int> extra_statuses = list<int>{});
Mathieu Giraud's avatar
Mathieu Giraud committed
185

186 187 188 189 190 191
  /**
   * @pre should be called before keepInterestingWindows()
   * Compute, display, and return some diversity measures
   */
  json computeDiversity(int nb_segmented);

192 193 194 195 196 197 198 199 200 201
  /**
   * @pre sort() must have been called.
   * @param top: Only the germlines of the top most abundant windows will
   *             be considered
   * @param min_reads: (optional) minimal number (inclusive) of reads the window
   *                   must be supported by.
   * @return a set of the most abundant germlines.
   */
  set<Germline *> getTopGermlines(size_t top, size_t min_reads=1);

Mathieu Giraud's avatar
Mathieu Giraud committed
202 203 204 205 206 207 208
  /**
   * Only keep windows that are interesting.  Those windows are windows
   * supported by at least min_reads_window reads as well as windows that are
   * labelled.  
   * @return the number of windows removed as well as the number of
   * reads finally kept.
   */
209
  pair<int, size_t> keepInterestingWindows(size_t min_reads_window);
Mathieu Giraud's avatar
Mathieu Giraud committed
210 211 212 213 214 215 216 217 218 219

  /**
   * sort windows according to the number of reads they appear in
   */
  void sort();

  /**
   * Print the windows from the most abundant to the least abundant
   */ 
  ostream &printSortedWindows(ostream &os);
Marc Duez's avatar
Marc Duez committed
220 221
  
  json sortedWindowsToJson(map<junction, json> json_data_segment);
Mathieu Giraud's avatar
Mathieu Giraud committed
222 223 224 225 226 227 228 229

  /**
   * Display a window with its in size in a somewhat FASTA format
   */
  ostream &windowToStream(ostream &os, junction window, int num_seq, size_t size);
};

#endif