windows.cpp 9.45 KB
Newer Older
1 2
#include <iostream>
#include <string>
Mathieu Giraud's avatar
Mathieu Giraud committed
3
#include "tools.h"
4
#include "../lib/json.hpp"
Mathieu Giraud's avatar
Mathieu Giraud committed
5 6
#include "windows.h"
#include "representative.h"
7
#include "segment.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
8

9 10
WindowsStorage::WindowsStorage(map<string, string> &labels)
  :windows_labels(labels),max_reads_per_window(~0),nb_bins(NB_BINS),max_value_bins(MAX_VALUE_BINS) {}
Mathieu Giraud's avatar
Mathieu Giraud committed
11

12
list<pair <junction, size_t> > &WindowsStorage::getSortedList() {
Mathieu Giraud's avatar
Mathieu Giraud committed
13 14 15
  return sort_all_windows;
}

16 17 18 19 20 21 22 23
map<junction, BinReadStorage>::iterator WindowsStorage::begin() {
  return seqs_by_window.begin();
}

map<junction, BinReadStorage>::iterator WindowsStorage::end() {
  return seqs_by_window.end();
}

24 25
float WindowsStorage::getAverageLength(junction window) {
  assert(hasWindow(window));
26
  return seqs_by_window[window].getAverageLength();
27 28
}

29
Germline *WindowsStorage::getGermline(junction window) {
30 31 32 33
  map<junction, Germline *>::iterator result = germline_by_window.find(window);
  if (result == germline_by_window.end())
    return NULL;
  return result->second;
34 35
}

36 37 38 39
size_t WindowsStorage::getMaximalNbReadsPerWindow() {
  return max_reads_per_window;
}

40 41
json WindowsStorage::statusToJson(junction window) {
    json result;
42
    
43
    for (unsigned int i=0; i<status_by_window[window].size(); i++){
44 45 46
        if (status_by_window[window][i] !=0){
            ostringstream oss; 
            oss << i;
47
            result[oss.str()] = status_by_window[window][i];
48 49 50 51 52 53
        }
    }
    
    return result;
}

54 55
size_t WindowsStorage::getNbReads(junction window) {
  assert(hasWindow(window));
56
  return seqs_by_window[window].getNbInserted();
57 58
}

59
list<Sequence> WindowsStorage::getReads(junction window) {
60
  return seqs_by_window[window].getReads();
Mathieu Giraud's avatar
Mathieu Giraud committed
61 62
}

63
KmerRepresentativeComputer WindowsStorage::getRepresentativeComputer(junction window,
Mathieu Giraud's avatar
Mathieu Giraud committed
64 65
                                           string seed, size_t min_cover, 
                                           float percent_cover,
66
                                           size_t nb_sampled) {
67
  assert(! hasLimitForReadsPerWindow() || nb_sampled <= getMaximalNbReadsPerWindow());
Mathieu Giraud's avatar
Mathieu Giraud committed
68
  list<Sequence> auditioned_sequences 
69
    = getSample(window,nb_sampled);
Mathieu Giraud's avatar
Mathieu Giraud committed
70
  KmerRepresentativeComputer repComp(auditioned_sequences, seed);
71
  repComp.setRevcomp(true);
72
  repComp.setMinCover((! isInterestingJunction(window)) ? min_cover : 1);
73
  repComp.setPercentCoverage(percent_cover);
74
  repComp.setRequiredSequence(window);
75
  repComp.setCoverageReferenceLength(getAverageLength(window));
76
  repComp.compute();
77

78
  // We should always have a representative, because either the junction is labelled (thus setMinCover(1)), or:
79 80
  // - there is at least min('min_reads_clone', 'max_auditioned') sequences in auditioned_sequences
  // - and 'min_cover' = min('min_reads_clone', 'max_auditioned')
81
  // - and these sequence are at least as long as the seed
82 83 84
  if (!repComp.hasRepresentative())
    throw invalid_argument("No representative for junction " + window);

85
  return repComp;
Mathieu Giraud's avatar
Mathieu Giraud committed
86 87
}

88
list<Sequence> WindowsStorage::getSample(junction window, size_t nb_sampled) {
89
  return seqs_by_window[window].getBestReads(nb_sampled);
Mathieu Giraud's avatar
Mathieu Giraud committed
90
}
WebTogz's avatar
WebTogz committed
91

92 93 94 95 96 97
set<Germline *> WindowsStorage::getTopGermlines(size_t top, size_t min_reads) {
  assert(sort_all_windows.size() == seqs_by_window.size());

  set<Germline *> top_germlines;
  size_t count = 0;

98
  for (list<pair <junction, size_t> >::const_iterator it = sort_all_windows.begin();
99 100 101 102 103 104 105 106
       it != sort_all_windows.end() && count < top && (size_t)it->second >= min_reads;
       ++it, ++count) {
    top_germlines.insert(getGermline(it->first));
  }

  return top_germlines;
}

107 108 109 110
bool WindowsStorage::hasLimitForReadsPerWindow() {
  return max_reads_per_window != (size_t)~0;
}

111 112 113 114 115
bool WindowsStorage::hasWindow(junction window) {
  map<junction, Germline *>::iterator result = germline_by_window.find(window);
  return (result != germline_by_window.end());
}

116 117
string WindowsStorage::getLabel(junction window) {

118 119 120 121 122 123 124 125 126 127 128
  bool found = false;
  for (auto it: windows_labels) {
    string sequence_of_interest = it.first;
    if (sequence_of_interest.size() < window.size()) {
      found = window.find(sequence_of_interest) != string::npos
        || window.find(revcomp(sequence_of_interest)) != string::npos;
    } else {
      found = sequence_of_interest.find(window) != string::npos
        || sequence_of_interest.find(revcomp(window)) != string::npos;
    }
    if (found)
129
      return it.second;
130
  }
131 132 133 134 135
  return "";
}

bool WindowsStorage::isInterestingJunction(junction window) {
  return (getLabel(window).length() != 0) ;
136 137
}

Mathieu Giraud's avatar
Mathieu Giraud committed
138 139 140 141
size_t WindowsStorage::size() {
  return seqs_by_window.size();
}

142 143 144 145 146
void WindowsStorage::setBinParameters(size_t nb, size_t max_value) {
  nb_bins = nb;
  max_value_bins = max_value;
}

WebTogz's avatar
WebTogz committed
147 148
void WindowsStorage::setIdToAll() {
    int id = 0;
149
    for (map <junction, BinReadStorage >::const_iterator it = seqs_by_window.begin();
WebTogz's avatar
WebTogz committed
150 151 152 153 154 155
        it != seqs_by_window.end(); ++it) {
            id_by_window.insert(make_pair(it->first, id));
            id++;
    }
}

156
void WindowsStorage::add(junction window, Sequence sequence, int status, Germline *germline, list<int> extra_statuses) {
157
  if (! hasWindow(window)) {
158
    // First time we see that window: init
159
    status_by_window[window].resize(STATS_SIZE);
160 161
    seqs_by_window[window].init(nb_bins, max_value_bins, &scorer);
    seqs_by_window[window].setMaxNbReadsStored(getMaximalNbReadsPerWindow());
162
  }
163 164

  seqs_by_window[window].add(sequence);
165
  status_by_window[window][status]++;
166

167 168 169
  for (int extra: extra_statuses)
    status_by_window[window][extra]++;

170
  germline_by_window[window] = germline;
Mathieu Giraud's avatar
Mathieu Giraud committed
171 172
}

173
pair <int, size_t> WindowsStorage::keepInterestingWindows(size_t min_reads_window) {
Mathieu Giraud's avatar
Mathieu Giraud committed
174
  int removes = 0 ;
175
  size_t nb_reads = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
176

177
  for (map <junction, BinReadStorage >::iterator it = seqs_by_window.begin();
Mathieu Giraud's avatar
Mathieu Giraud committed
178 179 180
       it != seqs_by_window.end(); ) // We do not advance the iterator here because of the deletion
    {
      junction junc = it->first;
181
      size_t nb_reads_this_window = getNbReads(junc);
182
      // Is it not supported by enough reads?
183
      if (!(nb_reads_this_window >= min_reads_window)
184
          // Is it not a labelled junction?
185
          && ! isInterestingJunction(junc))
Mathieu Giraud's avatar
Mathieu Giraud committed
186
        {
187
          map <junction, BinReadStorage >::iterator toBeDeleted = it;
Mathieu Giraud's avatar
Mathieu Giraud committed
188 189 190 191 192
          it++;
          seqs_by_window.erase(toBeDeleted);
          removes++ ;
        }
      else {
193
        nb_reads += nb_reads_this_window;
Mathieu Giraud's avatar
Mathieu Giraud committed
194 195 196
        it++;
      }
    }
197 198

  sort_all_windows.clear();
Mathieu Giraud's avatar
Mathieu Giraud committed
199 200 201
  return make_pair(removes, nb_reads);
}

202 203 204 205
void WindowsStorage::setMaximalNbReadsPerWindow(size_t max_reads){
  max_reads_per_window = max_reads;
}
  
Mathieu Giraud's avatar
Mathieu Giraud committed
206
void WindowsStorage::sort() {
207
  sort_all_windows.clear();
208
  for (map <junction, BinReadStorage >::const_iterator it = seqs_by_window.begin();
209
       it != seqs_by_window.end(); ++it)
Mathieu Giraud's avatar
Mathieu Giraud committed
210
    {
211
      sort_all_windows.push_back(make_pair(it->first, it->second.getNbInserted()));
Mathieu Giraud's avatar
Mathieu Giraud committed
212 213 214 215 216 217 218 219
    }

  sort_all_windows.sort(pair_occurrence_sort<junction>);
}

ostream &WindowsStorage::printSortedWindows(ostream &os) {
  int num_seq = 0 ;

220
  for (list<pair <junction, size_t> >::const_iterator it = sort_all_windows.begin();
Mathieu Giraud's avatar
Mathieu Giraud committed
221 222 223 224
       it != sort_all_windows.end(); ++it) 
    {
      num_seq++ ;

225
      windowToStream(os, it->first, num_seq, it->second);
Mathieu Giraud's avatar
Mathieu Giraud committed
226 227 228 229
    }
  return os;
}

230 231 232 233 234 235 236


json WindowsStorage::computeDiversity(int nb_segmented) {

  double index_H_entropy = 0.0 ;
  double index_1_minus_Ds_diversity = 0.0 ;

237 238
  double nb_seg_nb_seg_m1 = (double) nb_segmented * ((double) nb_segmented - 1);

239 240 241 242 243 244
  for (auto it = seqs_by_window.begin(); it != seqs_by_window.end(); ++it) {
    size_t clone_nb_reads = it->second.getNbInserted();

    float ratio = (float) clone_nb_reads / nb_segmented ;
    index_H_entropy -= ratio * log(ratio) ;

245
    index_1_minus_Ds_diversity += ((double) clone_nb_reads * ((double) clone_nb_reads - 1)) / nb_seg_nb_seg_m1 ;
246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
  }

  float index_E_equitability  = index_H_entropy / log(nb_segmented) ;
  float index_Ds_diversity = 1 - index_1_minus_Ds_diversity ;

  cout << "Diversity measures" << endl
       << "  H = " << index_H_entropy << endl        // Shannon's diversity
       << "  E = " << index_E_equitability  << endl  // Shannon's equitability
       << " Ds = " << index_Ds_diversity << endl     // Simpson's diversity
       << endl;

  json jsonDiversity;
  jsonDiversity["index_H_entropy"] = index_H_entropy ;
  jsonDiversity["index_E_equitability"] = index_E_equitability ;
  jsonDiversity["index_Ds_diversity"] = index_Ds_diversity ;

  return jsonDiversity;
}

265 266 267
void WindowsStorage::clearSequences(){
  seqs_by_window.clear();
}
268

269 270
void WindowsStorage::sortedWindowsToOutput(SampleOutput *output, int max_output, bool delete_all) {

Mathieu Giraud's avatar
Mathieu Giraud committed
271 272
  int top = 1;
    
273
  for (list<pair <junction, size_t> >::iterator it = sort_all_windows.begin();
274
       it != sort_all_windows.end(); )
Mathieu Giraud's avatar
Mathieu Giraud committed
275
    {
276
       
277
      CloneOutput *clone = output->getClone(it->first);
278

279
      if (status_by_window[it->first][SEG_CHANGED_WINDOW])
280
        clone->add_warning("W50", "Short or shifted window", LEVEL_WARN);
281

282 283 284 285 286
      clone->set("id", it->first);
      clone->set("reads", {it->second});
      clone->set("top", top++);
      clone->set("germline", germline_by_window[it->first]->code);
      clone->set("seg_stat", this->statusToJson(it->first));
287
      
288 289 290 291 292 293 294
      if (delete_all) {
        germline_by_window.erase(it->first);
        status_by_window.erase(it->first);
        it = sort_all_windows.erase(it);
      } else {
        it++;
      }
295
      if (top == max_output + 1)
296
        break ;
Mathieu Giraud's avatar
Mathieu Giraud committed
297 298 299 300 301
    }
}

ostream &WindowsStorage::windowToStream(ostream &os, junction window, int num_seq, 
                                        size_t size) {
302
  os << ">" << size << "--window--" << num_seq << " " << getLabel(window) << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
303 304 305
  os << window << endl;
  return os;
}