windows.cpp 9.44 KB
Newer Older
1 2
#include <iostream>
#include <string>
Mathieu Giraud's avatar
Mathieu Giraud committed
3 4 5
#include "tools.h"
#include "windows.h"
#include "representative.h"
6
#include "segment.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
7

8 9
WindowsStorage::WindowsStorage(map<string, string> &labels)
  :windows_labels(labels),max_reads_per_window(~0),nb_bins(NB_BINS),max_value_bins(MAX_VALUE_BINS) {}
Mathieu Giraud's avatar
Mathieu Giraud committed
10

11
list<pair <junction, size_t> > &WindowsStorage::getSortedList() {
Mathieu Giraud's avatar
Mathieu Giraud committed
12 13 14
  return sort_all_windows;
}

15 16 17 18 19 20 21 22
map<junction, BinReadStorage>::iterator WindowsStorage::begin() {
  return seqs_by_window.begin();
}

map<junction, BinReadStorage>::iterator WindowsStorage::end() {
  return seqs_by_window.end();
}

23 24
float WindowsStorage::getAverageLength(junction window) {
  assert(hasWindow(window));
25
  return seqs_by_window[window].getAverageLength();
26 27
}

28
Germline *WindowsStorage::getGermline(junction window) {
29 30 31 32
  map<junction, Germline *>::iterator result = germline_by_window.find(window);
  if (result == germline_by_window.end())
    return NULL;
  return result->second;
33 34
}

35 36 37 38
size_t WindowsStorage::getMaximalNbReadsPerWindow() {
  return max_reads_per_window;
}

Marc Duez's avatar
Marc Duez committed
39 40
json WindowsStorage::statusToJson(junction window) {
    json result;
41
    
42
    for (unsigned int i=0; i<status_by_window[window].size(); i++){
43 44 45
        if (status_by_window[window][i] !=0){
            ostringstream oss; 
            oss << i;
Marc Duez's avatar
Marc Duez committed
46
            result[oss.str()] = status_by_window[window][i];
47 48 49 50 51 52
        }
    }
    
    return result;
}

53 54
size_t WindowsStorage::getNbReads(junction window) {
  assert(hasWindow(window));
55
  return seqs_by_window[window].getNbInserted();
56 57
}

58
list<Sequence> WindowsStorage::getReads(junction window) {
59
  return seqs_by_window[window].getReads();
Mathieu Giraud's avatar
Mathieu Giraud committed
60 61
}

62
KmerRepresentativeComputer WindowsStorage::getRepresentativeComputer(junction window,
Mathieu Giraud's avatar
Mathieu Giraud committed
63 64
                                           string seed, size_t min_cover, 
                                           float percent_cover,
65
                                           size_t nb_sampled) {
66
  assert(! hasLimitForReadsPerWindow() || nb_sampled <= getMaximalNbReadsPerWindow());
Mathieu Giraud's avatar
Mathieu Giraud committed
67
  list<Sequence> auditioned_sequences 
68
    = getSample(window,nb_sampled);
Mathieu Giraud's avatar
Mathieu Giraud committed
69
  KmerRepresentativeComputer repComp(auditioned_sequences, seed);
70
  repComp.setRevcomp(true);
71
  repComp.setMinCover((! isInterestingJunction(window)) ? min_cover : 1);
72
  repComp.setPercentCoverage(percent_cover);
73
  repComp.setRequiredSequence(window);
74
  repComp.setCoverageReferenceLength(getAverageLength(window));
75
  repComp.compute();
76

77
  // We should always have a representative, because either the junction is labelled (thus setMinCover(1)), or:
78 79
  // - there is at least min('min_reads_clone', 'max_auditioned') sequences in auditioned_sequences
  // - and 'min_cover' = min('min_reads_clone', 'max_auditioned')
80
  // - and these sequence are at least as long as the seed
81 82 83
  if (!repComp.hasRepresentative())
    throw invalid_argument("No representative for junction " + window);

84
  return repComp;
Mathieu Giraud's avatar
Mathieu Giraud committed
85 86
}

87
list<Sequence> WindowsStorage::getSample(junction window, size_t nb_sampled) {
88
  return seqs_by_window[window].getBestReads(nb_sampled);
Mathieu Giraud's avatar
Mathieu Giraud committed
89
}
WebTogz's avatar
WebTogz committed
90

91 92 93 94 95 96
set<Germline *> WindowsStorage::getTopGermlines(size_t top, size_t min_reads) {
  assert(sort_all_windows.size() == seqs_by_window.size());

  set<Germline *> top_germlines;
  size_t count = 0;

97
  for (list<pair <junction, size_t> >::const_iterator it = sort_all_windows.begin();
98 99 100 101 102 103 104 105
       it != sort_all_windows.end() && count < top && (size_t)it->second >= min_reads;
       ++it, ++count) {
    top_germlines.insert(getGermline(it->first));
  }

  return top_germlines;
}

106 107 108 109
bool WindowsStorage::hasLimitForReadsPerWindow() {
  return max_reads_per_window != (size_t)~0;
}

110 111 112 113 114
bool WindowsStorage::hasWindow(junction window) {
  map<junction, Germline *>::iterator result = germline_by_window.find(window);
  return (result != germline_by_window.end());
}

115 116
string WindowsStorage::getLabel(junction window) {

117 118 119 120 121 122 123 124 125 126 127
  bool found = false;
  for (auto it: windows_labels) {
    string sequence_of_interest = it.first;
    if (sequence_of_interest.size() < window.size()) {
      found = window.find(sequence_of_interest) != string::npos
        || window.find(revcomp(sequence_of_interest)) != string::npos;
    } else {
      found = sequence_of_interest.find(window) != string::npos
        || sequence_of_interest.find(revcomp(window)) != string::npos;
    }
    if (found)
128
      return it.second;
129
  }
130 131 132 133 134
  return "";
}

bool WindowsStorage::isInterestingJunction(junction window) {
  return (getLabel(window).length() != 0) ;
135 136
}

Mathieu Giraud's avatar
Mathieu Giraud committed
137 138 139 140
size_t WindowsStorage::size() {
  return seqs_by_window.size();
}

141 142 143 144 145
void WindowsStorage::setBinParameters(size_t nb, size_t max_value) {
  nb_bins = nb;
  max_value_bins = max_value;
}

WebTogz's avatar
WebTogz committed
146 147
void WindowsStorage::setIdToAll() {
    int id = 0;
148
    for (map <junction, BinReadStorage >::const_iterator it = seqs_by_window.begin();
WebTogz's avatar
WebTogz committed
149 150 151 152 153 154
        it != seqs_by_window.end(); ++it) {
            id_by_window.insert(make_pair(it->first, id));
            id++;
    }
}

155
void WindowsStorage::add(junction window, Sequence sequence, int status, Germline *germline, list<int> extra_statuses) {
156
  if (! hasWindow(window)) {
157
    // First time we see that window: init
158
    status_by_window[window].resize(STATS_SIZE);
159 160
    seqs_by_window[window].init(nb_bins, max_value_bins, &scorer);
    seqs_by_window[window].setMaxNbReadsStored(getMaximalNbReadsPerWindow());
161
  }
162 163

  seqs_by_window[window].add(sequence);
164
  status_by_window[window][status]++;
165

166 167 168
  for (int extra: extra_statuses)
    status_by_window[window][extra]++;

169
  germline_by_window[window] = germline;
Mathieu Giraud's avatar
Mathieu Giraud committed
170 171
}

172
pair <int, size_t> WindowsStorage::keepInterestingWindows(size_t min_reads_window) {
Mathieu Giraud's avatar
Mathieu Giraud committed
173
  int removes = 0 ;
174
  size_t nb_reads = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
175

176
  for (map <junction, BinReadStorage >::iterator it = seqs_by_window.begin();
Mathieu Giraud's avatar
Mathieu Giraud committed
177 178 179
       it != seqs_by_window.end(); ) // We do not advance the iterator here because of the deletion
    {
      junction junc = it->first;
180
      size_t nb_reads_this_window = getNbReads(junc);
181
      // Is it not supported by enough reads?
182
      if (!(nb_reads_this_window >= min_reads_window)
183
          // Is it not a labelled junction?
184
          && ! isInterestingJunction(junc))
Mathieu Giraud's avatar
Mathieu Giraud committed
185
        {
186
          map <junction, BinReadStorage >::iterator toBeDeleted = it;
Mathieu Giraud's avatar
Mathieu Giraud committed
187 188 189 190 191
          it++;
          seqs_by_window.erase(toBeDeleted);
          removes++ ;
        }
      else {
192
        nb_reads += nb_reads_this_window;
Mathieu Giraud's avatar
Mathieu Giraud committed
193 194 195
        it++;
      }
    }
196 197

  sort_all_windows.clear();
Mathieu Giraud's avatar
Mathieu Giraud committed
198 199 200
  return make_pair(removes, nb_reads);
}

201 202 203 204
void WindowsStorage::setMaximalNbReadsPerWindow(size_t max_reads){
  max_reads_per_window = max_reads;
}
  
Mathieu Giraud's avatar
Mathieu Giraud committed
205
void WindowsStorage::sort() {
206
  sort_all_windows.clear();
207
  for (map <junction, BinReadStorage >::const_iterator it = seqs_by_window.begin();
208
       it != seqs_by_window.end(); ++it)
Mathieu Giraud's avatar
Mathieu Giraud committed
209
    {
210
      sort_all_windows.push_back(make_pair(it->first, it->second.getNbInserted()));
Mathieu Giraud's avatar
Mathieu Giraud committed
211 212 213 214 215 216 217 218
    }

  sort_all_windows.sort(pair_occurrence_sort<junction>);
}

ostream &WindowsStorage::printSortedWindows(ostream &os) {
  int num_seq = 0 ;

219
  for (list<pair <junction, size_t> >::const_iterator it = sort_all_windows.begin();
Mathieu Giraud's avatar
Mathieu Giraud committed
220 221 222 223
       it != sort_all_windows.end(); ++it) 
    {
      num_seq++ ;

224
      windowToStream(os, it->first, num_seq, it->second);
Mathieu Giraud's avatar
Mathieu Giraud committed
225 226 227 228
    }
  return os;
}

229 230 231 232 233 234 235


json WindowsStorage::computeDiversity(int nb_segmented) {

  double index_H_entropy = 0.0 ;
  double index_1_minus_Ds_diversity = 0.0 ;

236 237
  double nb_seg_nb_seg_m1 = (double) nb_segmented * ((double) nb_segmented - 1);

238 239 240 241 242 243
  for (auto it = seqs_by_window.begin(); it != seqs_by_window.end(); ++it) {
    size_t clone_nb_reads = it->second.getNbInserted();

    float ratio = (float) clone_nb_reads / nb_segmented ;
    index_H_entropy -= ratio * log(ratio) ;

244
    index_1_minus_Ds_diversity += ((double) clone_nb_reads * ((double) clone_nb_reads - 1)) / nb_seg_nb_seg_m1 ;
245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
  }

  float index_E_equitability  = index_H_entropy / log(nb_segmented) ;
  float index_Ds_diversity = 1 - index_1_minus_Ds_diversity ;

  cout << "Diversity measures" << endl
       << "  H = " << index_H_entropy << endl        // Shannon's diversity
       << "  E = " << index_E_equitability  << endl  // Shannon's equitability
       << " Ds = " << index_Ds_diversity << endl     // Simpson's diversity
       << endl;

  json jsonDiversity;
  jsonDiversity["index_H_entropy"] = index_H_entropy ;
  jsonDiversity["index_E_equitability"] = index_E_equitability ;
  jsonDiversity["index_Ds_diversity"] = index_Ds_diversity ;

  return jsonDiversity;
}


Marc Duez's avatar
Marc Duez committed
265 266
json WindowsStorage::sortedWindowsToJson(map <junction, json> json_data_segment) {
  json windowsArray;
Mathieu Giraud's avatar
Mathieu Giraud committed
267 268
  int top = 1;
    
Marc Duez's avatar
Marc Duez committed
269
  for (list<pair <junction, size_t> >::const_iterator it = sort_all_windows.begin(); 
Mathieu Giraud's avatar
Mathieu Giraud committed
270 271
       it != sort_all_windows.end(); ++it) 
    {
Marc Duez's avatar
Marc Duez committed
272 273 274 275 276
       
      json windowsList;

      if (json_data_segment.find(it->first) != json_data_segment.end()){
          windowsList = json_data_segment[it->first];
Mathieu Giraud's avatar
Mathieu Giraud committed
277
      }else{
Marc Duez's avatar
Marc Duez committed
278
          windowsList["sequence"] = 0; //TODO need to compute representative sequence for this case
Mathieu Giraud's avatar
Mathieu Giraud committed
279
      }
Marc Duez's avatar
Marc Duez committed
280 281 282
      
      json reads = {it->second};
      windowsList["id"] = it->first;
283 284 285 286
      if (status_by_window[it->first][SEG_SHORTER_WINDOW])
        windowsList["warn"] = "Short or shifted window";


Marc Duez's avatar
Marc Duez committed
287 288 289 290 291 292
      windowsList["reads"] = reads;
      windowsList["top"] = top++;
      windowsList["germline"] = germline_by_window[it->first]->code;
      windowsList["seg_stat"] = this->statusToJson(it->first);
      
      windowsArray.push_back(windowsList);
Mathieu Giraud's avatar
Mathieu Giraud committed
293 294 295 296 297 298 299
    }

  return windowsArray;
}

ostream &WindowsStorage::windowToStream(ostream &os, junction window, int num_seq, 
                                        size_t size) {
300
  os << ">" << size << "--window--" << num_seq << " " << getLabel(window) << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
301 302 303
  os << window << endl;
  return os;
}