windows.cpp 8.98 KB
Newer Older
1 2
#include <iostream>
#include <string>
Mathieu Giraud's avatar
Mathieu Giraud committed
3 4 5 6
#include "tools.h"
#include "windows.h"
#include "representative.h"
#include "sequenceSampler.h"
7
#include "segment.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
8

9 10
WindowsStorage::WindowsStorage(map<string, string> &labels)
  :windows_labels(labels),max_reads_per_window(~0),nb_bins(NB_BINS),max_value_bins(MAX_VALUE_BINS) {}
Mathieu Giraud's avatar
Mathieu Giraud committed
11

12
list<pair <junction, size_t> > &WindowsStorage::getSortedList() {
Mathieu Giraud's avatar
Mathieu Giraud committed
13 14 15
  return sort_all_windows;
}

16 17 18 19 20 21 22 23
map<junction, BinReadStorage>::iterator WindowsStorage::begin() {
  return seqs_by_window.begin();
}

map<junction, BinReadStorage>::iterator WindowsStorage::end() {
  return seqs_by_window.end();
}

24 25 26 27 28 29 30 31
string WindowsStorage::getLabel(junction window) {
  
  if (windows_labels.find(window) == windows_labels.end())
    return "" ;
  
  return windows_labels[window];   
}

32 33
float WindowsStorage::getAverageLength(junction window) {
  assert(hasWindow(window));
34
  return seqs_by_window[window].getAverageScore();
35 36
}

37
Germline *WindowsStorage::getGermline(junction window) {
38 39 40 41
  map<junction, Germline *>::iterator result = germline_by_window.find(window);
  if (result == germline_by_window.end())
    return NULL;
  return result->second;
42 43
}

44 45 46 47
size_t WindowsStorage::getMaximalNbReadsPerWindow() {
  return max_reads_per_window;
}

Marc Duez's avatar
Marc Duez committed
48 49
json WindowsStorage::statusToJson(junction window) {
    json result;
50
    
51
    for (unsigned int i=0; i<status_by_window[window].size(); i++){
52 53 54
        if (status_by_window[window][i] !=0){
            ostringstream oss; 
            oss << i;
Marc Duez's avatar
Marc Duez committed
55
            result[oss.str()] = status_by_window[window][i];
56 57 58 59 60 61
        }
    }
    
    return result;
}

62 63
size_t WindowsStorage::getNbReads(junction window) {
  assert(hasWindow(window));
64
  return seqs_by_window[window].getNbInserted();
65 66
}

67
list<Sequence> WindowsStorage::getReads(junction window) {
68
  return seqs_by_window[window].getReads();
Mathieu Giraud's avatar
Mathieu Giraud committed
69 70
}

71
KmerRepresentativeComputer WindowsStorage::getRepresentativeComputer(junction window,
Mathieu Giraud's avatar
Mathieu Giraud committed
72 73 74 75
                                           string seed, size_t min_cover, 
                                           float percent_cover,
                                           size_t nb_sampled, 
                                           size_t nb_buckets) {
76
  assert(! hasLimitForReadsPerWindow() || nb_sampled <= getMaximalNbReadsPerWindow());
Mathieu Giraud's avatar
Mathieu Giraud committed
77 78 79
  list<Sequence> auditioned_sequences 
    = getSample(window,nb_sampled, nb_buckets);
  KmerRepresentativeComputer repComp(auditioned_sequences, seed);
80
  repComp.setRevcomp(true);
81
  repComp.setMinCover((windows_labels.find(window) == windows_labels.end()) ? min_cover : 1);
82
  repComp.setPercentCoverage(percent_cover);
83
  repComp.setRequiredSequence(window);
84
  repComp.setCoverageReferenceLength(getAverageLength(window));
85
  repComp.compute();
86

87
  // We should always have a representative, because either the junction is labelled (thus setMinCover(1)), or:
88 89
  // - there is at least min('min_reads_clone', 'max_auditioned') sequences in auditioned_sequences
  // - and 'min_cover' = min('min_reads_clone', 'max_auditioned')
90
  // - and these sequence are at least as long as the seed
91 92 93
  if (!repComp.hasRepresentative())
    throw invalid_argument("No representative for junction " + window);

94
  return repComp;
Mathieu Giraud's avatar
Mathieu Giraud committed
95 96 97 98
}

list<Sequence> WindowsStorage::getSample(junction window, size_t nb_sampled,
                                         size_t nb_buckets) {
99
  list<Sequence> reads = getReads(window);
100 101
  if (reads.size() <= nb_sampled)
    return reads;
Mathieu Giraud's avatar
Mathieu Giraud committed
102 103
  return SequenceSampler(reads).getLongest(nb_sampled, nb_buckets);
}
WebTogz's avatar
WebTogz committed
104

105 106 107 108 109 110
set<Germline *> WindowsStorage::getTopGermlines(size_t top, size_t min_reads) {
  assert(sort_all_windows.size() == seqs_by_window.size());

  set<Germline *> top_germlines;
  size_t count = 0;

111
  for (list<pair <junction, size_t> >::const_iterator it = sort_all_windows.begin();
112 113 114 115 116 117 118 119
       it != sort_all_windows.end() && count < top && (size_t)it->second >= min_reads;
       ++it, ++count) {
    top_germlines.insert(getGermline(it->first));
  }

  return top_germlines;
}

120 121 122 123
bool WindowsStorage::hasLimitForReadsPerWindow() {
  return max_reads_per_window != (size_t)~0;
}

124 125 126 127 128
bool WindowsStorage::hasWindow(junction window) {
  map<junction, Germline *>::iterator result = germline_by_window.find(window);
  return (result != germline_by_window.end());
}

Mathieu Giraud's avatar
Mathieu Giraud committed
129 130 131 132
size_t WindowsStorage::size() {
  return seqs_by_window.size();
}

133 134 135 136 137
void WindowsStorage::setBinParameters(size_t nb, size_t max_value) {
  nb_bins = nb;
  max_value_bins = max_value;
}

WebTogz's avatar
WebTogz committed
138 139
void WindowsStorage::setIdToAll() {
    int id = 0;
140
    for (map <junction, BinReadStorage >::const_iterator it = seqs_by_window.begin();
WebTogz's avatar
WebTogz committed
141 142 143 144 145 146
        it != seqs_by_window.end(); ++it) {
            id_by_window.insert(make_pair(it->first, id));
            id++;
    }
}

147
void WindowsStorage::add(junction window, Sequence sequence, int status, Germline *germline) {
148
  if (! hasWindow(window)) {
149
    // First time we see that window: init
150
    status_by_window[window].resize(STATS_SIZE);
151 152
    seqs_by_window[window].init(nb_bins, max_value_bins, &scorer);
    seqs_by_window[window].setMaxNbReadsStored(getMaximalNbReadsPerWindow());
153
  }
154 155

  seqs_by_window[window].add(sequence);
156
  status_by_window[window][status]++;
157 158

  germline_by_window[window] = germline;
Mathieu Giraud's avatar
Mathieu Giraud committed
159 160
}

161
pair <int, size_t> WindowsStorage::keepInterestingWindows(size_t min_reads_window) {
Mathieu Giraud's avatar
Mathieu Giraud committed
162
  int removes = 0 ;
163
  size_t nb_reads = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
164

165
  for (map <junction, BinReadStorage >::iterator it = seqs_by_window.begin();
Mathieu Giraud's avatar
Mathieu Giraud committed
166 167 168
       it != seqs_by_window.end(); ) // We do not advance the iterator here because of the deletion
    {
      junction junc = it->first;
169
      size_t nb_reads_this_window = getNbReads(junc);
170
      // Is it not supported by enough reads?
171
      if (!(nb_reads_this_window >= min_reads_window)
172
          // Is it not a labelled junction?
173
          && (windows_labels.find(junc) == windows_labels.end()))
Mathieu Giraud's avatar
Mathieu Giraud committed
174
        {
175
          map <junction, BinReadStorage >::iterator toBeDeleted = it;
Mathieu Giraud's avatar
Mathieu Giraud committed
176 177 178 179 180
          it++;
          seqs_by_window.erase(toBeDeleted);
          removes++ ;
        }
      else {
181
        nb_reads += nb_reads_this_window;
Mathieu Giraud's avatar
Mathieu Giraud committed
182 183 184
        it++;
      }
    }
185 186

  sort_all_windows.clear();
Mathieu Giraud's avatar
Mathieu Giraud committed
187 188 189
  return make_pair(removes, nb_reads);
}

190 191 192 193
void WindowsStorage::setMaximalNbReadsPerWindow(size_t max_reads){
  max_reads_per_window = max_reads;
}
  
Mathieu Giraud's avatar
Mathieu Giraud committed
194
void WindowsStorage::sort() {
195
  sort_all_windows.clear();
196
  for (map <junction, BinReadStorage >::const_iterator it = seqs_by_window.begin();
197
       it != seqs_by_window.end(); ++it)
Mathieu Giraud's avatar
Mathieu Giraud committed
198
    {
199
      sort_all_windows.push_back(make_pair(it->first, it->second.getNbInserted()));
Mathieu Giraud's avatar
Mathieu Giraud committed
200 201 202 203 204 205 206 207
    }

  sort_all_windows.sort(pair_occurrence_sort<junction>);
}

ostream &WindowsStorage::printSortedWindows(ostream &os) {
  int num_seq = 0 ;

208
  for (list<pair <junction, size_t> >::const_iterator it = sort_all_windows.begin();
Mathieu Giraud's avatar
Mathieu Giraud committed
209 210 211 212
       it != sort_all_windows.end(); ++it) 
    {
      num_seq++ ;

213
      windowToStream(os, it->first, num_seq, it->second);
Mathieu Giraud's avatar
Mathieu Giraud committed
214 215 216 217
    }
  return os;
}

218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251


json WindowsStorage::computeDiversity(int nb_segmented) {

  double index_H_entropy = 0.0 ;
  double index_1_minus_Ds_diversity = 0.0 ;

  for (auto it = seqs_by_window.begin(); it != seqs_by_window.end(); ++it) {
    size_t clone_nb_reads = it->second.getNbInserted();

    float ratio = (float) clone_nb_reads / nb_segmented ;
    index_H_entropy -= ratio * log(ratio) ;

    index_1_minus_Ds_diversity += (float) (clone_nb_reads * (clone_nb_reads - 1)) / (nb_segmented * (nb_segmented - 1)) ;
  }

  float index_E_equitability  = index_H_entropy / log(nb_segmented) ;
  float index_Ds_diversity = 1 - index_1_minus_Ds_diversity ;

  cout << "Diversity measures" << endl
       << "  H = " << index_H_entropy << endl        // Shannon's diversity
       << "  E = " << index_E_equitability  << endl  // Shannon's equitability
       << " Ds = " << index_Ds_diversity << endl     // Simpson's diversity
       << endl;

  json jsonDiversity;
  jsonDiversity["index_H_entropy"] = index_H_entropy ;
  jsonDiversity["index_E_equitability"] = index_E_equitability ;
  jsonDiversity["index_Ds_diversity"] = index_Ds_diversity ;

  return jsonDiversity;
}


Marc Duez's avatar
Marc Duez committed
252 253
json WindowsStorage::sortedWindowsToJson(map <junction, json> json_data_segment) {
  json windowsArray;
Mathieu Giraud's avatar
Mathieu Giraud committed
254 255
  int top = 1;
    
Marc Duez's avatar
Marc Duez committed
256
  for (list<pair <junction, size_t> >::const_iterator it = sort_all_windows.begin(); 
Mathieu Giraud's avatar
Mathieu Giraud committed
257 258
       it != sort_all_windows.end(); ++it) 
    {
Marc Duez's avatar
Marc Duez committed
259 260 261 262 263
       
      json windowsList;

      if (json_data_segment.find(it->first) != json_data_segment.end()){
          windowsList = json_data_segment[it->first];
Mathieu Giraud's avatar
Mathieu Giraud committed
264
      }else{
Marc Duez's avatar
Marc Duez committed
265
          windowsList["sequence"] = 0; //TODO need to compute representative sequence for this case
Mathieu Giraud's avatar
Mathieu Giraud committed
266
      }
Marc Duez's avatar
Marc Duez committed
267 268 269 270 271 272 273 274 275
      
      json reads = {it->second};
      windowsList["id"] = it->first;
      windowsList["reads"] = reads;
      windowsList["top"] = top++;
      windowsList["germline"] = germline_by_window[it->first]->code;
      windowsList["seg_stat"] = this->statusToJson(it->first);
      
      windowsArray.push_back(windowsList);
Mathieu Giraud's avatar
Mathieu Giraud committed
276 277 278 279 280 281 282
    }

  return windowsArray;
}

ostream &WindowsStorage::windowToStream(ostream &os, junction window, int num_seq, 
                                        size_t size) {
283
  os << ">" << size << "--window--" << num_seq << " " << getLabel(window) << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
284 285 286
  os << window << endl;
  return os;
}