windows.cpp 8.1 KB
Newer Older
1 2
#include <iostream>
#include <string>
Mathieu Giraud's avatar
Mathieu Giraud committed
3 4 5 6 7
#include "tools.h"
#include "json.h"
#include "windows.h"
#include "representative.h"
#include "sequenceSampler.h"
8 9
#include "segment.h"
#include "json.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
10

11 12
WindowsStorage::WindowsStorage(map<string, string> &labels)
  :windows_labels(labels),max_reads_per_window(~0),nb_bins(NB_BINS),max_value_bins(MAX_VALUE_BINS) {}
Mathieu Giraud's avatar
Mathieu Giraud committed
13

14
list<pair <junction, size_t> > &WindowsStorage::getSortedList() {
Mathieu Giraud's avatar
Mathieu Giraud committed
15 16 17
  return sort_all_windows;
}

18 19 20 21 22 23 24 25
map<junction, BinReadStorage>::iterator WindowsStorage::begin() {
  return seqs_by_window.begin();
}

map<junction, BinReadStorage>::iterator WindowsStorage::end() {
  return seqs_by_window.end();
}

26 27 28 29 30 31 32 33
string WindowsStorage::getLabel(junction window) {
  
  if (windows_labels.find(window) == windows_labels.end())
    return "" ;
  
  return windows_labels[window];   
}

34 35
float WindowsStorage::getAverageLength(junction window) {
  assert(hasWindow(window));
36
  return seqs_by_window[window].getAverageScore();
37 38
}

39
Germline *WindowsStorage::getGermline(junction window) {
40 41 42 43
  map<junction, Germline *>::iterator result = germline_by_window.find(window);
  if (result == germline_by_window.end())
    return NULL;
  return result->second;
44 45
}

46 47 48 49
size_t WindowsStorage::getMaximalNbReadsPerWindow() {
  return max_reads_per_window;
}

50 51 52
JsonList WindowsStorage::statusToJson(junction window) {
    JsonList result;
    
53
    for (unsigned int i=0; i<status_by_window[window].size(); i++){
54 55 56 57 58 59 60 61 62 63
        if (status_by_window[window][i] !=0){
            ostringstream oss; 
            oss << i;
            result.add(oss.str(), status_by_window[window][i]);
        }
    }
    
    return result;
}

64 65
size_t WindowsStorage::getNbReads(junction window) {
  assert(hasWindow(window));
66
  return seqs_by_window[window].getNbInserted();
67 68
}

69
list<Sequence> WindowsStorage::getReads(junction window) {
70
  return seqs_by_window[window].getReads();
Mathieu Giraud's avatar
Mathieu Giraud committed
71 72
}

73
KmerRepresentativeComputer WindowsStorage::getRepresentativeComputer(junction window,
Mathieu Giraud's avatar
Mathieu Giraud committed
74 75 76 77
                                           string seed, size_t min_cover, 
                                           float percent_cover,
                                           size_t nb_sampled, 
                                           size_t nb_buckets) {
78
  assert(! hasLimitForReadsPerWindow() || nb_sampled <= getMaximalNbReadsPerWindow());
Mathieu Giraud's avatar
Mathieu Giraud committed
79 80 81
  list<Sequence> auditioned_sequences 
    = getSample(window,nb_sampled, nb_buckets);
  KmerRepresentativeComputer repComp(auditioned_sequences, seed);
82
  repComp.setRevcomp(true);
83
  repComp.setMinCover((windows_labels.find(window) == windows_labels.end()) ? min_cover : 1);
84
  repComp.setPercentCoverage(percent_cover);
85
  repComp.setRequiredSequence(window);
86
  repComp.setCoverageReferenceLength(getAverageLength(window));
87
  repComp.compute();
88

89
  // We should always have a representative, because either the junction is labelled (thus setMinCover(1)), or:
90 91
  // - there is at least min('min_reads_clone', 'max_auditioned') sequences in auditioned_sequences
  // - and 'min_cover' = min('min_reads_clone', 'max_auditioned')
92
  // - and these sequence are at least as long as the seed
93 94 95
  if (!repComp.hasRepresentative())
    throw invalid_argument("No representative for junction " + window);

96
  return repComp;
Mathieu Giraud's avatar
Mathieu Giraud committed
97 98 99 100
}

list<Sequence> WindowsStorage::getSample(junction window, size_t nb_sampled,
                                         size_t nb_buckets) {
101
  list<Sequence> reads = getReads(window);
102 103
  if (reads.size() <= nb_sampled)
    return reads;
Mathieu Giraud's avatar
Mathieu Giraud committed
104 105
  return SequenceSampler(reads).getLongest(nb_sampled, nb_buckets);
}
WebTogz's avatar
WebTogz committed
106

107 108 109 110 111 112
set<Germline *> WindowsStorage::getTopGermlines(size_t top, size_t min_reads) {
  assert(sort_all_windows.size() == seqs_by_window.size());

  set<Germline *> top_germlines;
  size_t count = 0;

113
  for (list<pair <junction, size_t> >::const_iterator it = sort_all_windows.begin();
114 115 116 117 118 119 120 121
       it != sort_all_windows.end() && count < top && (size_t)it->second >= min_reads;
       ++it, ++count) {
    top_germlines.insert(getGermline(it->first));
  }

  return top_germlines;
}

122 123 124 125
bool WindowsStorage::hasLimitForReadsPerWindow() {
  return max_reads_per_window != (size_t)~0;
}

126 127 128 129 130
bool WindowsStorage::hasWindow(junction window) {
  map<junction, Germline *>::iterator result = germline_by_window.find(window);
  return (result != germline_by_window.end());
}

Mathieu Giraud's avatar
Mathieu Giraud committed
131 132 133 134
size_t WindowsStorage::size() {
  return seqs_by_window.size();
}

135 136 137 138 139
void WindowsStorage::setBinParameters(size_t nb, size_t max_value) {
  nb_bins = nb;
  max_value_bins = max_value;
}

WebTogz's avatar
WebTogz committed
140 141
void WindowsStorage::setIdToAll() {
    int id = 0;
142
    for (map <junction, BinReadStorage >::const_iterator it = seqs_by_window.begin();
WebTogz's avatar
WebTogz committed
143 144 145 146 147 148
        it != seqs_by_window.end(); ++it) {
            id_by_window.insert(make_pair(it->first, id));
            id++;
    }
}

149
void WindowsStorage::add(junction window, Sequence sequence, int status, Germline *germline) {
150
  if (! hasWindow(window)) {
151
    // First time we see that window: init
152
    status_by_window[window].resize(STATS_SIZE);
153 154
    seqs_by_window[window].init(nb_bins, max_value_bins, &scorer);
    seqs_by_window[window].setMaxNbReadsStored(getMaximalNbReadsPerWindow());
155
  }
156 157

  seqs_by_window[window].add(sequence);
158
  status_by_window[window][status]++;
159 160

  germline_by_window[window] = germline;
Mathieu Giraud's avatar
Mathieu Giraud committed
161 162
}

163
pair <int, size_t> WindowsStorage::keepInterestingWindows(size_t min_reads_window) {
Mathieu Giraud's avatar
Mathieu Giraud committed
164
  int removes = 0 ;
165
  size_t nb_reads = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
166

167
  for (map <junction, BinReadStorage >::iterator it = seqs_by_window.begin();
Mathieu Giraud's avatar
Mathieu Giraud committed
168 169 170
       it != seqs_by_window.end(); ) // We do not advance the iterator here because of the deletion
    {
      junction junc = it->first;
171
      size_t nb_reads_this_window = getNbReads(junc);
172
      // Is it not supported by enough reads?
173
      if (!(nb_reads_this_window >= min_reads_window)
174
          // Is it not a labelled junction?
175
          && (windows_labels.find(junc) == windows_labels.end()))
Mathieu Giraud's avatar
Mathieu Giraud committed
176
        {
177
          map <junction, BinReadStorage >::iterator toBeDeleted = it;
Mathieu Giraud's avatar
Mathieu Giraud committed
178 179 180 181 182
          it++;
          seqs_by_window.erase(toBeDeleted);
          removes++ ;
        }
      else {
183
        nb_reads += nb_reads_this_window;
Mathieu Giraud's avatar
Mathieu Giraud committed
184 185 186
        it++;
      }
    }
187 188

  sort_all_windows.clear();
Mathieu Giraud's avatar
Mathieu Giraud committed
189 190 191
  return make_pair(removes, nb_reads);
}

192 193 194 195
void WindowsStorage::setMaximalNbReadsPerWindow(size_t max_reads){
  max_reads_per_window = max_reads;
}
  
Mathieu Giraud's avatar
Mathieu Giraud committed
196
void WindowsStorage::sort() {
197
  sort_all_windows.clear();
198
  for (map <junction, BinReadStorage >::const_iterator it = seqs_by_window.begin();
199
       it != seqs_by_window.end(); ++it)
Mathieu Giraud's avatar
Mathieu Giraud committed
200
    {
201
      sort_all_windows.push_back(make_pair(it->first, it->second.getNbInserted()));
Mathieu Giraud's avatar
Mathieu Giraud committed
202 203 204 205 206 207 208 209
    }

  sort_all_windows.sort(pair_occurrence_sort<junction>);
}

ostream &WindowsStorage::printSortedWindows(ostream &os) {
  int num_seq = 0 ;

210
  for (list<pair <junction, size_t> >::const_iterator it = sort_all_windows.begin();
Mathieu Giraud's avatar
Mathieu Giraud committed
211 212 213 214
       it != sort_all_windows.end(); ++it) 
    {
      num_seq++ ;

215
      windowToStream(os, it->first, num_seq, it->second);
Mathieu Giraud's avatar
Mathieu Giraud committed
216 217 218 219
    }
  return os;
}

220
JsonArray WindowsStorage::sortedWindowsToJsonArray(map <junction, JsonList> json_data_segment) {
Mathieu Giraud's avatar
Mathieu Giraud committed
221 222 223
  JsonArray windowsArray;
  int top = 1;
    
224
  for (list<pair <junction, size_t> >::const_iterator it = sort_all_windows.begin();
Mathieu Giraud's avatar
Mathieu Giraud committed
225 226 227 228
       it != sort_all_windows.end(); ++it) 
    {
	   
      JsonList windowsList;
229
      JsonArray json_reads;
230
      JsonArray json_seg;
231
      json_reads.add(it->second);
Mathieu Giraud's avatar
Mathieu Giraud committed
232 233 234 235 236 237

	  if (json_data_segment.find(it->first) != json_data_segment.end()){
          windowsList.concat(json_data_segment[it->first]);
      }else{
          windowsList.add("sequence", 0); //TODO need to compute representative sequence for this case
      }
238 239
      windowsList.add("id", it->first);
      windowsList.add("reads", json_reads);
Mathieu Giraud's avatar
Mathieu Giraud committed
240
      windowsList.add("top", top++);
241
      //windowsList.add("id", this->getId(it->first));
242
      JsonList seg_stat = this->statusToJson(it->first);
243
      json_seg.add(seg_stat);
244
      windowsList.add("germline", germline_by_window[it->first]->code);
245
      windowsList.add("seg_stat", json_seg);
Mathieu Giraud's avatar
Mathieu Giraud committed
246 247 248 249 250 251 252 253
      windowsArray.add(windowsList);
    }

  return windowsArray;
}

ostream &WindowsStorage::windowToStream(ostream &os, junction window, int num_seq, 
                                        size_t size) {
254
  os << ">" << size << "--window--" << num_seq << " " << getLabel(window) << endl ;
Mathieu Giraud's avatar
Mathieu Giraud committed
255 256 257
  os << window << endl;
  return os;
}