windowExtractor.cpp 8.65 KB
Newer Older
Mathieu Giraud's avatar
Mathieu Giraud committed
1 2
#include "windowExtractor.h"
#include "segment.h"
3
#include "tools.h"
4

5 6
WindowExtractor::WindowExtractor(MultiGermline *multigermline): out_segmented(NULL), out_unsegmented(NULL), out_unsegmented_detail(NULL), out_affects(NULL),
                                                                max_reads_per_window(~0), multigermline(multigermline){
7 8 9 10 11 12 13
    for (list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it)
    {
      Germline *germline = *it ;
      stats_reads[germline->code].init(NB_BINS, MAX_VALUE_BINS, NULL, true);
      stats_reads[germline->code].setLabel(germline->code);
      stats_clones[germline->code].init(NB_BINS_CLONES, MAX_VALUE_BINS_CLONES, NULL, true);
    }
14
}
Mathieu Giraud's avatar
Mathieu Giraud committed
15
                                    
16
WindowsStorage *WindowExtractor::extract(OnlineBioReader *reads,
17
					 size_t w,
18
                                         map<string, string> &windows_labels, bool only_labeled_windows,
19
                                         bool keep_unsegmented_as_clone,
20
                                         double nb_expected, int nb_reads_for_evalue,
21 22
                                         VirtualReadScore *scorer,
                                         SampleOutput *output) {
Mathieu Giraud's avatar
Mathieu Giraud committed
23 24 25
  init_stats();

  WindowsStorage *windowsStorage = new WindowsStorage(windows_labels);
26
  windowsStorage->setScorer(scorer);
27
  windowsStorage->setMaximalNbReadsPerWindow(max_reads_per_window);
28

29
  unsigned long long int bp_total = 0;
30

31 32 33
  global_interrupted = false ;
  signal(SIGINT, sigintHandler);

34
  while (reads->hasNext()) {
35

36 37
    if (global_interrupted)
    {
38 39 40
      string msg = "Interrupted after processing " + string_of_int(nb_reads) + " reads" ;
      if (output) output->add_warning("W09", msg, LEVEL_WARN);
      cout << WARNING_STRING << msg << endl ;
41 42 43
      break;
    }

44 45 46
    try {
      reads->next();
    }
47
    catch (const invalid_argument &e) {
48 49 50 51 52 53
      cout << endl;
      cerr << WARNING_STRING << "Error in getting a new read: " << e.what() << endl;
      cerr << WARNING_STRING << "Vidjil stops the analysis here, after " << nb_reads << " reads." << endl;
      break ;
    }

Mathieu Giraud's avatar
Mathieu Giraud committed
54
    nb_reads++;
55

56 57
    if (out_affects) {
      *out_affects << reads->getSequence();
58
    }
59
    
60
    KmerMultiSegmenter kmseg(reads->getSequence(), multigermline, out_affects, nb_expected, nb_reads_for_evalue);
61
    
62
    KmerSegmenter *seg = kmseg.the_kseg ;
63 64 65 66 67 68 69 70 71 72

    // Window length threshold
    junction junc ;
    if (seg->isSegmented()) {
      junc = seg->getJunction(w);
      if (!junc.size()) {
        seg->setSegmentationStatus(UNSEG_TOO_SHORT_FOR_WINDOW);
      }
    }

73
    int read_length = seg->getSequence().sequence.length();
Mathieu Giraud's avatar
Mathieu Giraud committed
74

75
    // Update stats
76
    stats[seg->getSegmentationStatus()].insert(read_length);
77

78
    if (seg->isSegmented()) {
79 80

      // Filter
81
      if (!only_labeled_windows || windowsStorage->isInterestingJunction(junc)) {
82

83
        // Store the window
84 85
        if (seg->isJunctionChanged())
          windowsStorage->add(junc, reads->getSequence(), seg->getSegmentationStatus(), seg->segmented_germline, {SEG_CHANGED_WINDOW});
86 87 88
        else
          windowsStorage->add(junc, reads->getSequence(), seg->getSegmentationStatus(), seg->segmented_germline);
      }
89

90 91
      // Update stats
      stats[TOTAL_SEG_AND_WINDOW].insert(read_length) ;
92 93
      if (seg->isJunctionChanged())
        stats[SEG_CHANGED_WINDOW].insert(read_length);
94
      stats_reads[seg->segmented_germline->code].addScore(read_length);
Mathieu Giraud's avatar
Mathieu Giraud committed
95

96
      if (out_segmented) {
97
        *out_segmented << *seg ; // KmerSegmenter output (V/N/J)
98
      }
99 100 101 102 103
    } else {
      if (keep_unsegmented_as_clone && (reads->getSequence().sequence.length() >= w))
        {
          // Keep the unsegmented read, taking the full sequence as the junction
          windowsStorage->add(reads->getSequence().sequence, reads->getSequence(), seg->getSegmentationStatus(), seg->segmented_germline);
104
          stats[TOTAL_SEG_AND_WINDOW].insert(read_length) ; // TODO: rather count that in a pseudo-germline such as 'TRG!'
105 106 107
        }

      if (out_unsegmented) {
108
        *out_unsegmented << *seg ;
109
      }
110
      if (out_unsegmented_detail && (seg->getSegmentationStatus() >= STATS_FIRST_UNSEG)) {
111
        if (unsegmented_detail_full || (seg->getSegmentationStatus() != UNSEG_TOO_FEW_ZERO && seg->getSegmentationStatus() != UNSEG_TOO_SHORT))
112 113
        *out_unsegmented_detail[seg->getSegmentationStatus()] << *seg ;
      }
114 115 116 117
    }

    // Last line of detailed affects output
    if (out_affects) {
118
      *out_affects << "#>" << seg->label << " " <<  seg->getInfoLine() << endl << endl;
Mathieu Giraud's avatar
Mathieu Giraud committed
119
    }
120 121 122 123 124 125 126 127 128

    // Progress bar
    bp_total += read_length;

    if (!(nb_reads % PROGRESS_POINT))
      {
	cout << "." ;

	if (!(nb_reads % (PROGRESS_POINT * PROGRESS_LINE)))
129
	  cout << setw(10) << nb_reads / 1000 << "k reads " << fixed << setprecision(2) << setw(14) << bp_total / 1E6 << " Mbp" << endl ;
130 131 132

	cout.flush() ;
      }
Mathieu Giraud's avatar
Mathieu Giraud committed
133
  }
134
  signal(SIGINT, SIG_DFL);
135 136 137

  cout << endl ;

138 139
  fillStatsClones(windowsStorage);

Mathieu Giraud's avatar
Mathieu Giraud committed
140 141 142 143
  return windowsStorage;
}

float WindowExtractor::getAverageSegmentationLength(SEGMENTED seg) {
144
  return stats[seg].getAverage();
Mathieu Giraud's avatar
Mathieu Giraud committed
145 146
}

147 148 149 150
size_t WindowExtractor::getMaximalNbReadsPerWindow() {
  return max_reads_per_window;
}

Mathieu Giraud's avatar
Mathieu Giraud committed
151 152 153 154 155
size_t WindowExtractor::getNbReads() {
  return nb_reads;
}

size_t WindowExtractor::getNbSegmented(SEGMENTED seg) {
156
  return stats[seg].nb;
Mathieu Giraud's avatar
Mathieu Giraud committed
157 158
}

159
size_t WindowExtractor::getNbReadsGermline(string germline) {
160
  return stats_reads[germline].getNbScores();
161 162
}

163 164 165 166
void WindowExtractor::setMaximalNbReadsPerWindow(size_t max_reads) {
  max_reads_per_window = max_reads;
}

Mathieu Giraud's avatar
Mathieu Giraud committed
167 168 169 170 171 172 173 174
void WindowExtractor::setSegmentedOutput(ostream *out) {
  out_segmented = out;
}

void WindowExtractor::setUnsegmentedOutput(ostream *out) {
  out_unsegmented = out;
}

175
void WindowExtractor::setUnsegmentedDetailOutput(ofstream **outs, bool unsegmented_detail_full) {
176
  out_unsegmented_detail = outs;
177
  this->unsegmented_detail_full = unsegmented_detail_full;
178 179
}

180 181 182 183
void WindowExtractor::setAffectsOutput(ostream *out) {
  out_affects = out;
}

184 185 186 187 188 189 190 191 192 193 194 195 196 197
void WindowExtractor::fillStatsClones(WindowsStorage *storage)
{
  for (map <junction, BinReadStorage >::iterator it = storage->begin();
       it != storage->end();
       it++)
    {
      junction junc = it->first;
      int nb_reads = it->second.getNbInserted();
      Germline *germline = storage->getGermline(junc);

      stats_clones[germline->code].addScore(nb_reads);
    }
}

Mathieu Giraud's avatar
Mathieu Giraud committed
198 199
void WindowExtractor::init_stats() {
  for (int i = 0; i < STATS_SIZE; i++) {
200
    stats[i].label = segmented_mesg[i];
Mathieu Giraud's avatar
Mathieu Giraud committed
201 202 203
  }
  nb_reads = 0;
}
204 205 206

void WindowExtractor::out_stats(ostream &out)
{
207 208 209 210 211 212
  out_stats_germlines(out);
  out << endl;
  out_stats_segmentation(out);
}

void WindowExtractor::out_stats_segmentation(ostream &out) {
213 214
  for (int i=0; i<STATS_SIZE; i++)
    {
215 216 217 218 219 220
      // stats[NOT_PROCESSED] should equal to 0
      if (i == NOT_PROCESSED && (!stats[i].nb))
        continue;

      // Pretty-print
      if (i == UNSEG_TOO_SHORT)
221
	out << endl;
222
      out << stats[i] << endl ;
223 224
    }
}
225 226 227

void WindowExtractor::out_stats_germlines(ostream &out) {
  out << "                          " ;
228
  out << "reads av. len     clones clo/rds" ;
229 230 231 232 233
  out << endl ;

  for (map<string,BinReadStorage>::iterator it = stats_reads.begin(); it != stats_reads.end(); ++it)
    {
      stats_reads[it->first].out_average_scores(out);
234
      stats_clones[it->first].out_average_scores(out, true);
235 236 237 238
      out << endl ;
    }

}
239 240 241 242 243

pair<int, int> WindowExtractor::get_best_length_shifts(size_t read_length,
                                                       size_t max_window_length,
                                                       int central_pos,
                                                       int shift) {
244 245 246
  if (central_pos < 0 || central_pos >= (int) read_length)
    return make_pair(0, 0);

247 248 249 250 251
  int constraint_left = 2 * central_pos + 1;
  int constraint_right = (read_length - central_pos) * 2;
  int best_length = min(constraint_left, constraint_right);
  int best_shift = 0;

252 253 254
  if (best_length == (int)max_window_length)
    return make_pair(best_length, 0);

255 256
  best_length = best_length / shift * shift;

257
  list<int> shifts {-1, 1, -2, 2};
258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273
  for (int current_shift : shifts) { // -1 will be a left shift
    int shifted_constraint_left = constraint_left + current_shift * shift * 2;
    int shifted_constraint_right = constraint_right - current_shift * shift * 2;

    shifted_constraint_left = shifted_constraint_left / shift * shift;
    shifted_constraint_right = shifted_constraint_right / shift * shift;

    int current_length = min(shifted_constraint_left, shifted_constraint_right);
    if (current_length > best_length && best_length < (int)max_window_length) {
      best_length = current_length;
      best_shift = current_shift;
    }
  }

  return make_pair(min((int)max_window_length, best_length), best_shift * shift);
}