windowExtractor.cpp 4.7 KB
Newer Older
Mathieu Giraud's avatar
Mathieu Giraud committed
1 2 3
#include "windowExtractor.h"
#include "segment.h"

4
// Progress bar
5 6
#define PROGRESS_POINT 25000
#define PROGRESS_LINE 40
7

8 9
WindowExtractor::WindowExtractor(MultiGermline *multigermline): out_segmented(NULL), out_unsegmented(NULL), out_affects(NULL), max_reads_per_window(~0), multigermline(multigermline){
}
Mathieu Giraud's avatar
Mathieu Giraud committed
10
                                    
11
WindowsStorage *WindowExtractor::extract(OnlineFasta *reads,
12
					 size_t w,
13
                                         map<string, string> &windows_labels, bool only_labeled_windows,
14
                                         int stop_after, int only_nth_read, bool keep_unsegmented_as_clone,
15
                                         double nb_expected, int nb_reads_for_evalue) {
Mathieu Giraud's avatar
Mathieu Giraud committed
16 17 18
  init_stats();

  WindowsStorage *windowsStorage = new WindowsStorage(windows_labels);
19
  windowsStorage->setMaximalNbReadsPerWindow(max_reads_per_window);
20 21 22 23 24
  for (list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it)
    {
      Germline *germline = *it ;
      nb_reads_germline[germline->code] = 0;
    }
25

26
  int nb_reads_all = 0;
27
  unsigned long long int bp_total = 0;
28 29

  while (reads->hasNext() && (int) nb_reads != stop_after) {
Mathieu Giraud's avatar
Mathieu Giraud committed
30
    reads->next();
31 32 33 34 35
    nb_reads_all++;

    if (nb_reads_all % only_nth_read)
      continue ;

Mathieu Giraud's avatar
Mathieu Giraud committed
36
    nb_reads++;
37

38 39
    if (out_affects) {
      *out_affects << reads->getSequence();
40
    }
41
    
42
    KmerMultiSegmenter kmseg(reads->getSequence(), multigermline, out_affects, nb_expected, nb_reads_for_evalue);
43
    
44
    KmerSegmenter *seg = kmseg.the_kseg ;
45 46 47 48 49 50 51 52 53 54

    // Window length threshold
    junction junc ;
    if (seg->isSegmented()) {
      junc = seg->getJunction(w);
      if (!junc.size()) {
        seg->setSegmentationStatus(UNSEG_TOO_SHORT_FOR_WINDOW);
      }
    }

55
    int read_length = seg->getSequence().sequence.length();
Mathieu Giraud's avatar
Mathieu Giraud committed
56

57
    // Update stats
58
    stats[seg->getSegmentationStatus()].insert(read_length);
59

60
    if (seg->isSegmented()) {
61 62 63 64

      // Filter
      if (!only_labeled_windows || (windows_labels.find(junc) != windows_labels.end()))

65 66
      // Store the window
      windowsStorage->add(junc, reads->getSequence(), seg->getSegmentationStatus(), seg->segmented_germline);
67

68
      // Update stats
69
      seg->segmented_germline->stats_reads.insert(read_length);
70 71
      stats[TOTAL_SEG_AND_WINDOW].insert(read_length) ;
      nb_reads_germline[seg->system]++;
Mathieu Giraud's avatar
Mathieu Giraud committed
72

73
      if (out_segmented) {
74
        *out_segmented << *seg ; // KmerSegmenter output (V/N/J)
75
      }
76 77 78 79 80
    } else {
      if (keep_unsegmented_as_clone && (reads->getSequence().sequence.length() >= w))
        {
          // Keep the unsegmented read, taking the full sequence as the junction
          windowsStorage->add(reads->getSequence().sequence, reads->getSequence(), seg->getSegmentationStatus(), seg->segmented_germline);
81
          stats[TOTAL_SEG_AND_WINDOW].insert(read_length) ; // TODO: rather count that in a pseudo-germline such as 'TRG!'
82 83 84
        }

      if (out_unsegmented) {
85
        *out_unsegmented << *seg ;
86
      }
87 88 89 90
    }

    // Last line of detailed affects output
    if (out_affects) {
91
      *out_affects << "#>" << seg->label << " " <<  seg->getInfoLine() << endl << endl;
Mathieu Giraud's avatar
Mathieu Giraud committed
92
    }
93 94 95 96 97 98 99 100 101

    // Progress bar
    bp_total += read_length;

    if (!(nb_reads % PROGRESS_POINT))
      {
	cout << "." ;

	if (!(nb_reads % (PROGRESS_POINT * PROGRESS_LINE)))
102
	  cout << setw(10) << nb_reads / 1000 << "k reads " << fixed << setprecision(2) << setw(14) << bp_total / 1E6 << " Mbp" << endl ;
103 104 105

	cout.flush() ;
      }
Mathieu Giraud's avatar
Mathieu Giraud committed
106
  }
107 108 109

  cout << endl ;

Mathieu Giraud's avatar
Mathieu Giraud committed
110 111 112 113
  return windowsStorage;
}

float WindowExtractor::getAverageSegmentationLength(SEGMENTED seg) {
114
  return stats[seg].getAverage();
Mathieu Giraud's avatar
Mathieu Giraud committed
115 116
}

117 118 119 120
size_t WindowExtractor::getMaximalNbReadsPerWindow() {
  return max_reads_per_window;
}

Mathieu Giraud's avatar
Mathieu Giraud committed
121 122 123 124 125
size_t WindowExtractor::getNbReads() {
  return nb_reads;
}

size_t WindowExtractor::getNbSegmented(SEGMENTED seg) {
126
  return stats[seg].nb;
Mathieu Giraud's avatar
Mathieu Giraud committed
127 128
}

129 130 131 132
size_t WindowExtractor::getNbReadsGermline(string germline) {
  return nb_reads_germline[germline];
}

133 134 135 136
void WindowExtractor::setMaximalNbReadsPerWindow(size_t max_reads) {
  max_reads_per_window = max_reads;
}

Mathieu Giraud's avatar
Mathieu Giraud committed
137 138 139 140 141 142 143 144
void WindowExtractor::setSegmentedOutput(ostream *out) {
  out_segmented = out;
}

void WindowExtractor::setUnsegmentedOutput(ostream *out) {
  out_unsegmented = out;
}

145 146 147 148
void WindowExtractor::setAffectsOutput(ostream *out) {
  out_affects = out;
}

Mathieu Giraud's avatar
Mathieu Giraud committed
149 150
void WindowExtractor::init_stats() {
  for (int i = 0; i < STATS_SIZE; i++) {
151
    stats[i].label = segmented_mesg[i];
Mathieu Giraud's avatar
Mathieu Giraud committed
152 153 154
  }
  nb_reads = 0;
}
155 156 157 158 159

void WindowExtractor::out_stats(ostream &out)
{
  for (int i=0; i<STATS_SIZE; i++)
    {
160 161 162 163 164 165
      // stats[NOT_PROCESSED] should equal to 0
      if (i == NOT_PROCESSED && (!stats[i].nb))
        continue;

      // Pretty-print
      if (i == UNSEG_TOO_SHORT)
166
	out << endl;
167
      out << stats[i] << endl ;
168 169
    }
}