segment.h 6.44 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
2
3
4
5
6
7
8
#ifndef SEGMENT_H
#define SEGMENT_H

#include <string>
#include <iostream>
#include "fasta.h"
#include "dynprog.h"
#include "tools.h"
9
#include "germline.h"
Mikaël Salson's avatar
Mikaël Salson committed
10
11
#include "kmerstore.h"
#include "kmeraffect.h"
Mathieu Giraud's avatar
Mathieu Giraud committed
12
13
#include "affectanalyser.h"
#include "json.h"
Mikaël Salson's avatar
Mikaël Salson committed
14
15

#define EXTEND_D_ZONE 5
16
17
18

#define MIN_D_LENGTH 5          /* If a D-REGION is smaller than this threshold, it is not output */

Mikaël Salson's avatar
Mikaël Salson committed
19
20
21
#define RATIO_STRAND 5          /* The ratio between the affectations in one
                                   strand and the other, to safely attribute a
                                   segment to a given strand */
Mikaël Salson's avatar
Mikaël Salson committed
22

23
#define DETECT_THRESHOLD 5      /* If the number of both V and J affectations
24
25
26
27
                                   is above this threshold, then the sequence
                                   will be labeled as 'detected', and, if it
                                   not segmented, the remaining germlines will
                                   not be tested */
28

Mathieu Giraud's avatar
Mathieu Giraud committed
29
30
#define JSON_REMEMBER_BEST  4   /* The number of V/D/J predictions to keep  */

31
32
33
34
35
#define NO_LIMIT_VALUE  -1

#define THRESHOLD_NB_EXPECTED NO_LIMIT_VALUE /* Threshold of the accepted expected value for number of found k-mers */


36

Mikaël Salson's avatar
Mikaël Salson committed
37
38
39
40
using namespace std;

enum SEGMENTED { DONT_KNOW, SEG_PLUS, SEG_MINUS, UNSEG_TOO_SHORT, UNSEG_STRAND_NOT_CONSISTENT, 
		 UNSEG_TOO_FEW_ZERO,  UNSEG_TOO_FEW_V, UNSEG_TOO_FEW_J, 
41
		 UNSEG_BAD_DELTA_MIN, UNSEG_BAD_DELTA_MAX, UNSEG_AMBIGUOUS, UNSEG_NOISY,
Mikaël Salson's avatar
Mikaël Salson committed
42
43
44
45
		 TOTAL_SEG_AND_WINDOW, 
		 TOTAL_SEG_BUT_TOO_SHORT_FOR_THE_WINDOW,
		 STATS_SIZE } ;
const char* const segmented_mesg[] = { "?", "SEG_+", "SEG_-", "UNSEG too short", "UNSEG strand",  
46
				       "UNSEG too few (0)", "UNSEG too few V", "UNSEG too few J",
47
				       "UNSEG < delta_min", "UNSEG > delta_max", "UNSEG ambiguous",
48
                                       "UNSEG noisy",
Mikaël Salson's avatar
Mikaël Salson committed
49
                                       "= SEG, with window",
50
                                       "= SEG, no window",
Mikaël Salson's avatar
Mikaël Salson committed
51
                                      } ;
Mikaël Salson's avatar
Mikaël Salson committed
52
53
54
55
56

class Segmenter {
protected:
  string label;
  string sequence;
Mathieu Giraud's avatar
Mathieu Giraud committed
57
58
  int Vend, Jstart;
  int Dstart, Dend;
Marc Duez's avatar
Marc Duez committed
59
  int CDR3start, CDR3end;
Mikaël Salson's avatar
Mikaël Salson committed
60
  bool reversed, segmented, dSegmented;
61
  int because;
Mikaël Salson's avatar
Mikaël Salson committed
62
63
64
65
66
67

  string removeChevauchement();
  bool finishSegmentation();
  bool finishSegmentationD();

 public:
68
  Germline *segmented_germline;
Mikaël Salson's avatar
Mikaël Salson committed
69
  string code;
Mikaël Salson's avatar
Mikaël Salson committed
70
  string code_short;
Mikaël Salson's avatar
Mikaël Salson committed
71
  string code_light;
72
73
  string info;        // .vdj.fa header, fixed fields
  string info_extra;  // .vdj.fa header, other information, at the end of the header
Mikaël Salson's avatar
Mikaël Salson committed
74
75
  int best_V, best_J ;
  int del_V, del_D_left, del_D_right, del_J ;
76
  string seg_V, seg_N, seg_J, system;
77

Mikaël Salson's avatar
Mikaël Salson committed
78
79
80
81
  int best_D;
  string seg_N1, seg_D, seg_N2;
  Cost segment_cost;

Mathieu Giraud's avatar
Mathieu Giraud committed
82
  virtual ~Segmenter();
Mikaël Salson's avatar
Mikaël Salson committed
83

Mathieu Giraud's avatar
Mathieu Giraud committed
84
  /* Queries */
Mikaël Salson's avatar
Mikaël Salson committed
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135



  Sequence getSequence() const ;

  /**
   * @param l: length around the junction
   * @return the string centered on the junction (ie. at position
   *         (getLeft() + getRight())/2).
   *         The string has length l unless the original string 
   *         is not long enough.
   *         The junction is revcomp-ed if the original string comes from reverse
   *         strand.
   */
  string getJunction(int l) const;

  /**
   * @return the left position (on forward strand) of the segmentation.
   */
  int getLeft() const;
  
  /**
   * @return the right position (on forward strand) of the segmentation
   */
  int getRight() const;
  
  /**
   * @return the left position (on forward strand) of the D segmentation.
   */
  int getLeftD() const;
  
  /**
   * @return the right position (on forward strand) of the D segmentation
   */
  int getRightD() const;

  /**
   * @return true iff the string comes from reverse strand
   */
  bool isReverse() const;

  /**
   * @return true iff the sequence has been successfully segmented
   */
  bool isSegmented() const;
  
  /**
   * @return true if a D gene was found in the N region
   */
  bool isDSegmented() const;

136
137
138
139
140
141
142
143
144
  /**
   * @return the status of the segmentation. Tells if the Sequence has been segmented
   *         of if it has not, what the reason is.
   * @assert getSegmentationStatus() == SEG_PLUS || getSegmentationStatus() == SEG_MINUS
   *         <==> isSegmented()
   */
  int getSegmentationStatus() const;

  string getInfoLine() const;
Mikaël Salson's avatar
Mikaël Salson committed
145

146
147
148
149
150
  /**
   * @post status == SEG_PLUS || status == SEG_MINUS <==> isSegmented()
   */
  void setSegmentationStatus(int status);

Mikaël Salson's avatar
Mikaël Salson committed
151
152
153
154
155
156
157
158
159
160
161
  friend ostream &operator<<(ostream &out, const Segmenter &s);
};



ostream &operator<<(ostream &out, const Segmenter &s);



class KmerSegmenter : public Segmenter
{
Mathieu Giraud's avatar
Mathieu Giraud committed
162
 private:
163
  int detected;
164
  KmerAffectAnalyser *kaa;
Mikaël Salson's avatar
Mikaël Salson committed
165
166
167
168
 protected:
  string affects;

 public:
169
  bool isDetected() const;
170
  int score;
171
172

  KmerSegmenter();
Mikaël Salson's avatar
Mikaël Salson committed
173
174
175
  /**
   * Build a segmenter based on KmerSegmentation
   * @param seq: An object read from a FASTA/FASTQ file
176
   * @param germline: the germline
Mikaël Salson's avatar
Mikaël Salson committed
177
   */
178
  KmerSegmenter(Sequence seq, Germline *germline);
Mathieu Giraud's avatar
Mathieu Giraud committed
179

180
181
  KmerSegmenter(const KmerSegmenter &seg);

Mathieu Giraud's avatar
Mathieu Giraud committed
182
183
  ~KmerSegmenter();

184
185
186
187
188
  /**
   * @return probability that the score of the segmenter is 'at_least' or more
   */
  double getProbabilityAtLeastOrAbove(int at_least) const;

Mathieu Giraud's avatar
Mathieu Giraud committed
189
190
191
  /**
   * @return the KmerAffectAnalyser of the current sequence.
   */
192
  KmerAffectAnalyser *getKmerAffectAnalyser() const;
Mathieu Giraud's avatar
Mathieu Giraud committed
193

194
195
  void toJsonList(JsonList *seg);

Mathieu Giraud's avatar
Mathieu Giraud committed
196
 private:
197
  void computeSegmentation(int strand, KmerAffect left, KmerAffect right);
Mikaël Salson's avatar
Mikaël Salson committed
198
199
};

200
201
202

class KmerMultiSegmenter
{
203
204
 private:
  double threshold_nb_expected;
205
206
207
 public:
  /**
   * @param seq: An object read from a FASTA/FASTQ file
208
209
210
211
212
213
214
   * @param multigermline: the multigerm
   * @param threshold: threshold of randomly expected segmentation
   */
  KmerMultiSegmenter(Sequence seq, MultiGermline *multigermline, ostream *out_unsegmented,
                     double threshold = THRESHOLD_NB_EXPECTED);

  /**
215
   * @return expected number of Segmenter that would have yield the maximum score by chance
216
   */
217
218
  double getNbExpected() const;

219
220
  ~KmerMultiSegmenter();

221
  KmerSegmenter *the_kseg;
222
  MultiGermline *multi_germline;
223
224
225
};


Mikaël Salson's avatar
Mikaël Salson committed
226
227
228
class FineSegmenter : public Segmenter
{
 public:
Mikaël Salson's avatar
Mikaël Salson committed
229
230
231
232
   vector<pair<int, int> > score_V;
   vector<pair<int, int> > score_D;
   vector<pair<int, int> > score_J;
   
Mikaël Salson's avatar
Mikaël Salson committed
233
234
235
   /**
   * Build a fineSegmenter based on KmerSegmentation
   * @param seq: An object read from a FASTA/FASTQ file
236
   * @param germline: germline used
Mikaël Salson's avatar
Mikaël Salson committed
237
   */
238
  FineSegmenter(Sequence seq, Germline *germline, Cost segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
239
240
241
  
  /**
  * extend segmentation from VJ to VDJ
242
  * @param germline: germline used
Mikaël Salson's avatar
Mikaël Salson committed
243
  */
244
  void FineSegmentD(Germline *germline);
Marc Duez's avatar
Marc Duez committed
245
  void findCDR3();
Mathieu Giraud's avatar
Mathieu Giraud committed
246

247
  void toJsonList(JsonList *seg);
Mikaël Salson's avatar
Mikaël Salson committed
248
  
Mikaël Salson's avatar
Mikaël Salson committed
249
250
251
252
253
};



#endif