segment.cpp 26.2 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2
3
4
5
6
7
8
  This file is part of Vidjil <http://www.vidjil.org>
  Copyright (C) 2011, 2012, 2013, 2014, 2015 by Bonsai bioinformatics 
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9
10
11
12
13
14
15
16
17
18
19
20
21
22

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/
Mikaël Salson's avatar
Mikaël Salson committed
23
#include <algorithm>    // std::sort
Mikaël Salson's avatar
Mikaël Salson committed
24
25
26
27
#include <cassert>
#include "segment.h"
#include "tools.h"
#include "affectanalyser.h"
Mikaël Salson's avatar
Mikaël Salson committed
28
#include <sstream>
Marc Duez's avatar
Marc Duez committed
29
#include <string>
Mikaël Salson's avatar
Mikaël Salson committed
30

Mathieu Giraud's avatar
Mathieu Giraud committed
31
Segmenter::~Segmenter() {}
Mikaël Salson's avatar
Mikaël Salson committed
32

Mathieu Giraud's avatar
Mathieu Giraud committed
33
Sequence Segmenter::getSequence() const {
Mikaël Salson's avatar
Mikaël Salson committed
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
  Sequence s ;
  s.label_full = info ;
  s.label = label + " " + (reversed ? "-" : "+");
  s.sequence = revcomp(sequence, reversed);

  return s ;
}

string Segmenter::getJunction(int l) const {
  assert(isSegmented());

  int start = (getLeft() + getRight())/2 - l/2;
  
  if (start < 0 or start + l > (int)sequence.size())  // TODO: +l ou +l-1 ?
    return "" ;

  return getSequence().sequence.substr(start, l);
}

int Segmenter::getLeft() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
54
  return Vend;
Mikaël Salson's avatar
Mikaël Salson committed
55
56
57
}
  
int Segmenter::getRight() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
58
  return Jstart;
Mikaël Salson's avatar
Mikaël Salson committed
59
60
61
}

int Segmenter::getLeftD() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
62
  return Dstart;
Mikaël Salson's avatar
Mikaël Salson committed
63
64
65
}
  
int Segmenter::getRightD() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
66
  return Dend;
Mikaël Salson's avatar
Mikaël Salson committed
67
68
69
70
71
72
73
74
75
76
77
78
79
80
}

bool Segmenter::isReverse() const {
  return reversed;
}

bool Segmenter::isSegmented() const {
  return segmented;
}

bool Segmenter::isDSegmented() const {
  return dSegmented;
}

81
82
83
84
bool KmerSegmenter::isDetected() const {
  return detected;
}

Mikaël Salson's avatar
Mikaël Salson committed
85
86
87
88
89
90
91
92
// Chevauchement

string Segmenter::removeChevauchement()
{
  assert(isSegmented());
  
  string chevauchement = "" ;

Mathieu Giraud's avatar
Mathieu Giraud committed
93
  if (Vend >= Jstart)
Mikaël Salson's avatar
Mikaël Salson committed
94
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
95
      int middle = (Vend + Jstart) / 2 ;
96
      chevauchement = " !ov " + string_of_int (Vend - Jstart + 1);
Mathieu Giraud's avatar
Mathieu Giraud committed
97
98
      Vend = middle ;
      Jstart = middle+1 ;
Mikaël Salson's avatar
Mikaël Salson committed
99
100
101
102
103
104
105
106
107
108
109
110
111
112
    }

  return chevauchement ;
}

// Prettyprint


bool Segmenter::finishSegmentation() 
{
  assert(isSegmented());
  
  string seq = getSequence().sequence;
    
Mathieu Giraud's avatar
Mathieu Giraud committed
113
114
115
116
117
  seg_V = seq.substr(0, Vend+1) ;
  seg_N = seq.substr(Vend+1, Jstart-Vend-1) ;  // Twice computed for FineSegmenter, but only once in KmerSegmenter !
  seg_J = seq.substr(Jstart) ;
  Dstart=0;
  Dend=0;
Mikaël Salson's avatar
Mikaël Salson committed
118
119
120
121
122
123
124
125
126
127
128
129
130

  info = "VJ \t" + string_of_int(FIRST_POS) + " " + info + " " + string_of_int(seq.size() - 1 + FIRST_POS) ;
  info += "\t" + code ;

  info = (reversed ? "- " : "+ ") + info ;

  return true ;
}

bool Segmenter::finishSegmentationD() 
{
  string seq = getSequence().sequence;

Mathieu Giraud's avatar
Mathieu Giraud committed
131
132
  seg_V = seq.substr(0, Vend+1) ; // From pos. 0 to Vend
  seg_J = seq.substr(Jstart) ;
Mikaël Salson's avatar
Mikaël Salson committed
133
  
Mathieu Giraud's avatar
Mathieu Giraud committed
134
  seg_D  = seq.substr(Dstart, Dend-Dstart+1) ; // From Dstart to Dend
Mikaël Salson's avatar
Mikaël Salson committed
135
  
Mathieu Giraud's avatar
Mathieu Giraud committed
136
137
138
139
  info = "VDJ \t0 " + string_of_int(Vend) +
		" " + string_of_int(Dstart) + 
		" " + string_of_int(Dend) +
		" " + string_of_int(Jstart) +
Mikaël Salson's avatar
Mikaël Salson committed
140
141
142
143
144
145
146
147
148
		" " + string_of_int(seq.size()-1+FIRST_POS) ;
		
  info += "\t" + code ;
  
  info = (reversed ? "- " : "+ ") + info ;

  return true ;
}

149
150
151
152
153
154
155
156
157
string Segmenter::getInfoLine() const
{
  string s = ">" ;

  s += label + " " ;
  s += (segmented ? "" : "! ") + info ;
  s += " " + info_extra ;
  s += " " + segmented_germline->code ;
  s += " " + string(segmented_mesg[because]) ;
158
159
160
161

  if (evalue > NO_LIMIT_VALUE)
    s += " " + scientific_string_of_double(evalue);

162
163
164
165
166
  if (evalue_left > NO_LIMIT_VALUE)
    s += " " + scientific_string_of_double(evalue_left);
  if (evalue_right > NO_LIMIT_VALUE)
    s += "/" + scientific_string_of_double(evalue_right);

167
168
169
  return s ;
}

Mikaël Salson's avatar
Mikaël Salson committed
170
171
ostream &operator<<(ostream &out, const Segmenter &s)
{
172
  out << s.getInfoLine() << endl;
Mikaël Salson's avatar
Mikaël Salson committed
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190

  if (s.segmented)
    {
      out << s.seg_V << endl ;
      out << s.seg_N << endl ;
      out << s.seg_J << endl ;
    }
  else
    {
      out << s.sequence << endl ;
    }

  return out ;
}


// KmerSegmenter (Cheap)

191
192
193
KmerSegmenter::KmerSegmenter() { kaa = 0 ; }

KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline)
Mikaël Salson's avatar
Mikaël Salson committed
194
195
196
197
{
  label = seq.label ;
  sequence = seq.sequence ;
  info = "" ;
198
  info_extra = "seed";
Mikaël Salson's avatar
Mikaël Salson committed
199
  segmented = false;
200
  segmented_germline = germline ;
201
  reversed = false;
Mathieu Giraud's avatar
Mathieu Giraud committed
202
  Dend=0;
203
204
  because = 0 ; // Cause of unsegmentation
  score = 0 ;
205
  evalue = NO_LIMIT_VALUE;
206
207
  evalue_left = NO_LIMIT_VALUE;
  evalue_right = NO_LIMIT_VALUE;
208

209
  int s = (size_t)germline->index->getS() ;
Mikaël Salson's avatar
Mikaël Salson committed
210
  int length = sequence.length() ;
Mikaël Salson's avatar
Mikaël Salson committed
211

212
  if (length < s) 
Mikaël Salson's avatar
Mikaël Salson committed
213
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
214
215
      because = UNSEG_TOO_SHORT;
      kaa = NULL;
216
      return ;
Mikaël Salson's avatar
Mikaël Salson committed
217
218
    }
 
219
  kaa = new KmerAffectAnalyser(*(germline->index), sequence);
Mikaël Salson's avatar
Mikaël Salson committed
220
221
  
  // Check strand consistency among the affectations.
Mikaël Salson's avatar
Mikaël Salson committed
222
223
224
225
226
227
228
229
230
  int strand;
  int nb_strand[2] = {0,0};     // In cell 0 we'll put the number of negative
                                // strand, while in cell 1 we'll put the
                                // positives
  for (int i = 0; i < kaa->count(); i++) { 
    KmerAffect it = kaa->getAffectation(i);
    if (! it.isAmbiguous() && ! it.isUnknown()) {
      strand = affect_strand(it.affect);
      nb_strand[(strand + 1) / 2] ++; // (strand+1) / 2 → 0 if strand == -1; 1 if strand == 1
Mikaël Salson's avatar
Mikaël Salson committed
231
232
233
    }
  }

234
235
236
  KmerAffect before, after;

  // Test on which strand we are, select the before and after KmerAffects
Mikaël Salson's avatar
Mikaël Salson committed
237
  if (nb_strand[0] == 0 && nb_strand[1] == 0) {
238
    because = UNSEG_TOO_FEW_ZERO ;
239
    return ;
Mikaël Salson's avatar
Mikaël Salson committed
240
241
  } else if (nb_strand[0] > RATIO_STRAND * nb_strand[1]) {
    strand = -1;
242
243
    before = KmerAffect(germline->affect_3, -1); 
    after = KmerAffect(germline->affect_5, -1);
Mikaël Salson's avatar
Mikaël Salson committed
244
245
  } else if (nb_strand[1] > RATIO_STRAND * nb_strand[0]) {
    strand = 1;
246
247
    before = KmerAffect(germline->affect_5, 1); 
    after = KmerAffect(germline->affect_3, 1);    
Mikaël Salson's avatar
Mikaël Salson committed
248
249
250
  } else {
    // Ambiguous information: we have positive and negative strands
    // and there is not enough difference to put them aparat.
251
252
    because = UNSEG_STRAND_NOT_CONSISTENT ;
    return ;
Mikaël Salson's avatar
Mikaël Salson committed
253
254
  }

255
  detected = false ;
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
  computeSegmentation(strand, before, after);

  if (! because)
    {
      // Now we check the delta between Vend and right
   
      if (Jstart - Vend < germline->delta_min)
	{
	  because = UNSEG_BAD_DELTA_MIN ;
	}

      if (Jstart - Vend > germline->delta_max)
	{
	  because = UNSEG_BAD_DELTA_MAX ;
	}
    } 
Mathieu Giraud's avatar
Mathieu Giraud committed
272

273
  if (!because)
Mathieu Giraud's avatar
Mathieu Giraud committed
274
275
    {
      // Yes, it is segmented
276
      segmented = true;
Mathieu Giraud's avatar
Mathieu Giraud committed
277
278
279
280
      reversed = (strand == -1); 
      because = reversed ? SEG_MINUS : SEG_PLUS ;

      info = string_of_int(Vend + FIRST_POS) + " " + string_of_int(Jstart + FIRST_POS)  ;
281
282
      // removeChevauchement is called once info was already computed: it is only to output info_extra
      info_extra += removeChevauchement();
Mathieu Giraud's avatar
Mathieu Giraud committed
283
      finishSegmentation();
284
      system = germline->code;
285
      return ;
Mathieu Giraud's avatar
Mathieu Giraud committed
286
    } 
287

288
 
Mathieu Giraud's avatar
Mathieu Giraud committed
289
}
Mikaël Salson's avatar
Mikaël Salson committed
290

Mathieu Giraud's avatar
Mathieu Giraud committed
291
KmerSegmenter::~KmerSegmenter() {
292
293
  if (kaa)
    delete kaa;
294
295
}

296
KmerMultiSegmenter::KmerMultiSegmenter(Sequence seq, MultiGermline *multigermline, ostream *out_unsegmented, double threshold)
297
{
298
  int best_score_seg = 0 ; // Best score, segmented sequences
299
  int best_score_unseg = 0 ; // Best score, unsegmented sequences
300
  the_kseg = NULL;
301
  multi_germline = multigermline;
302
  threshold_nb_expected = threshold;
303
  
304
305
306
307
308
  // Iterate over the germlines
  for (list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it)
    {
      Germline *germline = *it ;

309
310
      KmerSegmenter *kseg = new KmerSegmenter(seq, germline);
      bool keep_seg = false;
311

312
313
314
315
      if (out_unsegmented)
        {
          // Debug, display k-mer affectation and segmentation result for this germline
          *out_unsegmented << "#"
316
317
                           << left << setw(4) << kseg->segmented_germline->code << " "
                           << left << setw(20) << segmented_mesg[kseg->getSegmentationStatus()] << " ";
318

319
          *out_unsegmented << right << setw(3) << kseg->score << " ";
320
          
321
322
          if (kseg->getSegmentationStatus() != UNSEG_TOO_SHORT) 
            *out_unsegmented << kseg->getKmerAffectAnalyser()->toString();
323
324
325
326

          *out_unsegmented << endl ;
        }

327
328
      // Always remember the first kseg
      if (the_kseg == NULL)
329
        keep_seg = true;
330
      
331
      if (kseg->isSegmented())
332
333
        {
          // Yes, it is segmented
334
          // Should we keep the kseg ?
335
          if (kseg->score > best_score_seg)
336
            {
337
              keep_seg = true;
338
              best_score_seg = kseg->score ;
339
            }
340
        }
341
342
343
344
345
346
347
348
349
350
351
352
      else
        {
          // It is not segmented
          // Should we keep the kseg (with the unsegmentation cause) ?
            if (kseg->score > best_score_unseg)
            {              
              best_score_unseg = kseg->score ;
              if (!best_score_seg)
                keep_seg = true;
            }
        }
      
353
      if (keep_seg) {
354
355
        if (the_kseg)
          delete the_kseg;
356
357
358
359
        the_kseg = kseg;
      } else {
        delete kseg;
      }
360
    } // end for (Germlines)
361

362
  // E-value threshold
363
  if (threshold_nb_expected > NO_LIMIT_VALUE)
364
365
366
367
368
369
    if (the_kseg->isSegmented()) {
        // the_kseg->evalue also depends on the number of germlines from the *Multi*KmerSegmenter
        the_kseg->evalue = getNbExpected();
        if (the_kseg->evalue > threshold_nb_expected) {
          the_kseg->setSegmentationStatus(UNSEG_NOISY);
        }
370

371
        pair <double, double> p = getNbExpectedLeftRight();
372
373
374
375
376
377
378
379
380
        the_kseg->evalue_left = p.first;
        the_kseg->evalue_right = p.second;

        if (the_kseg->evalue_left > threshold_nb_expected) {
          the_kseg->setSegmentationStatus(UNSEG_NOISY); // TOO_FEW_V ?
        }
        if (the_kseg->evalue_right > threshold_nb_expected) {
          the_kseg->setSegmentationStatus(UNSEG_NOISY); // TOO_FEW_J ?
        }
381
      }
382
383
}

384
double KmerMultiSegmenter::getNbExpected() const {
385
386
387
  pair <double, double> p = getNbExpectedLeftRight();
  return (p.first + p.second);
}
388
389
390
391

pair<double,double> KmerMultiSegmenter::getNbExpectedLeftRight() const {
  pair <double, double> p = the_kseg->getKmerAffectAnalyser()->getLeftRightProbabilityAtLeastOrAbove();
  return pair<double, double>(p.first * multi_germline->germlines.size(), p.second * multi_germline->germlines.size());
392
393
394
}

KmerMultiSegmenter::~KmerMultiSegmenter() {
395
396
  if (the_kseg)
    delete the_kseg;
Mathieu Giraud's avatar
Mathieu Giraud committed
397
398
}

399
400
void KmerSegmenter::computeSegmentation(int strand, KmerAffect before, KmerAffect after) {
  // Try to segment, computing 'Vend' and 'Jstart'
401
402
  // If not segmented, put the cause of unsegmentation in 'because'

403
  affect_infos max;
Mikaël Salson's avatar
Mikaël Salson committed
404

405
  max = kaa->getMaximum(before, after); 
406
407
408
409
410

      // We labeled it detected if there were both enough affect_5 and enough affect_3
      detected = (max.nb_before_left + max.nb_before_right >= DETECT_THRESHOLD)
        && (max.nb_after_left + max.nb_after_right >= DETECT_THRESHOLD);
      
411
      if (! max.max_found) {
412
413
        if ((strand == 1 && max.nb_before_left == 0)
            || (strand == -1 && max.nb_after_right == 0)) 
414
          because = detected ? UNSEG_AMBIGUOUS : UNSEG_TOO_FEW_V ;
415
416
        else if ((strand == 1 && max.nb_after_right == 0)
                 || (strand == -1 && max.nb_before_left == 0))
Mikaël Salson's avatar
Mikaël Salson committed
417
	{
418
	  because = detected ? UNSEG_AMBIGUOUS : UNSEG_TOO_FEW_J ;
419
	} else 
420
421
422
423
424
425
426
427
428
429
          because = UNSEG_AMBIGUOUS; 
      } else {
        Vend = max.first_pos_max;
        Jstart = max.last_pos_max + 1;
        if (strand == -1) {
          int tmp = sequence.size() - Vend - 1;
          Vend = sequence.size() - Jstart - 1;
          Jstart = tmp;
        }
      }
Mikaël Salson's avatar
Mikaël Salson committed
430
  
431
    score = max.nb_before_left + max.nb_before_right + max.nb_after_left + max.nb_after_right;  
Mathieu Giraud's avatar
Mathieu Giraud committed
432
}
Mikaël Salson's avatar
Mikaël Salson committed
433

434
KmerAffectAnalyser *KmerSegmenter::getKmerAffectAnalyser() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
435
436
  return kaa;
}
Mikaël Salson's avatar
Mikaël Salson committed
437

438
int Segmenter::getSegmentationStatus() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
439
  return because;
Mikaël Salson's avatar
Mikaël Salson committed
440
441
}

442
443
444
445
446
void Segmenter::setSegmentationStatus(int status) {
  because = status;
  segmented = (status == SEG_PLUS || status == SEG_MINUS);
}

Mikaël Salson's avatar
Mikaël Salson committed
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
// FineSegmenter

void best_align(int overlap, string seq_left, string seq_right,
		string ref_left ,string ref_right, int *b_r, int *b_l, Cost segment_cost)
{
      int score_r[overlap+1];
      int score_l[overlap+1];
      
      //LEFT
      ref_left=string(ref_left.rbegin(), ref_left.rend());
      seq_left=string(seq_left.rbegin(), seq_left.rend()); 
      
      DynProg dp1 = DynProg(seq_left, ref_left,
			   DynProg::Local, segment_cost);
      score_l[0] = dp1.compute();
      dp1.backtrack();
      
      //RIGHT
      DynProg dp = DynProg(seq_right, ref_right,
			   DynProg::Local, segment_cost);
      score_r[0] = dp.compute();
      dp.backtrack();    
      
      for(int i=1; i<=overlap; i++){
	if(dp.best_i-i >0)
472
	score_r[i] = dp.B[dp.best_i-i][dp.best_j].score;
Mikaël Salson's avatar
Mikaël Salson committed
473
474
475
	else
	score_r[i] =0;
	if(dp1.best_i-i >0)
476
	score_l[i] = dp1.B[dp1.best_i-i][dp1.best_j].score;	
Mikaël Salson's avatar
Mikaël Salson committed
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
	else
	score_l[i] =0;
      }
      int score=-1000000;
      int best_r=0;
      int best_l=0;

      for (int i=0; i<=overlap; i++){
	for (int j=overlap-i; j<=overlap; j++){
	  if ( ((score_r[i]+score_l[j]) == score) && (i+j < best_r+best_l )){
	    best_r=i;
	    best_l=j;
	    score=(score_r[i]+score_l[j]) ;
	  }
	  if ( (score_r[i]+score_l[j]) > score){
	    best_r=i;
	    best_l=j;
	    score=(score_r[i]+score_l[j]) ;
	  }
	}
      }
      *b_r=best_r;
      *b_l=best_l;
}

Mikaël Salson's avatar
Mikaël Salson committed
502
503
504
505
506
bool comp_pair (pair<int,int> i,pair<int,int> j)
{
  return ( i.first > j.first);
}

Mathieu Giraud's avatar
Mathieu Giraud committed
507
int align_against_collection(string &read, Fasta &rep, bool reverse_both, bool local, int *tag, 
Mikaël Salson's avatar
Mikaël Salson committed
508
			     int *del, int *del2, int *begin, int *length, vector<pair<int, int> > *score
Mikaël Salson's avatar
Mikaël Salson committed
509
510
511
512
513
514
515
516
517
518
			    , Cost segment_cost)
{
  
  int best_score = MINUS_INF ;
  int best_r = MINUS_INF ;
  int best_best_i = (int) string::npos ;
  int best_best_j = (int) string::npos ;
  int best_first_i = (int) string::npos ;
  int best_first_j = (int) string::npos ;
  string best_label = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
519
  vector<pair<int, int> > score_r;
Mikaël Salson's avatar
Mikaël Salson committed
520
521
522
523
524
525
526
527
528
529
530

  DynProg::DynProgMode dpMode = DynProg::LocalEndWithSomeDeletions;
  if (local==true) dpMode = DynProg::Local;
  
  for (int r = 0 ; r < rep.size() ; r++)
    {
      DynProg dp = DynProg(read, rep.sequence(r),
			   dpMode, // DynProg::SemiGlobalTrans, 
			   segment_cost, // DNA
			   reverse_both, reverse_both);
      int score = dp.compute();
Mikaël Salson's avatar
Mikaël Salson committed
531
      
Mikaël Salson's avatar
Mikaël Salson committed
532
533
534
535
536
537
538
539
540
541
542
543
544
545
      if (local==true){ 
	dp.backtrack();
      }
      
      if (score > best_score)
	{
	  best_score = score ;
	  best_best_i = dp.best_i ;
	  best_best_j = dp.best_j ;
	  best_first_i = dp.first_i ;
	  best_first_j = dp.first_j ;
	  best_r = r ;
	  best_label = rep.label(r) ;
	}
Mikaël Salson's avatar
Mikaël Salson committed
546
547
	
	score_r.push_back(make_pair(score, r));
Mathieu Giraud's avatar
Mathieu Giraud committed
548
549
550
551
552
553

	// #define DEBUG_SEGMENT      

#ifdef DEBUG_SEGMENT	
	cout << rep.label(r) << " " << score << " " << dp.best_i << endl ;
#endif
Mikaël Salson's avatar
Mikaël Salson committed
554
555

    }
Mikaël Salson's avatar
Mikaël Salson committed
556
    sort(score_r.begin(),score_r.end(),comp_pair);
Mikaël Salson's avatar
Mikaël Salson committed
557
558
559
560

  *del = reverse_both ? best_best_j : rep.sequence(best_r).size() - best_best_j - 1;
  *del2 = best_first_j;
  *begin = best_first_i;
Mathieu Giraud's avatar
Mathieu Giraud committed
561
  *tag = best_r ; 
Mikaël Salson's avatar
Mikaël Salson committed
562
563
  
  *length -= *del ;
Mikaël Salson's avatar
Mikaël Salson committed
564
565
  
  *score=score_r;
Mathieu Giraud's avatar
Mathieu Giraud committed
566
567
568
569
570
571

#ifdef DEBUG_SEGMENT	
  cout << "best: " << best_labels << " " << best_score ;
  cout << "del/del2/begin:" << (*del) << "/" << (*del2) << "/" << (*begin) << endl;
  cout << endl;
#endif
Mikaël Salson's avatar
Mikaël Salson committed
572
573
574
575
576
577
578
579
580
  
  return best_best_i ;
}

string format_del(int deletions)
{
  return deletions ? *"(" + string_of_int(deletions) + " del)" : "" ;
}

581
FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c)
Mikaël Salson's avatar
Mikaël Salson committed
582
{
583
584
  segmented = false;
  dSegmented = false;
585
  because = 0 ;
586
  segmented_germline = germline ;
587
  info_extra = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
588
589
  label = seq.label ;
  sequence = seq.sequence ;
Mathieu Giraud's avatar
Mathieu Giraud committed
590
  Dend=0;
Mikaël Salson's avatar
Mikaël Salson committed
591
  segment_cost=segment_c;
592
  evalue = NO_LIMIT_VALUE;
593
594
  evalue_left = NO_LIMIT_VALUE;
  evalue_right = NO_LIMIT_VALUE;
Mikaël Salson's avatar
Mikaël Salson committed
595

596
597
598
  CDR3start = -1;
  CDR3end = -1;
  
Mikaël Salson's avatar
Mikaël Salson committed
599
600
601
602
603
  // TODO: factoriser tout cela, peut-etre en lancant deux segmenteurs, un +, un -, puis un qui chapote
  
  // Strand +
  
  int plus_score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
604
  int tag_plus_V, tag_plus_J;
Mikaël Salson's avatar
Mikaël Salson committed
605
606
607
608
  int plus_length = 0 ;
  int del_plus_V, del_plus_J ;
  int del2=0;
  int beg=0;
Mikaël Salson's avatar
Mikaël Salson committed
609
610
611
612
  
  vector<pair<int, int> > score_plus_V;
  vector<pair<int, int> > score_plus_J;
  
613
  int plus_left = align_against_collection(sequence, germline->rep_5, false, false, &tag_plus_V, &del_plus_V, &del2, &beg, 
Mikaël Salson's avatar
Mikaël Salson committed
614
615
					   &plus_length, &score_plus_V
					   , segment_cost);
616
  int plus_right = align_against_collection(sequence, germline->rep_3, true, false, &tag_plus_J, &del_plus_J, &del2, &beg,
Mikaël Salson's avatar
Mikaël Salson committed
617
618
					    &plus_length, &score_plus_J
					    , segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
619
620
  plus_length += plus_right - plus_left ;

Mikaël Salson's avatar
Mikaël Salson committed
621
622
  plus_score=score_plus_V[0].first + score_plus_J[0].first ;
  
Mikaël Salson's avatar
Mikaël Salson committed
623
624
625
  // Strand -
  string rc = revcomp(sequence) ;
  int minus_score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
626
  int tag_minus_V, tag_minus_J;
Mikaël Salson's avatar
Mikaël Salson committed
627
628
  int minus_length = 0 ;
  int del_minus_V, del_minus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
629
630
631
632
  
  vector<pair<int, int> > score_minus_V;
  vector<pair<int, int> > score_minus_J;
  
633
  int minus_left = align_against_collection(rc, germline->rep_5, false, false, &tag_minus_V, &del_minus_V, &del2, &beg,
Mikaël Salson's avatar
Mikaël Salson committed
634
635
					    &minus_length, &score_minus_V
					    ,  segment_cost);
636
  int minus_right = align_against_collection(rc, germline->rep_3, true, false, &tag_minus_J, &del_minus_J, &del2, &beg,
Mikaël Salson's avatar
Mikaël Salson committed
637
638
					     &minus_length, &score_minus_J
					     , segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
639
640
  minus_length += minus_right - minus_left ;

Mikaël Salson's avatar
Mikaël Salson committed
641
642
  minus_score=score_minus_V[0].first + score_minus_J[0].first ;
  
Mikaël Salson's avatar
Mikaël Salson committed
643
644
645
646
  reversed = (minus_score > plus_score) ;

  if (!reversed)
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
647
648
649
650
      Vend = plus_left ;
      Jstart = plus_right ;
      best_V = tag_plus_V ;
      best_J = tag_plus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
651
652
      del_V = del_plus_V ;
      del_J = del_plus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
653
654
      score_V=score_plus_V;
      score_J=score_plus_J;
Mikaël Salson's avatar
Mikaël Salson committed
655
656
657
    }
  else
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
658
659
660
661
      Vend = minus_left ;
      Jstart = minus_right ;
      best_V = tag_minus_V ;
      best_J = tag_minus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
662
663
      del_V = del_minus_V ;
      del_J = del_minus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
664
665
      score_V=score_minus_V;
      score_J=score_minus_J;
Mikaël Salson's avatar
Mikaël Salson committed
666
667
    }

Mathieu Giraud's avatar
Mathieu Giraud committed
668
  segmented = (Vend != (int) string::npos) && (Jstart != (int) string::npos) && 
669
    (Jstart - Vend >= germline->delta_min) && (Jstart - Vend <= germline->delta_max);
Mikaël Salson's avatar
Mikaël Salson committed
670
671
672
    
  if (!segmented)
    {
673
      because = DONT_KNOW;
Mathieu Giraud's avatar
Mathieu Giraud committed
674
      info = " @" + string_of_int (Vend + FIRST_POS) + "  @" + string_of_int(Jstart + FIRST_POS) ;
675
      
676
      if (Jstart - Vend < germline->delta_min)
677
678
679
680
        {
          because = UNSEG_BAD_DELTA_MIN  ;
        }

681
      if (Jstart - Vend > germline->delta_max)
682
683
684
685
686
687
688
689
690
691
692
693
694
695
        {
          because = UNSEG_BAD_DELTA_MAX  ;
        }
        
      if (Vend == (int) string::npos) 
        {
          because = UNSEG_TOO_FEW_V ;
        }
      
      if (Jstart == (int) string::npos)
        {
          because = UNSEG_TOO_FEW_J ;
        }
      
Mikaël Salson's avatar
Mikaël Salson committed
696
697
698
      return ;
    }
    
699
700
    because = reversed ? SEG_MINUS : SEG_PLUS ;
    
Mikaël Salson's avatar
Mikaël Salson committed
701
    //overlap VJ
Mathieu Giraud's avatar
Mathieu Giraud committed
702
    if(Jstart-Vend <=0){
Mikaël Salson's avatar
Mikaël Salson committed
703
      int b_r, b_l;
Mathieu Giraud's avatar
Mathieu Giraud committed
704
      int overlap=Vend-Jstart+1;
Mikaël Salson's avatar
Mikaël Salson committed
705
      
Mathieu Giraud's avatar
Mathieu Giraud committed
706
707
      string seq_left = sequence.substr(0, Vend+1);
      string seq_right = sequence.substr(Jstart);
Mikaël Salson's avatar
Mikaël Salson committed
708
709

      best_align(overlap, seq_left, seq_right, 
710
		 germline->rep_5.sequence(best_V), germline->rep_3.sequence(best_J), &b_r,&b_l, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
711
      // Trim V
Mathieu Giraud's avatar
Mathieu Giraud committed
712
      Vend -= b_l;
Mikaël Salson's avatar
Mikaël Salson committed
713
714
715
      del_V += b_l;

      // Trim J
Mathieu Giraud's avatar
Mathieu Giraud committed
716
      Jstart += b_r;
Mikaël Salson's avatar
Mikaël Salson committed
717
      del_J += b_r;
718
      if (Jstart>=(int) sequence.length())
Mathieu Giraud's avatar
Mathieu Giraud committed
719
	  Jstart=sequence.length()-1;
Mikaël Salson's avatar
Mikaël Salson committed
720
721
722
    }

    // string chevauchement = removeChevauchement();
Mikaël Salson's avatar
Mikaël Salson committed
723
724

    /// used only below, then recomputed in finishSegmentation() ;
Mathieu Giraud's avatar
Mathieu Giraud committed
725
    seg_N = revcomp(sequence, reversed).substr(Vend+1, Jstart-Vend-1); 
Mikaël Salson's avatar
Mikaël Salson committed
726

727
  code = germline->rep_5.label(best_V) +
Mikaël Salson's avatar
Mikaël Salson committed
728
729
730
731
    " "+ string_of_int(del_V) + 
    "/" + seg_N + 
    // chevauchement +
    "/" + string_of_int(del_J) +
732
    " " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
733

Mikaël Salson's avatar
Mikaël Salson committed
734
    stringstream code_s;
735
   code_s<< germline->rep_5.label(best_V) <<
Mikaël Salson's avatar
Mikaël Salson committed
736
737
738
739
    " -" << string_of_int(del_V) << "/" 
    << seg_N.size()
    // chevauchement +
    << "/-" << string_of_int(del_J)
740
    <<" " << germline->rep_3.label(best_J);
Mikaël Salson's avatar
Mikaël Salson committed
741
742
    code_short=code_s.str();
    
743
744
  code_light = germline->rep_5.label(best_V) +
    "/ " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
745
746

 
Mathieu Giraud's avatar
Mathieu Giraud committed
747
  info = string_of_int(Vend + FIRST_POS) + " " + string_of_int(Jstart + FIRST_POS) ;
Mikaël Salson's avatar
Mikaël Salson committed
748
749
750
751
  finishSegmentation();
}


752
void FineSegmenter::FineSegmentD(Germline *germline){
Mikaël Salson's avatar
Mikaël Salson committed
753
754
755
756
  
  if (segmented){
    
    int end = (int) string::npos ;
Mathieu Giraud's avatar
Mathieu Giraud committed
757
    int tag_D;
Mikaël Salson's avatar
Mikaël Salson committed
758
759
760
761
    int length = 0 ;
    int begin = 0;
    
    // Create a zone where to look for D, adding at most EXTEND_D_ZONE nucleotides at each side
Mathieu Giraud's avatar
Mathieu Giraud committed
762
    int l = Vend - EXTEND_D_ZONE;
Mikaël Salson's avatar
Mikaël Salson committed
763
764
765
    if (l<0) 
      l=0 ;

Mathieu Giraud's avatar
Mathieu Giraud committed
766
    int r = Jstart + EXTEND_D_ZONE;
Mikaël Salson's avatar
Mikaël Salson committed
767
768
769
770
771
772
773

    if (r > (int)getSequence().sequence.length()) 
      r = getSequence().sequence.length();
      
    string str = getSequence().sequence.substr(l, r-l);

    // Align
774
    end = align_against_collection(str, germline->rep_4, false, true, &tag_D, &del_D_right, &del_D_left, &begin,
Mikaël Salson's avatar
Mikaël Salson committed
775
776
				&length, &score_D, segment_cost);
    
Mathieu Giraud's avatar
Mathieu Giraud committed
777
    best_D = tag_D;
Mikaël Salson's avatar
Mikaël Salson committed
778
    
Mathieu Giraud's avatar
Mathieu Giraud committed
779
780
    Dstart = l + begin;
    Dend = l + end;
Mikaël Salson's avatar
Mikaël Salson committed
781
782
	
    string seq = getSequence().sequence;
783
784
785
786
787
788
789
790
791
792


    // recompute remaining length for D
    length = germline->rep_4.sequence(best_D).length() - del_D_right - del_D_left;

    if (length < MIN_D_LENGTH)
      return ;


    dSegmented=true;
Mikaël Salson's avatar
Mikaël Salson committed
793
794
    
    //overlap VD
Mathieu Giraud's avatar
Mathieu Giraud committed
795
    if(Dstart-Vend <=0){
Mikaël Salson's avatar
Mikaël Salson committed
796
      int b_r, b_l;
Mathieu Giraud's avatar
Mathieu Giraud committed
797
798
799
      int overlap=Vend-Dstart+1;
      string seq_left = seq.substr(0, Vend+1);
      string seq_right = seq.substr(Dstart, Dend-Dstart+1);
Mikaël Salson's avatar
Mikaël Salson committed
800
801

      best_align(overlap, seq_left, seq_right, 
802
		 germline->rep_5.sequence(best_V), germline->rep_4.sequence(best_D), &b_r,&b_l, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
803
804

      // Trim V
Mathieu Giraud's avatar
Mathieu Giraud committed
805
      Vend -= b_l;
Mikaël Salson's avatar
Mikaël Salson committed
806
807
808
      del_V += b_l;

      // Trim D
Mathieu Giraud's avatar
Mathieu Giraud committed
809
      Dstart += b_r;
Mikaël Salson's avatar
Mikaël Salson committed
810
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
811
    seg_N1 = seq.substr(Vend+1, Dstart-Vend-1) ; // From Vend+1 to Dstart-1
Mikaël Salson's avatar
Mikaël Salson committed
812
813
    
    //overlap DJ
Mathieu Giraud's avatar
Mathieu Giraud committed
814
    if(Jstart-Dend <=0){
Mikaël Salson's avatar
Mikaël Salson committed
815
      int b_r, b_l;
Mathieu Giraud's avatar
Mathieu Giraud committed
816
817
818
      int overlap=Dend-Jstart+1;
      string seq_right = seq.substr(Dstart, Dend-Dstart+1);
      string seq_left = seq.substr(Jstart, seq.length()-Jstart);
Mikaël Salson's avatar
Mikaël Salson committed
819
820

      best_align(overlap, seq_left, seq_right, 
821
		 germline->rep_4.sequence(best_D), germline->rep_3.sequence(best_J), &b_r,&b_l, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
822
823

      // Trim D
Mathieu Giraud's avatar
Mathieu Giraud committed
824
      Dend -= b_l;
Mikaël Salson's avatar
Mikaël Salson committed
825
826

      // Trim J
Mathieu Giraud's avatar
Mathieu Giraud committed
827
      Jstart += b_r;
Mikaël Salson's avatar
Mikaël Salson committed
828
829
830
      del_J += b_r;

    }
Mathieu Giraud's avatar
Mathieu Giraud committed
831
    seg_N2 = seq.substr(Dend+1, Jstart-Dend-1) ; // From Dend+1 to right-1
832
    code = germline->rep_5.label(best_V) +
Mikaël Salson's avatar
Mikaël Salson committed
833
834
835
836
    " "+ string_of_int(del_V) + 
    "/" + seg_N1 + 
    
    "/" + string_of_int(del_D_left) +
837
    " " + germline->rep_4.label(best_D) +
Mikaël Salson's avatar
Mikaël Salson committed
838
839
840
841
    " " + string_of_int(del_D_right) +
    
    "/" + seg_N2 +
    "/" + string_of_int(del_J) +
842
    " " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
843

Mikaël Salson's avatar
Mikaël Salson committed
844
    stringstream code_s;
845
    code_s << germline->rep_5.label(best_V) 
Mikaël Salson's avatar
Mikaël Salson committed
846
847
848
849
    << " -" << string_of_int(del_V) << "/" 
    << seg_N1.size()
    
    << "/-" << string_of_int(del_D_left) 
850
    << " " << germline->rep_4.label(best_D) 
Mikaël Salson's avatar
Mikaël Salson committed
851
852
853
854
    << " -" << string_of_int(del_D_right) << "/"
    
    << seg_N2.size()
    << "/-" << string_of_int(del_J) 
855
    << " " << germline->rep_3.label(best_J);
Mikaël Salson's avatar
Mikaël Salson committed
856
857
858
    code_short=code_s.str();
    
    
859
860
861
    code_light = germline->rep_5.label(best_V) +
    "/ " + germline->rep_4.label(best_D) +
    "/ " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
862
863
864
865
    
    finishSegmentationD();
  }
}
Mikaël Salson's avatar
Mikaël Salson committed
866

Marc Duez's avatar
Marc Duez committed
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
void FineSegmenter::findCDR3(){
    string str = getSequence().sequence;
    
    list<string> codon_start;
    codon_start.push_back("TGT");
    codon_start.push_back("TGC");
    
    list<string> codon_end;
    codon_end.push_back("TTT");
    codon_end.push_back("TTC");
    codon_end.push_back("TGG");
    
    list<int> p_start;
    list<int> p_end;

    size_t loc;
    std::list<string>::const_iterator it;
    for (it = codon_start.begin(); it != codon_start.end(); ++it) {//filter 1 : start codon must be in V
        loc = 0;
886
        while ( loc != string::npos && loc < (size_t)Vend){
Marc Duez's avatar
Marc Duez committed
887
            loc = str.find(*it, loc+3);
888
            if (loc != string::npos && loc < (size_t)Vend) {
Marc Duez's avatar
Marc Duez committed
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
                p_start.push_front(loc);
            }
        }
    }

    for (it = codon_end.begin(); it != codon_end.end(); ++it) {//filter 2 : end codon must be in J
        loc = Jstart;
        while ( loc != string::npos){
            loc = str.find(*it, loc+3);
            if (loc != string::npos) {
                p_end.push_back(loc);
            }
        }
    }

904
905
    CDR3start = -1;
    CDR3end = -1;
Marc Duez's avatar
Marc Duez committed
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
    
    std::list<int>::const_iterator it1;
    for (it1 = p_start.begin(); it1 != p_start.end(); ++it1) {
        
        std::list<int>::const_iterator it2;
        for (it2 = p_end.begin(); it2 != p_end.end(); ++it2) {
            
            if ( (*it2-*it1)%3 == 0){       //filter 3 : start/stop codon must be seprated by a multiple of 3
                
                if ( fabs((*it2-*it1)-36 ) < fabs((CDR3end-CDR3start)-36) ){ //filter 4 : cdr3 length must be close to 12 AA
                    CDR3start = *it1;
                    CDR3end = *it2;
                }
            }
        }
    }
    
}

925
void FineSegmenter::toJsonList(JsonList *seg){
Tatiana Rocher's avatar
Tatiana Rocher committed
926

Mathieu Giraud's avatar
Mathieu Giraud committed
927
  if (isSegmented()) {
928
929
930
    seg->add("5", segmented_germline->rep_5.label(best_V));
    seg->add("5start", 0);
    seg->add("5end", Vend);
Mathieu Giraud's avatar
Mathieu Giraud committed
931
932
    
    if (score_D.size()>0){
933
934
935
      seg->add("4", segmented_germline->rep_4.label(best_D));
      seg->add("4start", Dstart);
      seg->add("4end", Dend);
Mikaël Salson's avatar
Mikaël Salson committed
936
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
937
    
938
939
    seg->add("3", segmented_germline->rep_3.label(best_J));
    seg->add("3start", Jstart);
940
941
942

    if (CDR3start >= 0)
      {
Marc Duez's avatar
Marc Duez committed
943
944
945
946
    JsonList *json_cdr;
    json_cdr=new JsonList();
    json_cdr->add("start", CDR3start);
    json_cdr->add("stop", CDR3end);
947
    seg->add("cdr3", *json_cdr);
948
      }
Tatiana Rocher's avatar
Tatiana Rocher committed
949

950
951
  }
}
952

953
954
void KmerSegmenter::toJsonList(JsonList *seg)
{
955
956
    int sequenceSize = sequence.size();

957
    if (evalue > NO_LIMIT_VALUE)
958
      seg->add("_evalue", scientific_string_of_double(evalue));
959
960
961
962
    if (evalue_left > NO_LIMIT_VALUE)
      seg->add("_evalue_left", scientific_string_of_double(evalue_left));
    if (evalue_right > NO_LIMIT_VALUE)
      seg->add("_evalue_right", scientific_string_of_double(evalue_right));
963

964
965
966
967
    JsonList *json_affectValues;
    json_affectValues=new JsonList();
    json_affectValues->add("start", 0);
    json_affectValues->add("stop", sequenceSize); 
968
969
    json_affectValues->add("seq", getKmerAffectAnalyser()->toStringValues());
    seg->add("affectValues", *json_affectValues);
Tatiana Rocher's avatar
Tatiana Rocher committed
970
971
      

972
    JsonList *json_affectSigns;
973
974
975
    json_affectSigns=new JsonList();
    json_affectSigns->add("start", 0);
    json_affectSigns->add("stop", sequenceSize); 
976
977
    json_affectSigns->add("seq", getKmerAffectAnalyser()->toStringSigns());
    seg->add("affectSigns", *json_affectSigns);
978
979
980

    delete json_affectValues;
    delete json_affectSigns;
Mikaël Salson's avatar
Mikaël Salson committed
981
982
983
}