segment.cpp 32.3 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2
  This file is part of Vidjil <http://www.vidjil.org>
3
  Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016 by Bonsai bioinformatics
4
5
6
7
8
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9
10
11
12
13
14
15
16
17
18
19
20
21
22

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/
Mikaël Salson's avatar
Mikaël Salson committed
23
#include <algorithm>    // std::sort
Mikaël Salson's avatar
Mikaël Salson committed
24
25
26
27
#include <cassert>
#include "segment.h"
#include "tools.h"
#include "affectanalyser.h"
Mikaël Salson's avatar
Mikaël Salson committed
28
#include <sstream>
29
#include <cstring>
Marc Duez's avatar
Marc Duez committed
30
#include <string>
Mikaël Salson's avatar
Mikaël Salson committed
31

32
#define NO_FORBIDDEN_ID (-1)
33

34
35
36
AlignBox::AlignBox(string _key) {
  key = _key;

37
38
39
40
41
42
43
44
45
46
47
48
49
50
  del_left = 0 ;
  start = 0 ;
  end = 0 ;
  del_right = 0 ;

  ref_nb = 0 ;
  ref = "";
  ref_label = "";
}

string AlignBox::getSequence(string sequence) {
  return sequence.substr(start, end-start+1);
}

51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
void AlignBox::addToJson(json &seg) {

  seg[key] = ref_label;

  if (key == "5")
    {
      seg[key+"end"] = end;
      seg[key+"del"] = del_right;
    }
  else if (key == "3")
    {
      seg[key+"del"] = del_left;
      seg[key+"start"] = start;
    }
  else
    {
      seg[key+"delLeft"] = del_left;
      seg[key+"start"] = start;
      seg[key+"end"] = end;
      seg[key+"delRight"] = del_right;
    }
}
73

74
75
76
77
78
79
80
81
82
83
84
ostream &operator<<(ostream &out, const AlignBox &box)
{
  out << "[/" << box.del_left << " " ;
  out << "@" << box.start << " " ;
  out << box.ref_label << "(" << box.ref_nb << ") " ;
  out << "@" << box.end << " " ;
  out << box.del_right << "/]" ;

  return out ;
}

85
86
87
88
89
90
91
92
93
94
string codeFromBoxes(vector <AlignBox*> boxes, string sequence)
{
  string code = "";

  int n = boxes.size();

  for (int i=0; i<n; i++) {

    if (i>0) {
      code += " " + string_of_int(boxes[i-1]->del_right) + "/"
Mathieu Giraud's avatar
Mathieu Giraud committed
95
        // From box_left->end + 1 to box_right->start - 1, both positions included
96
97
98
99
100
101
102
103
104
        + sequence.substr(boxes[i-1]->end + 1, boxes[i]->start - boxes[i-1]->end - 1)
        + "/" + string_of_int(boxes[i]->del_left) + " " ;
    }

    code += boxes[i]->ref_label ;
  }

  return code;
}
105

106

Mathieu Giraud's avatar
Mathieu Giraud committed
107
Segmenter::~Segmenter() {}
Mikaël Salson's avatar
Mikaël Salson committed
108

Mathieu Giraud's avatar
Mathieu Giraud committed
109
Sequence Segmenter::getSequence() const {
Mikaël Salson's avatar
Mikaël Salson committed
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
  Sequence s ;
  s.label_full = info ;
  s.label = label + " " + (reversed ? "-" : "+");
  s.sequence = revcomp(sequence, reversed);

  return s ;
}

string Segmenter::getJunction(int l) const {
  assert(isSegmented());

  int start = (getLeft() + getRight())/2 - l/2;
  
  if (start < 0 or start + l > (int)sequence.size())  // TODO: +l ou +l-1 ?
    return "" ;

  return getSequence().sequence.substr(start, l);
}

int Segmenter::getLeft() const {
130
  return box_V->end;
Mikaël Salson's avatar
Mikaël Salson committed
131
132
133
}
  
int Segmenter::getRight() const {
134
  return box_J->start;
Mikaël Salson's avatar
Mikaël Salson committed
135
136
137
}

int Segmenter::getLeftD() const {
138
  return box_D->start;
Mikaël Salson's avatar
Mikaël Salson committed
139
140
141
}
  
int Segmenter::getRightD() const {
142
  return box_D->end;
Mikaël Salson's avatar
Mikaël Salson committed
143
144
145
146
147
148
149
150
151
152
153
154
155
156
}

bool Segmenter::isReverse() const {
  return reversed;
}

bool Segmenter::isSegmented() const {
  return segmented;
}

bool Segmenter::isDSegmented() const {
  return dSegmented;
}

157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175

// E-values

void Segmenter::checkLeftRightEvaluesThreshold(double threshold, int strand)
{
  if (threshold == NO_LIMIT_VALUE)
    return ;

  if (evalue_left >= threshold && evalue_right >= threshold)
    because = UNSEG_TOO_FEW_ZERO ;
  else if ((strand == 1 ? evalue_left : evalue_right) >= threshold)
    because = UNSEG_TOO_FEW_V ;
  else if ((strand == 1 ? evalue_right : evalue_left) >= threshold)
    because = UNSEG_TOO_FEW_J ;
  else if (evalue >= threshold) // left and right are <= threshold, but their sum is > threshold
    because = UNSEG_TOO_FEW_ZERO ;
}


Mikaël Salson's avatar
Mikaël Salson committed
176
177
178
179
180
181
182
183
// Chevauchement

string Segmenter::removeChevauchement()
{
  assert(isSegmented());
  
  string chevauchement = "" ;

184
  if (box_V->end >= box_J->start)
Mikaël Salson's avatar
Mikaël Salson committed
185
    {
186
187
188
189
      int middle = (box_V->end + box_J->start) / 2 ;
      chevauchement = " !ov " + string_of_int (box_V->end - box_J->start + 1);
      box_V->end = middle ;
      box_J->start = middle+1 ;
Mikaël Salson's avatar
Mikaël Salson committed
190
191
192
193
194
195
196
197
198
199
200
201
202
203
    }

  return chevauchement ;
}

// Prettyprint


bool Segmenter::finishSegmentation() 
{
  assert(isSegmented());
  
  string seq = getSequence().sequence;
    
204
205
206
207
208
  seg_V = seq.substr(0, box_V->end+1) ;
  seg_N = seq.substr(box_V->end+1, box_J->start-box_V->end-1) ;  // Twice computed for FineSegmenter, but only once in KmerSegmenter !
  seg_J = seq.substr(box_J->start) ;
  box_D->start=0;
  box_D->end=0;
Mikaël Salson's avatar
Mikaël Salson committed
209
210
211
212
213
214
215
216
217
218
219
220
221

  info = "VJ \t" + string_of_int(FIRST_POS) + " " + info + " " + string_of_int(seq.size() - 1 + FIRST_POS) ;
  info += "\t" + code ;

  info = (reversed ? "- " : "+ ") + info ;

  return true ;
}

bool Segmenter::finishSegmentationD() 
{
  string seq = getSequence().sequence;

222
223
224
225
  seg_V = seq.substr(0, box_V->end+1) ; // From pos. 0 to box_V->end
  seg_J = seq.substr(box_J->start) ;
  seg_N = seq.substr(box_V->end+1, box_J->start-box_V->end-1) ;  // Twice computed for FineSegmenter, but only once in KmerSegmenter !
  seg_D  = seq.substr(box_D->start, box_D->end-box_D->start+1) ; // From Dstart to Dend
Mikaël Salson's avatar
Mikaël Salson committed
226
  
227
228
229
230
  info = "VDJ \t0 " + string_of_int(box_V->end) +
                " " + string_of_int(box_D->start) +
		" " + string_of_int(box_D->end) +
		" " + string_of_int(box_J->start) +
Mikaël Salson's avatar
Mikaël Salson committed
231
232
233
234
235
236
237
238
239
		" " + string_of_int(seq.size()-1+FIRST_POS) ;
		
  info += "\t" + code ;
  
  info = (reversed ? "- " : "+ ") + info ;

  return true ;
}

240
241
string Segmenter::getInfoLine() const
{
242
  string s = "" ;
243
244
245
246
247

  s += (segmented ? "" : "! ") + info ;
  s += " " + info_extra ;
  s += " " + segmented_germline->code ;
  s += " " + string(segmented_mesg[because]) ;
248
249
250
251

  if (evalue > NO_LIMIT_VALUE)
    s += " " + scientific_string_of_double(evalue);

252
253
254
255
256
  if (evalue_left > NO_LIMIT_VALUE)
    s += " " + scientific_string_of_double(evalue_left);
  if (evalue_right > NO_LIMIT_VALUE)
    s += "/" + scientific_string_of_double(evalue_right);

257
258
259
260
  if (CDR3start > 0)
    s += " {CDR3: " + string_of_int(CDR3start) + "(" + string_of_int(CDR3end-CDR3start+1) + ")" + string_of_int(CDR3end) + " "
      + "up"[CDR3productive] + " " + JUNCTIONaa + " " + CDR3nuc + "}";

261
262
263
  return s ;
}

264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
string KmerSegmenter::getInfoLineWithAffects() const
{
   stringstream ss;

   ss << "# "
      << right << setw(3) << score << " "
      << left << setw(30)
      << getInfoLine() ;

   if (getSegmentationStatus() != UNSEG_TOO_SHORT)
     ss << getKmerAffectAnalyser()->toString();

   return ss.str();
}


Mikaël Salson's avatar
Mikaël Salson committed
280
281
ostream &operator<<(ostream &out, const Segmenter &s)
{
282
  out << ">" << s.label << " " ;
283
  out << s.getInfoLine() << endl;
Mikaël Salson's avatar
Mikaël Salson committed
284
285
286
287
288
289
290
291
292

  if (s.segmented)
    {
      out << s.seg_V << endl ;
      out << s.seg_N << endl ;
      out << s.seg_J << endl ;
    }
  else
    {
293
      out << s.getSequence().sequence << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
294
295
296
297
298
299
300
301
    }

  return out ;
}


// KmerSegmenter (Cheap)

302
303
KmerSegmenter::KmerSegmenter() { kaa = 0 ; }

304
KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline, double threshold, int multiplier)
Mikaël Salson's avatar
Mikaël Salson committed
305
{
306
307
308
309
  box_V = new AlignBox();
  box_D = new AlignBox();
  box_J = new AlignBox();

310
311
312
  CDR3start = -1;
  CDR3end = -1;

Mikaël Salson's avatar
Mikaël Salson committed
313
314
315
  label = seq.label ;
  sequence = seq.sequence ;
  info = "" ;
316
  info_extra = "seed";
Mikaël Salson's avatar
Mikaël Salson committed
317
  segmented = false;
318
  segmented_germline = germline ;
319
  system = germline->code; // useful ?
320
  reversed = false;
321
  because = NOT_PROCESSED ; // Cause of unsegmentation
322
  score = 0 ;
323
  evalue = NO_LIMIT_VALUE;
324
325
  evalue_left = NO_LIMIT_VALUE;
  evalue_right = NO_LIMIT_VALUE;
326

327
  int s = (size_t)germline->index->getS() ;
Mikaël Salson's avatar
Mikaël Salson committed
328
  int length = sequence.length() ;
Mikaël Salson's avatar
Mikaël Salson committed
329

330
  if (length < s) 
Mikaël Salson's avatar
Mikaël Salson committed
331
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
332
333
      because = UNSEG_TOO_SHORT;
      kaa = NULL;
334
      return ;
Mikaël Salson's avatar
Mikaël Salson committed
335
336
    }
 
337
  kaa = new KmerAffectAnalyser(*(germline->index), sequence);
Mikaël Salson's avatar
Mikaël Salson committed
338
339
  
  // Check strand consistency among the affectations.
Mikaël Salson's avatar
Mikaël Salson committed
340
341
342
343
344
345
346
347
348
  int strand;
  int nb_strand[2] = {0,0};     // In cell 0 we'll put the number of negative
                                // strand, while in cell 1 we'll put the
                                // positives
  for (int i = 0; i < kaa->count(); i++) { 
    KmerAffect it = kaa->getAffectation(i);
    if (! it.isAmbiguous() && ! it.isUnknown()) {
      strand = affect_strand(it.affect);
      nb_strand[(strand + 1) / 2] ++; // (strand+1) / 2 → 0 if strand == -1; 1 if strand == 1
Mikaël Salson's avatar
Mikaël Salson committed
349
350
351
    }
  }

352
353
  score = nb_strand[0] + nb_strand[1] ; // Used only for non-segmented germlines

354
355
  reversed = (nb_strand[0] > nb_strand[1]) ;

356
357
  if ((germline->seg_method == SEG_METHOD_MAX12)
      || (germline->seg_method == SEG_METHOD_MAX1U))
358
359
360
361
    { // Pseudo-germline, MAX12 and MAX1U
      pair <KmerAffect, KmerAffect> max12 ;
      CountKmerAffectAnalyser ckaa(*(germline->index), sequence);

362
363
364
365
366

      set<KmerAffect> forbidden;
      forbidden.insert(KmerAffect::getAmbiguous());
      forbidden.insert(KmerAffect::getUnknown());

367
      if (germline->seg_method == SEG_METHOD_MAX12)
368
369
370
        // MAX12: two maximum k-mers (no unknown)
        {
          max12 = ckaa.max12(forbidden);
371

372
373
374
375
376
377
          if (max12.first.isUnknown() || max12.second.isUnknown())
            {
              because = UNSEG_TOO_FEW_ZERO ;
              return ;
            }
        }
378

379
380
      else
        // MAX1U: the maximum k-mers (no unknown) + unknown
381
        {
382
383
384
385
386
387
388
389
390
          CountKmerAffectAnalyser ckaa(*(germline->index), sequence);
          KmerAffect max = ckaa.max(forbidden);

          if (max.isUnknown())
            {
              because = UNSEG_TOO_FEW_ZERO ;
              return ;
            }
          max12 = make_pair(max, KmerAffect::getUnknown());
391
392
        }

393
394
395
396
397
      pair <KmerAffect, KmerAffect> before_after =  ckaa.sortLeftRight(max12);

      before = before_after.first ;
      after = before_after.second ;

398
399
400
      // This strand computation is only a heuristic, especially for chimera +/- reads
      // Anyway, it allows to gather such reads and their reverse complement into a unique window...
      // ... except when the read is quite different outside the window
401
      strand = reversed ? -1 : 1 ;
402
403
404
405
406
    }

  else
    { // Regular germline

407
  // Test on which strand we are, select the before and after KmerAffects
Mikaël Salson's avatar
Mikaël Salson committed
408
  if (nb_strand[0] == 0 && nb_strand[1] == 0) {
409
    because = UNSEG_TOO_FEW_ZERO ;
410
    return ;
Mikaël Salson's avatar
Mikaël Salson committed
411
412
  } else if (nb_strand[0] > RATIO_STRAND * nb_strand[1]) {
    strand = -1;
413
414
    before = KmerAffect(germline->affect_3, -1); 
    after = KmerAffect(germline->affect_5, -1);
Mikaël Salson's avatar
Mikaël Salson committed
415
416
  } else if (nb_strand[1] > RATIO_STRAND * nb_strand[0]) {
    strand = 1;
417
418
    before = KmerAffect(germline->affect_5, 1); 
    after = KmerAffect(germline->affect_3, 1);    
Mikaël Salson's avatar
Mikaël Salson committed
419
420
  } else {
    // Ambiguous information: we have positive and negative strands
421
422
423
424
425
    // and there is not enough difference to put them apart.
    if (nb_strand[0] + nb_strand[1] >= DETECT_THRESHOLD_STRAND)
      because = UNSEG_STRAND_NOT_CONSISTENT ;
    else
      because = UNSEG_TOO_FEW_ZERO ;
426
    return ;
Mikaël Salson's avatar
Mikaël Salson committed
427
428
  }

429
    } // endif Pseudo-germline
430
 
431
  computeSegmentation(strand, before, after, threshold, multiplier);
Mathieu Giraud's avatar
Mathieu Giraud committed
432
}
Mikaël Salson's avatar
Mikaël Salson committed
433

Mathieu Giraud's avatar
Mathieu Giraud committed
434
KmerSegmenter::~KmerSegmenter() {
435
436
  if (kaa)
    delete kaa;
437
438
439
440

  delete box_V;
  delete box_D;
  delete box_J;
441
442
}

443
444
KmerMultiSegmenter::KmerMultiSegmenter(Sequence seq, MultiGermline *multigermline, ostream *out_unsegmented,
                                       double threshold, int nb_reads_for_evalue)
445
{
446
447
  bool found_seg = false ; // Found a segmentation
  double best_evalue_seg = NO_LIMIT_VALUE ; // Best evalue, segmented sequences
448
  int best_score_unseg = 0 ; // Best score, unsegmented sequences
449
  the_kseg = NULL;
450
  multi_germline = multigermline;
451
  threshold_nb_expected = threshold;
452
453
454

  // E-value multiplier
  int multiplier = multi_germline->germlines.size() * nb_reads_for_evalue;
455
  
456
457
458
459
460
  // Iterate over the germlines
  for (list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it)
    {
      Germline *germline = *it ;

461
      KmerSegmenter *kseg = new KmerSegmenter(seq, germline, threshold, multiplier);
462
      bool keep_seg = false;
463

464
465
466
      if (out_unsegmented)
        {
          // Debug, display k-mer affectation and segmentation result for this germline
467
          *out_unsegmented << kseg->getInfoLineWithAffects() << endl ;
468
469
        }

470
471
      // Always remember the first kseg
      if (the_kseg == NULL)
472
        keep_seg = true;
473
      
474
      if (kseg->isSegmented())
475
476
        {
          // Yes, it is segmented
477
          // Should we keep the kseg ?
478
          if (!found_seg || (kseg->evalue < best_evalue_seg))
479
            {
480
              keep_seg = true;
481
482
483
              best_evalue_seg = kseg->evalue ;

              found_seg = true;
484
            }
485
        }
486
487
488
489
490
491
492
      else
        {
          // It is not segmented
          // Should we keep the kseg (with the unsegmentation cause) ?
            if (kseg->score > best_score_unseg)
            {              
              best_score_unseg = kseg->score ;
493
              if (!found_seg)
494
495
496
497
                keep_seg = true;
            }
        }
      
498
      if (keep_seg) {
499
500
        if (the_kseg)
          delete the_kseg;
501
502
503
504
        the_kseg = kseg;
      } else {
        delete kseg;
      }
505
506
507
    } // end for (Germlines)
}

508
KmerMultiSegmenter::~KmerMultiSegmenter() {
509
510
  if (the_kseg)
    delete the_kseg;
Mathieu Giraud's avatar
Mathieu Giraud committed
511
512
}

513
514
void KmerSegmenter::computeSegmentation(int strand, KmerAffect before, KmerAffect after,
                                        double threshold, int multiplier) {
515
  // Try to segment, computing 'box_V->end' and 'box_J->start'
516
517
  // If not segmented, put the cause of unsegmentation in 'because'

518
  affect_infos max;
519
  max = kaa->getMaximum(before, after);
Mikaël Salson's avatar
Mikaël Salson committed
520

521
522
523
  // We did not find a good segmentation point
  if (!max.max_found) {
    // We labeled it detected if there were both enough affect_5 and enough affect_3
524
525
526
527
528
529
530
531
532
    bool detected_before = (max.nb_before_left + max.nb_before_right >= DETECT_THRESHOLD);
    bool detected_after = (max.nb_after_left + max.nb_after_right >= DETECT_THRESHOLD);

    if (detected_before && detected_after)
      because = UNSEG_AMBIGUOUS ;
    else if ((strand == 1 && detected_before) || (strand == -1 && detected_after))
      because = UNSEG_TOO_FEW_J ;
    else if ((strand == 1 && detected_after) || (strand == -1 && detected_before))
      because = UNSEG_TOO_FEW_V ;
533
    else
534
      because = UNSEG_TOO_FEW_ZERO ;
535
536
537
538
539

    return ;
  }


540
  // E-values
541
542
543
  pair <double, double> pvalues = kaa->getLeftRightProbabilityAtLeastOrAbove();
  evalue_left = pvalues.first * multiplier ;
  evalue_right = pvalues.second * multiplier ;
544
  evalue = evalue_left + evalue_right ;
545

546
  checkLeftRightEvaluesThreshold(threshold, strand);
547

548
  if (because != NOT_PROCESSED)
549
550
    return ;

551
552
   // There was a good segmentation point

553
554
   box_V->end = max.first_pos_max;
   box_J->start = max.last_pos_max + 1;
555
   if (strand == -1) {
556
557
558
     int tmp = sequence.size() - box_V->end - 1;
     box_V->end = sequence.size() - box_J->start - 1;
     box_J->start = tmp;
559
560
561
562
563
564
   }

  // Yes, it is segmented
  segmented = true;
  because = reversed ? SEG_MINUS : SEG_PLUS ;

565
  info = string_of_int(box_V->end + FIRST_POS) + " " + string_of_int(box_J->start + FIRST_POS)  ;
566
567
568
569
570
571

  // removeChevauchement is called once info was already computed: it is only to output info_extra
  info_extra += removeChevauchement();
  finishSegmentation();

  return ;
Mathieu Giraud's avatar
Mathieu Giraud committed
572
}
Mikaël Salson's avatar
Mikaël Salson committed
573

574
KmerAffectAnalyser *KmerSegmenter::getKmerAffectAnalyser() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
575
576
  return kaa;
}
Mikaël Salson's avatar
Mikaël Salson committed
577

578
int Segmenter::getSegmentationStatus() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
579
  return because;
Mikaël Salson's avatar
Mikaël Salson committed
580
581
}

582
583
584
585
586
void Segmenter::setSegmentationStatus(int status) {
  because = status;
  segmented = (status == SEG_PLUS || status == SEG_MINUS);
}

Mikaël Salson's avatar
Mikaël Salson committed
587
588
// FineSegmenter

589

590
string check_and_resolve_overlap(string seq, int seq_begin, int seq_end,
591
592
                                 AlignBox *box_left, AlignBox *box_right,
                                 Cost segment_cost)
Mikaël Salson's avatar
Mikaël Salson committed
593
{
594
  // Overlap size
595
  int overlap = box_left->end - box_right->start + 1;
596
597
598

  if (overlap > 0)
    {
599
600
      string seq_left = seq.substr(seq_begin, box_left->end - seq_begin + 1);
      string seq_right = seq.substr(box_right->start, seq_end - box_right->start + 1);
601

Mikaël Salson's avatar
Mikaël Salson committed
602
603
604
605
      int score_r[overlap+1];
      int score_l[overlap+1];
      
      //LEFT
606
      DynProg dp_l = DynProg(seq_left, box_left->ref,
Mikaël Salson's avatar
Mikaël Salson committed
607
			   DynProg::Local, segment_cost);
608
      score_l[0] = dp_l.compute();
609

Mikaël Salson's avatar
Mikaël Salson committed
610
611
      
      //RIGHT
612
      // reverse right sequence
613
      string ref_right=string(box_right->ref.rbegin(), box_right->ref.rend());
614
615
616
      seq_right=string(seq_right.rbegin(), seq_right.rend());


617
      DynProg dp_r = DynProg(seq_right, ref_right,
Mikaël Salson's avatar
Mikaël Salson committed
618
			   DynProg::Local, segment_cost);
619
      score_r[0] = dp_r.compute();
620
621
622
623
624
625
626
627
628



      int trim_l[overlap+1];
      int trim_r[overlap+1];

      for(int i=0; i<=overlap; i++) {
        score_l[i] = i < seq_left.size()  ? dp_l.best_score_on_i(seq_left.size()  - i, trim_l + i) : MINUS_INF ;
        score_r[i] = i < seq_right.size() ? dp_r.best_score_on_i(seq_right.size() - i, trim_r + i) : MINUS_INF ;
Mikaël Salson's avatar
Mikaël Salson committed
629
      }
630

631
632
633
634
635
636

// #define DEBUG_OVERLAP
#ifdef DEBUG_OVERLAP
      cout << dp_l ;
      cout << dp_r ;

637
638
      cout << "seq:" << seq_left << "\t\t" << seq_right << endl;
      cout << "ref:" << ref_left << "\t\t" << ref_right << endl;
639
640
641
642
643
644
645
646
      for(int i=0; i<=overlap; i++)
        cout << i << "  left: " << score_l[i] << "/" << trim_l[i] << "     right: " << score_r[i] << "/" << trim_r[i] << endl;
#endif

      int score = MINUS_INF;
      int best_i = 0 ;
      int best_j = 0 ;

Mikaël Salson's avatar
Mikaël Salson committed
647

648
      // Find (i, j), with i+j >= overlap,
649
      // maximizing score_l[j] + score_r[i]
Mikaël Salson's avatar
Mikaël Salson committed
650
651
      for (int i=0; i<=overlap; i++){
	for (int j=overlap-i; j<=overlap; j++){
652
          int score_ij = score_l[i] + score_r[j];
653

654
655
656
	  if (score_ij > score) {
            best_i = i ;
            best_j = j ;
657
658
            box_left->del_right = box_left->ref.size() - trim_l[i];
	    box_right->del_left = box_right->ref.size() - trim_r[j];
659
	    score = score_ij;
Mikaël Salson's avatar
Mikaël Salson committed
660
661
662
	  }
	}
      }
663

664
665
      box_left->end -= best_i ;
      box_right->start += best_j ;
666
667

#ifdef DEBUG_OVERLAP
668
      cout << "overlap: " << overlap << ", " << "best_overlap_split: " << score
669
670
           << "    left: " << best_i << "-" << box_left->del_right << " @" << box_left->end
           << "    right:" << best_j << "-" << box_right->del_left << " @" << box_right->start
671
           << endl;
672
#endif
673
674
    } // end if (overlap > 0)

675
676
  // From box_left->end + 1 to box_right->start - 1
  return seq.substr(box_left->end + 1, box_right->start - box_left->end - 1);
Mikaël Salson's avatar
Mikaël Salson committed
677
678
}

Mikaël Salson's avatar
Mikaël Salson committed
679
680
681
682
683
bool comp_pair (pair<int,int> i,pair<int,int> j)
{
  return ( i.first > j.first);
}

684
685
686
687
688

/**
 * Align a read against a collection of sequences, maximizing the alignment 'score'
 * @param read:         the read
 * @param rep:          a collection of reference sequences
689
 * @param reverse_ref:  if true, reverse the reference sequences (VkVk)
690
 * @param reverse_both: if true, reverse both the read and the reference sequences (J segment)
691
 * @param local:        if true, Local alignment (D segment), otherwise LocalEndWithSomeDeletions and onlyBottomTriangle (V and J segments)
692
 * @param box:          the AligBox to fill
693
 * @param segment_cost: the cost used by the dynamic programing
694
 * @post  box is filled
695
696
 */

697
698
void align_against_collection(string &read, Fasta &rep, int forbidden_rep_id,
                              bool reverse_ref, bool reverse_both, bool local,
699
                             AlignBox *box, Cost segment_cost)
Mikaël Salson's avatar
Mikaël Salson committed
700
701
702
{
  
  int best_score = MINUS_INF ;
703
  box->ref_nb = MINUS_INF ;
Mikaël Salson's avatar
Mikaël Salson committed
704
705
706
707
  int best_best_i = (int) string::npos ;
  int best_best_j = (int) string::npos ;
  int best_first_i = (int) string::npos ;
  int best_first_j = (int) string::npos ;
708

Mikaël Salson's avatar
Mikaël Salson committed
709
  vector<pair<int, int> > score_r;
Mikaël Salson's avatar
Mikaël Salson committed
710
711
712

  DynProg::DynProgMode dpMode = DynProg::LocalEndWithSomeDeletions;
  if (local==true) dpMode = DynProg::Local;
713
714
715

  // With reverse_ref, the read is reversed to prevent calling revcomp on each reference sequence
  string sequence_or_rc = revcomp(read, reverse_ref);
Mikaël Salson's avatar
Mikaël Salson committed
716
717
718
  
  for (int r = 0 ; r < rep.size() ; r++)
    {
719
720
721
      if (r == forbidden_rep_id)
        continue;

722
      DynProg dp = DynProg(sequence_or_rc, rep.sequence(r),
Mikaël Salson's avatar
Mikaël Salson committed
723
724
			   dpMode, // DynProg::SemiGlobalTrans, 
			   segment_cost, // DNA
725
726
			   reverse_both, reverse_both,
                          rep.read(r).marked_pos);
727
728
729

      bool onlyBottomTriangle = !local ;
      int score = dp.compute(onlyBottomTriangle, BOTTOM_TRIANGLE_SHIFT);
Mikaël Salson's avatar
Mikaël Salson committed
730
      
Mikaël Salson's avatar
Mikaël Salson committed
731
732
733
734
735
736
737
738
739
740
741
      if (local==true){ 
	dp.backtrack();
      }
      
      if (score > best_score)
	{
	  best_score = score ;
	  best_best_i = dp.best_i ;
	  best_best_j = dp.best_j ;
	  best_first_i = dp.first_i ;
	  best_first_j = dp.first_j ;
742
743
	  box->ref_nb = r ;
	  box->ref_label = rep.label(r) ;
744
745
746
747

          if (!local)
            dp.backtrack();
          box->marked_pos = dp.marked_pos_i ;
Mikaël Salson's avatar
Mikaël Salson committed
748
	}
Mikaël Salson's avatar
Mikaël Salson committed
749
750
	
	score_r.push_back(make_pair(score, r));
Mathieu Giraud's avatar
Mathieu Giraud committed
751
752
753
754
755
756

	// #define DEBUG_SEGMENT      

#ifdef DEBUG_SEGMENT	
	cout << rep.label(r) << " " << score << " " << dp.best_i << endl ;
#endif
Mikaël Salson's avatar
Mikaël Salson committed
757
758

    }
Mikaël Salson's avatar
Mikaël Salson committed
759
    sort(score_r.begin(),score_r.end(),comp_pair);
Mikaël Salson's avatar
Mikaël Salson committed
760

761
762
763
764
  box->ref = rep.sequence(box->ref_nb);
  box->del_right = reverse_both ? best_best_j : box->ref.size() - best_best_j - 1;
  box->del_left = best_first_j;
  box->start = best_first_i;
Mikaël Salson's avatar
Mikaël Salson committed
765
  
766
  box->score = score_r;
Mathieu Giraud's avatar
Mathieu Giraud committed
767
768

#ifdef DEBUG_SEGMENT	
769
  cout << "best: " << box->ref_label << " " << best_score ;
770
  cout << "del/del2/begin:" << (box->del_right) << "/" << (box->del_left) << "/" << (box->start) << endl;
Mathieu Giraud's avatar
Mathieu Giraud committed
771
772
  cout << endl;
#endif
773
774
775
776

  if (reverse_ref)
    // Why -1 here and +1 in dynprog.cpp /// best_i = m - best_i + 1 ;
    best_best_i = read.length() - best_best_i - 1 ;
777
778

  box->end = best_best_i ;
Mikaël Salson's avatar
Mikaël Salson committed
779
780
781
782
783
784
785
}

string format_del(int deletions)
{
  return deletions ? *"(" + string_of_int(deletions) + " del)" : "" ;
}

786
FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c,  double threshold, int multiplier)
Mikaël Salson's avatar
Mikaël Salson committed
787
{
788
789
790
  box_V = new AlignBox("5");
  box_D = new AlignBox("4");
  box_J = new AlignBox("3");
791

792
793
  segmented = false;
  dSegmented = false;
794
  because = NOT_PROCESSED ;
795
  segmented_germline = germline ;
796
  info_extra = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
797
798
799
  label = seq.label ;
  sequence = seq.sequence ;
  segment_cost=segment_c;
800
  evalue = NO_LIMIT_VALUE;
801
802
  evalue_left = NO_LIMIT_VALUE;
  evalue_right = NO_LIMIT_VALUE;
Mikaël Salson's avatar
Mikaël Salson committed
803

804
805
  CDR3start = -1;
  CDR3end = -1;
806

807
808
809
  bool reverse_V = false ;
  bool reverse_J = false ;

810
  if ((germline->seg_method == SEG_METHOD_MAX12) || (germline->seg_method == SEG_METHOD_MAX1U))
811
    {
812
      // We check whether this sequence is segmented with MAX12 or MAX1U (with default e-value parameters)
813
      KmerSegmenter *kseg = new KmerSegmenter(seq, germline, THRESHOLD_NB_EXPECTED, 1);
814
      if (kseg->isSegmented())
815
        {
816
817
818
819
820
          reversed = kseg->isReverse();

          KmerAffect left = reversed ? KmerAffect(kseg->after, true) : kseg->before ;
          KmerAffect right = reversed ? KmerAffect(kseg->before, true) : kseg->after ;

821
822
          delete kseg ;

823
824
825
          reverse_V = (left.getStrand() == -1);
          reverse_J = (right.getStrand() == -1);

826
          code = "Unexpected ";
827

828
          code += left.toStringSigns() + germline->index->getLabel(left).basename;
829
          code += "/";
830
          code += right.toStringSigns() + germline->index->getLabel(right).basename;
831
          info_extra += " " + left.toString() + "/" + right.toString() + " (" + code + ")";
832
833
834
835
836

          if (germline->seg_method == SEG_METHOD_MAX1U)
            return ;

          germline->override_rep5_rep3_from_labels(left, right);
837
        }
838
      else
839
840
841
842
        {
          delete kseg ;
          return ;
        }
843
    }
844

845
846
847
848
849
850
851
  // Strand determination, with KmerSegmenter (with default e-value parameters)
  // Note that we use only the 'strand' component
  // When the KmerSegmenter fails, continue with positive strand
  // TODO: flag to force a strand / to test both strands ?

  KmerSegmenter *kseg = new KmerSegmenter(seq, germline, THRESHOLD_NB_EXPECTED, 1);
  reversed = kseg->isReverse();
852
  delete kseg ;
Mikaël Salson's avatar
Mikaël Salson committed
853
  
854
  sequence_or_rc = revcomp(sequence, reversed); // sequence, possibly reversed
855

Mathieu Giraud's avatar
Mathieu Giraud committed
856
857

  /* Segmentation */
858
  align_against_collection(sequence_or_rc, germline->rep_5, NO_FORBIDDEN_ID, reverse_V, reverse_V, false,
859
                                        box_V, segment_cost);
860

861
  align_against_collection(sequence_or_rc, germline->rep_3, NO_FORBIDDEN_ID, reverse_J, !reverse_J, false,
862
                                          box_J, segment_cost);
863
864
865
866
867

  // J was run with '!reverseJ', we copy the box informations from right to left
  // Should this directly be handled in align_against_collection() ?
  box_J->start = box_J->end ;
  box_J->del_left = box_J->del_right;
868

869
  /* E-values */
870
871
  evalue_left  = multiplier * sequence.size() * germline->rep_5.totalSize() * segment_cost.toPValue(box_V->score[0].first);
  evalue_right = multiplier * sequence.size() * germline->rep_3.totalSize() * segment_cost.toPValue(box_J->score[0].first);
872
  evalue = evalue_left + evalue_right ;
873
874

  /* Unsegmentation causes */
875
  if (box_V->end == (int) string::npos)
876
    {
877
      evalue_left = BAD_EVALUE ;
878
    }
879
      
880
  if (box_J->start == (int) string::npos)
881
    {
882
      evalue_right = BAD_EVALUE ;
883
884
    }

885
886
  checkLeftRightEvaluesThreshold(threshold, reversed ? -1 : 1);

887
888
889
  if (because != NOT_PROCESSED)
    {
      segmented = false;
890
      info = " @" + string_of_int (box_V->end + FIRST_POS) + "  @" + string_of_int(box_J->start + FIRST_POS) ;
Mikaël Salson's avatar
Mikaël Salson committed
891
892
      return ;
    }
893
894
895
896

  /* The sequence is segmented */
  segmented = true ;
  because = reversed ? SEG_MINUS : SEG_PLUS ;
897

Mikaël Salson's avatar
Mikaël Salson committed
898
    //overlap VJ
899
  seg_N = check_and_resolve_overlap(sequence_or_rc, 0, sequence_or_rc.length(),
900
                                    box_V, box_J, segment_cost);
901

902
  // Why could this happen ?
903
904
      if (box_J->start>=(int) sequence.length())
	  box_J->start=sequence.length()-1;
Mikaël Salson's avatar
Mikaël Salson committed
905

906
  // seg_N will be recomputed in finishSegmentation()
Mikaël Salson's avatar
Mikaël Salson committed
907

908
  boxes.clear();
909
910
  boxes.push_back(box_V);
  boxes.push_back(box_J);
911
  code = codeFromBoxes(boxes, sequence_or_rc);
Mikaël Salson's avatar
Mikaël Salson committed
912

913
  info = string_of_int(box_V->end + FIRST_POS) + " " + string_of_int(box_J->start + FIRST_POS) ;
Mikaël Salson's avatar
Mikaël Salson committed
914
915
916
  finishSegmentation();
}

917
bool FineSegmenter::FineSegmentD(Germline *germline,
918
                                 AlignBox *box_Y, AlignBox *box_DD, AlignBox *box_Z,
919
                                 int forbidden_id,
920
                                 int extend_DD_on_Y, int extend_DD_on_Z,
921
                                 double evalue_threshold, int multiplier){
922

923
924
    // Create a zone where to look for D, adding some nucleotides on both sides
    int l = box_Y->end - extend_DD_on_Y;
Mikaël Salson's avatar
Mikaël Salson committed
925
926
927
    if (l<0) 
      l=0 ;

928
    int r = box_Z->start + extend_DD_on_Z;
Mikaël Salson's avatar
Mikaël Salson committed
929

930
931
932
933
    string seq = getSequence().sequence; // segmented sequence, possibly rev-comped

    if (r > (int) seq.length())
      r = seq.length();
Mikaël Salson's avatar
Mikaël Salson committed
934
      
935
    string str = seq.substr(l, r-l);
Mikaël Salson's avatar
Mikaël Salson committed
936
937

    // Align
938
    align_against_collection(str, germline->rep_4, forbidden_id, false, false, true,
939
                                           box_DD, segment_cost);
940

941
942
943
944
    box_DD->start += l ;
    box_DD->end += l ;

    float evalue_D = multiplier * (r-l) * germline->rep_4.totalSize() * segment_cost.toPValue(box_DD->score[0].first);
945

946
    if (evalue_D > evalue_threshold)
947
      return false;
948
949
950
951
952

    int save_box_Y_end = box_Y->end ;
    int save_box_Y_del_right = box_Y->del_right ;
    int save_box_Z_del_left = box_Z->del_left;
    int save_box_Z_start = box_Z->start ;
Mikaël Salson's avatar
Mikaël Salson committed
953
954
    
    //overlap VD
955
956
    seg_N1 = check_and_resolve_overlap(seq, 0, box_DD->end,
                                       box_Y, box_DD, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
957
958
    
    //overlap DJ
959
960
    seg_N2 = check_and_resolve_overlap(seq, box_DD->start, seq.length(),
                                       box_DD, box_Z, segment_cost);
961

962
963
964
965
966
967
968
969
970
    // Realign D to see whether the score is enough
    DynProg dp = DynProg(box_DD->getSequence(seq), box_DD->ref,
                         DynProg::SemiGlobal, segment_cost, false, false);
    int score_new = dp.compute();

    float evalue_DD_new = multiplier * (box_DD->end - box_DD->start + 1) * box_DD->ref.size() * segment_cost.toPValue(score_new);

    if (evalue_DD_new > evalue_threshold)
      {
971
972
973
974
975
976
        // Restore box_Y and box_Z
        box_Y->end =  save_box_Y_end;
        box_Y->del_right = save_box_Y_del_right;
        box_Z->del_left = save_box_Z_del_left;
        box_Z->start = save_box_Z_start;

977
978
979
        return false ;
      }

980
981
982
    return true;
}

983
984
void FineSegmenter::FineSegmentD(Germline *germline, bool several_D,
                                 double evalue_threshold, int multiplier){
985
986
987
988

  if (segmented){

    dSegmented = FineSegmentD(germline,
989
                              box_V, box_D, box_J,
990
                              NO_FORBIDDEN_ID,
991
                              EXTEND_D_ZONE, EXTEND_D_ZONE,
992
993
994
995
                              evalue_threshold, multiplier);

    if (!dSegmented)
      return ;
996

997
998
#define DD_MIN_SEARCH 5

999
    boxes.clear();
1000
    boxes.push_back(box_V);
1001

1002
    if (several_D && (box_D->start - box_V->end >= DD_MIN_SEARCH))
1003
      {
1004
1005
        AlignBox *box_D1 = new AlignBox("4a");

1006
1007
        bool d1 = FineSegmentD(germline,
                               box_V, box_D1, box_D,
1008
                               box_D->ref_nb,
1009
                               EXTEND_D_ZONE, 0,
1010
1011
1012
1013
                               evalue_threshold, multiplier);

        if (d1)
          boxes.push_back(box_D1);
1014
1015
        else
          delete box_D1;
1016
1017
      }

1018
    boxes.push_back(box_D);
1019

1020
    if (several_D && (box_J->start - box_D->end >= DD_MIN_SEARCH))
1021
      {