segment.cpp 29.5 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2
  This file is part of Vidjil <http://www.vidjil.org>
3
  Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016 by Bonsai bioinformatics
4 5 6 7 8
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9 10 11 12 13 14 15 16 17 18 19 20 21 22

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/
Mikaël Salson's avatar
Mikaël Salson committed
23
#include <algorithm>    // std::sort
Mikaël Salson's avatar
Mikaël Salson committed
24 25 26 27
#include <cassert>
#include "segment.h"
#include "tools.h"
#include "affectanalyser.h"
Mikaël Salson's avatar
Mikaël Salson committed
28
#include <sstream>
29
#include <cstring>
Marc Duez's avatar
Marc Duez committed
30
#include <string>
Mikaël Salson's avatar
Mikaël Salson committed
31

32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48

AlignBox::AlignBox() {
  del_left = 0 ;
  start = 0 ;
  end = 0 ;
  del_right = 0 ;

  ref_nb = 0 ;
  ref = "";
  ref_label = "";
}

string AlignBox::getSequence(string sequence) {
  return sequence.substr(start, end-start+1);
}


49 50 51 52 53 54 55 56 57 58 59
ostream &operator<<(ostream &out, const AlignBox &box)
{
  out << "[/" << box.del_left << " " ;
  out << "@" << box.start << " " ;
  out << box.ref_label << "(" << box.ref_nb << ") " ;
  out << "@" << box.end << " " ;
  out << box.del_right << "/]" ;

  return out ;
}

60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
string codeFromBoxes(vector <AlignBox*> boxes, string sequence)
{
  string code = "";

  int n = boxes.size();

  for (int i=0; i<n; i++) {

    if (i>0) {
      code += " " + string_of_int(boxes[i-1]->del_right) + "/"
        // From box_left->end + 1 to box_right->start - 1
        + sequence.substr(boxes[i-1]->end + 1, boxes[i]->start - boxes[i-1]->end - 1)
        + "/" + string_of_int(boxes[i]->del_left) + " " ;
    }

    code += boxes[i]->ref_label ;
  }

  return code;
}
80

81

Mathieu Giraud's avatar
Mathieu Giraud committed
82
Segmenter::~Segmenter() {}
Mikaël Salson's avatar
Mikaël Salson committed
83

Mathieu Giraud's avatar
Mathieu Giraud committed
84
Sequence Segmenter::getSequence() const {
Mikaël Salson's avatar
Mikaël Salson committed
85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
  Sequence s ;
  s.label_full = info ;
  s.label = label + " " + (reversed ? "-" : "+");
  s.sequence = revcomp(sequence, reversed);

  return s ;
}

string Segmenter::getJunction(int l) const {
  assert(isSegmented());

  int start = (getLeft() + getRight())/2 - l/2;
  
  if (start < 0 or start + l > (int)sequence.size())  // TODO: +l ou +l-1 ?
    return "" ;

  return getSequence().sequence.substr(start, l);
}

int Segmenter::getLeft() const {
105
  return box_V->end;
Mikaël Salson's avatar
Mikaël Salson committed
106 107 108
}
  
int Segmenter::getRight() const {
109
  return box_J->start;
Mikaël Salson's avatar
Mikaël Salson committed
110 111 112
}

int Segmenter::getLeftD() const {
113
  return box_D->start;
Mikaël Salson's avatar
Mikaël Salson committed
114 115 116
}
  
int Segmenter::getRightD() const {
117
  return box_D->end;
Mikaël Salson's avatar
Mikaël Salson committed
118 119 120 121 122 123 124 125 126 127 128 129 130 131
}

bool Segmenter::isReverse() const {
  return reversed;
}

bool Segmenter::isSegmented() const {
  return segmented;
}

bool Segmenter::isDSegmented() const {
  return dSegmented;
}

132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150

// E-values

void Segmenter::checkLeftRightEvaluesThreshold(double threshold, int strand)
{
  if (threshold == NO_LIMIT_VALUE)
    return ;

  if (evalue_left >= threshold && evalue_right >= threshold)
    because = UNSEG_TOO_FEW_ZERO ;
  else if ((strand == 1 ? evalue_left : evalue_right) >= threshold)
    because = UNSEG_TOO_FEW_V ;
  else if ((strand == 1 ? evalue_right : evalue_left) >= threshold)
    because = UNSEG_TOO_FEW_J ;
  else if (evalue >= threshold) // left and right are <= threshold, but their sum is > threshold
    because = UNSEG_TOO_FEW_ZERO ;
}


Mikaël Salson's avatar
Mikaël Salson committed
151 152 153 154 155 156 157 158
// Chevauchement

string Segmenter::removeChevauchement()
{
  assert(isSegmented());
  
  string chevauchement = "" ;

159
  if (box_V->end >= box_J->start)
Mikaël Salson's avatar
Mikaël Salson committed
160
    {
161 162 163 164
      int middle = (box_V->end + box_J->start) / 2 ;
      chevauchement = " !ov " + string_of_int (box_V->end - box_J->start + 1);
      box_V->end = middle ;
      box_J->start = middle+1 ;
Mikaël Salson's avatar
Mikaël Salson committed
165 166 167 168 169 170 171 172 173 174 175 176 177 178
    }

  return chevauchement ;
}

// Prettyprint


bool Segmenter::finishSegmentation() 
{
  assert(isSegmented());
  
  string seq = getSequence().sequence;
    
179 180 181 182 183
  seg_V = seq.substr(0, box_V->end+1) ;
  seg_N = seq.substr(box_V->end+1, box_J->start-box_V->end-1) ;  // Twice computed for FineSegmenter, but only once in KmerSegmenter !
  seg_J = seq.substr(box_J->start) ;
  box_D->start=0;
  box_D->end=0;
Mikaël Salson's avatar
Mikaël Salson committed
184 185 186 187 188 189 190 191 192 193 194 195 196

  info = "VJ \t" + string_of_int(FIRST_POS) + " " + info + " " + string_of_int(seq.size() - 1 + FIRST_POS) ;
  info += "\t" + code ;

  info = (reversed ? "- " : "+ ") + info ;

  return true ;
}

bool Segmenter::finishSegmentationD() 
{
  string seq = getSequence().sequence;

197 198 199 200
  seg_V = seq.substr(0, box_V->end+1) ; // From pos. 0 to box_V->end
  seg_J = seq.substr(box_J->start) ;
  seg_N = seq.substr(box_V->end+1, box_J->start-box_V->end-1) ;  // Twice computed for FineSegmenter, but only once in KmerSegmenter !
  seg_D  = seq.substr(box_D->start, box_D->end-box_D->start+1) ; // From Dstart to Dend
Mikaël Salson's avatar
Mikaël Salson committed
201
  
202 203 204 205
  info = "VDJ \t0 " + string_of_int(box_V->end) +
                " " + string_of_int(box_D->start) +
		" " + string_of_int(box_D->end) +
		" " + string_of_int(box_J->start) +
Mikaël Salson's avatar
Mikaël Salson committed
206 207 208 209 210 211 212 213 214
		" " + string_of_int(seq.size()-1+FIRST_POS) ;
		
  info += "\t" + code ;
  
  info = (reversed ? "- " : "+ ") + info ;

  return true ;
}

215 216
string Segmenter::getInfoLine() const
{
217
  string s = "" ;
218 219 220 221 222

  s += (segmented ? "" : "! ") + info ;
  s += " " + info_extra ;
  s += " " + segmented_germline->code ;
  s += " " + string(segmented_mesg[because]) ;
223 224 225 226

  if (evalue > NO_LIMIT_VALUE)
    s += " " + scientific_string_of_double(evalue);

227 228 229 230 231
  if (evalue_left > NO_LIMIT_VALUE)
    s += " " + scientific_string_of_double(evalue_left);
  if (evalue_right > NO_LIMIT_VALUE)
    s += "/" + scientific_string_of_double(evalue_right);

232 233 234
  return s ;
}

235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250
string KmerSegmenter::getInfoLineWithAffects() const
{
   stringstream ss;

   ss << "# "
      << right << setw(3) << score << " "
      << left << setw(30)
      << getInfoLine() ;

   if (getSegmentationStatus() != UNSEG_TOO_SHORT)
     ss << getKmerAffectAnalyser()->toString();

   return ss.str();
}


Mikaël Salson's avatar
Mikaël Salson committed
251 252
ostream &operator<<(ostream &out, const Segmenter &s)
{
253
  out << ">" << s.label << " " ;
254
  out << s.getInfoLine() << endl;
Mikaël Salson's avatar
Mikaël Salson committed
255 256 257 258 259 260 261 262 263

  if (s.segmented)
    {
      out << s.seg_V << endl ;
      out << s.seg_N << endl ;
      out << s.seg_J << endl ;
    }
  else
    {
264
      out << s.getSequence().sequence << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
265 266 267 268 269 270 271 272
    }

  return out ;
}


// KmerSegmenter (Cheap)

273 274
KmerSegmenter::KmerSegmenter() { kaa = 0 ; }

275
KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline, double threshold, int multiplier)
Mikaël Salson's avatar
Mikaël Salson committed
276
{
277 278 279 280
  box_V = new AlignBox();
  box_D = new AlignBox();
  box_J = new AlignBox();

Mikaël Salson's avatar
Mikaël Salson committed
281 282 283
  label = seq.label ;
  sequence = seq.sequence ;
  info = "" ;
284
  info_extra = "seed";
Mikaël Salson's avatar
Mikaël Salson committed
285
  segmented = false;
286
  segmented_germline = germline ;
287
  system = germline->code; // useful ?
288
  reversed = false;
289
  because = NOT_PROCESSED ; // Cause of unsegmentation
290
  score = 0 ;
291
  evalue = NO_LIMIT_VALUE;
292 293
  evalue_left = NO_LIMIT_VALUE;
  evalue_right = NO_LIMIT_VALUE;
294

295
  int s = (size_t)germline->index->getS() ;
Mikaël Salson's avatar
Mikaël Salson committed
296
  int length = sequence.length() ;
Mikaël Salson's avatar
Mikaël Salson committed
297

298
  if (length < s) 
Mikaël Salson's avatar
Mikaël Salson committed
299
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
300 301
      because = UNSEG_TOO_SHORT;
      kaa = NULL;
302
      return ;
Mikaël Salson's avatar
Mikaël Salson committed
303 304
    }
 
305
  kaa = new KmerAffectAnalyser(*(germline->index), sequence);
Mikaël Salson's avatar
Mikaël Salson committed
306 307
  
  // Check strand consistency among the affectations.
Mikaël Salson's avatar
Mikaël Salson committed
308 309 310 311 312 313 314 315 316
  int strand;
  int nb_strand[2] = {0,0};     // In cell 0 we'll put the number of negative
                                // strand, while in cell 1 we'll put the
                                // positives
  for (int i = 0; i < kaa->count(); i++) { 
    KmerAffect it = kaa->getAffectation(i);
    if (! it.isAmbiguous() && ! it.isUnknown()) {
      strand = affect_strand(it.affect);
      nb_strand[(strand + 1) / 2] ++; // (strand+1) / 2 → 0 if strand == -1; 1 if strand == 1
Mikaël Salson's avatar
Mikaël Salson committed
317 318 319
    }
  }

320 321
  score = nb_strand[0] + nb_strand[1] ; // Used only for non-segmented germlines

322 323
  reversed = (nb_strand[0] > nb_strand[1]) ;

324 325
  if ((germline->seg_method == SEG_METHOD_MAX12)
      || (germline->seg_method == SEG_METHOD_MAX1U))
326 327 328 329
    { // Pseudo-germline, MAX12 and MAX1U
      pair <KmerAffect, KmerAffect> max12 ;
      CountKmerAffectAnalyser ckaa(*(germline->index), sequence);

330 331 332 333 334

      set<KmerAffect> forbidden;
      forbidden.insert(KmerAffect::getAmbiguous());
      forbidden.insert(KmerAffect::getUnknown());

335
      if (germline->seg_method == SEG_METHOD_MAX12)
336 337 338
        // MAX12: two maximum k-mers (no unknown)
        {
          max12 = ckaa.max12(forbidden);
339

340 341 342 343 344 345
          if (max12.first.isUnknown() || max12.second.isUnknown())
            {
              because = UNSEG_TOO_FEW_ZERO ;
              return ;
            }
        }
346

347 348
      else
        // MAX1U: the maximum k-mers (no unknown) + unknown
349
        {
350 351 352 353 354 355 356 357 358
          CountKmerAffectAnalyser ckaa(*(germline->index), sequence);
          KmerAffect max = ckaa.max(forbidden);

          if (max.isUnknown())
            {
              because = UNSEG_TOO_FEW_ZERO ;
              return ;
            }
          max12 = make_pair(max, KmerAffect::getUnknown());
359 360
        }

361 362 363 364 365
      pair <KmerAffect, KmerAffect> before_after =  ckaa.sortLeftRight(max12);

      before = before_after.first ;
      after = before_after.second ;

366 367 368
      // This strand computation is only a heuristic, especially for chimera +/- reads
      // Anyway, it allows to gather such reads and their reverse complement into a unique window...
      // ... except when the read is quite different outside the window
369
      strand = reversed ? -1 : 1 ;
370 371 372 373 374
    }

  else
    { // Regular germline

375
  // Test on which strand we are, select the before and after KmerAffects
Mikaël Salson's avatar
Mikaël Salson committed
376
  if (nb_strand[0] == 0 && nb_strand[1] == 0) {
377
    because = UNSEG_TOO_FEW_ZERO ;
378
    return ;
Mikaël Salson's avatar
Mikaël Salson committed
379 380
  } else if (nb_strand[0] > RATIO_STRAND * nb_strand[1]) {
    strand = -1;
381 382
    before = KmerAffect(germline->affect_3, -1); 
    after = KmerAffect(germline->affect_5, -1);
Mikaël Salson's avatar
Mikaël Salson committed
383 384
  } else if (nb_strand[1] > RATIO_STRAND * nb_strand[0]) {
    strand = 1;
385 386
    before = KmerAffect(germline->affect_5, 1); 
    after = KmerAffect(germline->affect_3, 1);    
Mikaël Salson's avatar
Mikaël Salson committed
387 388
  } else {
    // Ambiguous information: we have positive and negative strands
389 390 391 392 393
    // and there is not enough difference to put them apart.
    if (nb_strand[0] + nb_strand[1] >= DETECT_THRESHOLD_STRAND)
      because = UNSEG_STRAND_NOT_CONSISTENT ;
    else
      because = UNSEG_TOO_FEW_ZERO ;
394
    return ;
Mikaël Salson's avatar
Mikaël Salson committed
395 396
  }

397
    } // endif Pseudo-germline
398
 
399
  computeSegmentation(strand, before, after, threshold, multiplier);
Mathieu Giraud's avatar
Mathieu Giraud committed
400
}
Mikaël Salson's avatar
Mikaël Salson committed
401

Mathieu Giraud's avatar
Mathieu Giraud committed
402
KmerSegmenter::~KmerSegmenter() {
403 404
  if (kaa)
    delete kaa;
405 406
}

407 408
KmerMultiSegmenter::KmerMultiSegmenter(Sequence seq, MultiGermline *multigermline, ostream *out_unsegmented,
                                       double threshold, int nb_reads_for_evalue)
409
{
410 411
  bool found_seg = false ; // Found a segmentation
  double best_evalue_seg = NO_LIMIT_VALUE ; // Best evalue, segmented sequences
412
  int best_score_unseg = 0 ; // Best score, unsegmented sequences
413
  the_kseg = NULL;
414
  multi_germline = multigermline;
415
  threshold_nb_expected = threshold;
416 417 418

  // E-value multiplier
  int multiplier = multi_germline->germlines.size() * nb_reads_for_evalue;
419
  
420 421 422 423 424
  // Iterate over the germlines
  for (list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it)
    {
      Germline *germline = *it ;

425
      KmerSegmenter *kseg = new KmerSegmenter(seq, germline, threshold, multiplier);
426
      bool keep_seg = false;
427

428 429 430
      if (out_unsegmented)
        {
          // Debug, display k-mer affectation and segmentation result for this germline
431
          *out_unsegmented << kseg->getInfoLineWithAffects() << endl ;
432 433
        }

434 435
      // Always remember the first kseg
      if (the_kseg == NULL)
436
        keep_seg = true;
437
      
438
      if (kseg->isSegmented())
439 440
        {
          // Yes, it is segmented
441
          // Should we keep the kseg ?
442
          if (!found_seg || (kseg->evalue < best_evalue_seg))
443
            {
444
              keep_seg = true;
445 446 447
              best_evalue_seg = kseg->evalue ;

              found_seg = true;
448
            }
449
        }
450 451 452 453 454 455 456
      else
        {
          // It is not segmented
          // Should we keep the kseg (with the unsegmentation cause) ?
            if (kseg->score > best_score_unseg)
            {              
              best_score_unseg = kseg->score ;
457
              if (!found_seg)
458 459 460 461
                keep_seg = true;
            }
        }
      
462
      if (keep_seg) {
463 464
        if (the_kseg)
          delete the_kseg;
465 466 467 468
        the_kseg = kseg;
      } else {
        delete kseg;
      }
469 470 471
    } // end for (Germlines)
}

472
KmerMultiSegmenter::~KmerMultiSegmenter() {
473 474
  if (the_kseg)
    delete the_kseg;
Mathieu Giraud's avatar
Mathieu Giraud committed
475 476
}

477 478
void KmerSegmenter::computeSegmentation(int strand, KmerAffect before, KmerAffect after,
                                        double threshold, int multiplier) {
479
  // Try to segment, computing 'box_V->end' and 'box_J->start'
480 481
  // If not segmented, put the cause of unsegmentation in 'because'

482
  affect_infos max;
483
  max = kaa->getMaximum(before, after);
Mikaël Salson's avatar
Mikaël Salson committed
484

485 486 487
  // We did not find a good segmentation point
  if (!max.max_found) {
    // We labeled it detected if there were both enough affect_5 and enough affect_3
488 489 490 491 492 493 494 495 496
    bool detected_before = (max.nb_before_left + max.nb_before_right >= DETECT_THRESHOLD);
    bool detected_after = (max.nb_after_left + max.nb_after_right >= DETECT_THRESHOLD);

    if (detected_before && detected_after)
      because = UNSEG_AMBIGUOUS ;
    else if ((strand == 1 && detected_before) || (strand == -1 && detected_after))
      because = UNSEG_TOO_FEW_J ;
    else if ((strand == 1 && detected_after) || (strand == -1 && detected_before))
      because = UNSEG_TOO_FEW_V ;
497
    else
498
      because = UNSEG_TOO_FEW_ZERO ;
499 500 501 502 503

    return ;
  }


504
  // E-values
505 506 507
  pair <double, double> pvalues = kaa->getLeftRightProbabilityAtLeastOrAbove();
  evalue_left = pvalues.first * multiplier ;
  evalue_right = pvalues.second * multiplier ;
508
  evalue = evalue_left + evalue_right ;
509

510
  checkLeftRightEvaluesThreshold(threshold, strand);
511

512
  if (because != NOT_PROCESSED)
513 514
    return ;

515 516
   // There was a good segmentation point

517 518
   box_V->end = max.first_pos_max;
   box_J->start = max.last_pos_max + 1;
519
   if (strand == -1) {
520 521 522
     int tmp = sequence.size() - box_V->end - 1;
     box_V->end = sequence.size() - box_J->start - 1;
     box_J->start = tmp;
523 524 525 526 527 528
   }

  // Yes, it is segmented
  segmented = true;
  because = reversed ? SEG_MINUS : SEG_PLUS ;

529
  info = string_of_int(box_V->end + FIRST_POS) + " " + string_of_int(box_J->start + FIRST_POS)  ;
530 531 532 533 534 535

  // removeChevauchement is called once info was already computed: it is only to output info_extra
  info_extra += removeChevauchement();
  finishSegmentation();

  return ;
Mathieu Giraud's avatar
Mathieu Giraud committed
536
}
Mikaël Salson's avatar
Mikaël Salson committed
537

538
KmerAffectAnalyser *KmerSegmenter::getKmerAffectAnalyser() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
539 540
  return kaa;
}
Mikaël Salson's avatar
Mikaël Salson committed
541

542
int Segmenter::getSegmentationStatus() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
543
  return because;
Mikaël Salson's avatar
Mikaël Salson committed
544 545
}

546 547 548 549 550
void Segmenter::setSegmentationStatus(int status) {
  because = status;
  segmented = (status == SEG_PLUS || status == SEG_MINUS);
}

Mikaël Salson's avatar
Mikaël Salson committed
551 552
// FineSegmenter

553

554
string check_and_resolve_overlap(string seq, int seq_begin, int seq_end,
555 556
                                 AlignBox *box_left, AlignBox *box_right,
                                 Cost segment_cost)
Mikaël Salson's avatar
Mikaël Salson committed
557
{
558
  // Overlap size
559
  int overlap = box_left->end - box_right->start + 1;
560 561 562

  if (overlap > 0)
    {
563 564
      string seq_left = seq.substr(seq_begin, box_left->end - seq_begin + 1);
      string seq_right = seq.substr(box_right->start, seq_end - box_right->start + 1);
565

Mikaël Salson's avatar
Mikaël Salson committed
566 567 568 569
      int score_r[overlap+1];
      int score_l[overlap+1];
      
      //LEFT
570
      DynProg dp_l = DynProg(seq_left, box_left->ref,
Mikaël Salson's avatar
Mikaël Salson committed
571
			   DynProg::Local, segment_cost);
572
      score_l[0] = dp_l.compute();
573

Mikaël Salson's avatar
Mikaël Salson committed
574 575
      
      //RIGHT
576
      // reverse right sequence
577
      string ref_right=string(box_right->ref.rbegin(), box_right->ref.rend());
578 579 580
      seq_right=string(seq_right.rbegin(), seq_right.rend());


581
      DynProg dp_r = DynProg(seq_right, ref_right,
Mikaël Salson's avatar
Mikaël Salson committed
582
			   DynProg::Local, segment_cost);
583
      score_r[0] = dp_r.compute();
584 585 586 587 588 589 590 591 592



      int trim_l[overlap+1];
      int trim_r[overlap+1];

      for(int i=0; i<=overlap; i++) {
        score_l[i] = i < seq_left.size()  ? dp_l.best_score_on_i(seq_left.size()  - i, trim_l + i) : MINUS_INF ;
        score_r[i] = i < seq_right.size() ? dp_r.best_score_on_i(seq_right.size() - i, trim_r + i) : MINUS_INF ;
Mikaël Salson's avatar
Mikaël Salson committed
593
      }
594

595 596 597 598 599 600

// #define DEBUG_OVERLAP
#ifdef DEBUG_OVERLAP
      cout << dp_l ;
      cout << dp_r ;

601 602
      cout << "seq:" << seq_left << "\t\t" << seq_right << endl;
      cout << "ref:" << ref_left << "\t\t" << ref_right << endl;
603 604 605 606 607 608 609 610
      for(int i=0; i<=overlap; i++)
        cout << i << "  left: " << score_l[i] << "/" << trim_l[i] << "     right: " << score_r[i] << "/" << trim_r[i] << endl;
#endif

      int score = MINUS_INF;
      int best_i = 0 ;
      int best_j = 0 ;

Mikaël Salson's avatar
Mikaël Salson committed
611

612
      // Find (i, j), with i+j >= overlap,
613
      // maximizing score_l[j] + score_r[i]
Mikaël Salson's avatar
Mikaël Salson committed
614 615
      for (int i=0; i<=overlap; i++){
	for (int j=overlap-i; j<=overlap; j++){
616
          int score_ij = score_l[i] + score_r[j];
617

618 619 620
	  if (score_ij > score) {
            best_i = i ;
            best_j = j ;
621 622
            box_left->del_right = box_left->ref.size() - trim_l[i];
	    box_right->del_left = box_right->ref.size() - trim_r[j];
623
	    score = score_ij;
Mikaël Salson's avatar
Mikaël Salson committed
624 625 626
	  }
	}
      }
627

628 629
      box_left->end -= best_i ;
      box_right->start += best_j ;
630 631

#ifdef DEBUG_OVERLAP
632
      cout << "overlap: " << overlap << ", " << "best_overlap_split: " << score
633 634
           << "    left: " << best_i << "-" << box_left->del_right << " @" << box_left->end
           << "    right:" << best_j << "-" << box_right->del_left << " @" << box_right->start
635
           << endl;
636
#endif
637 638
    } // end if (overlap > 0)

639 640
  // From box_left->end + 1 to box_right->start - 1
  return seq.substr(box_left->end + 1, box_right->start - box_left->end - 1);
Mikaël Salson's avatar
Mikaël Salson committed
641 642
}

Mikaël Salson's avatar
Mikaël Salson committed
643 644 645 646 647
bool comp_pair (pair<int,int> i,pair<int,int> j)
{
  return ( i.first > j.first);
}

648 649 650 651 652

/**
 * Align a read against a collection of sequences, maximizing the alignment 'score'
 * @param read:         the read
 * @param rep:          a collection of reference sequences
653
 * @param reverse_ref:  if true, reverse the reference sequences (VkVk)
654
 * @param reverse_both: if true, reverse both the read and the reference sequences (J segment)
655
 * @param local:        if true, Local alignment (D segment), otherwise LocalEndWithSomeDeletions and onlyBottomTriangle (V and J segments)
656
 * @param box:          the AligBox to fill
657
 * @param segment_cost: the cost used by the dynamic programing
658
 * @post  box is filled
659 660
 */

661
void align_against_collection(string &read, Fasta &rep, bool reverse_ref, bool reverse_both, bool local,
662
                             AlignBox *box, Cost segment_cost)
Mikaël Salson's avatar
Mikaël Salson committed
663 664 665
{
  
  int best_score = MINUS_INF ;
666
  box->ref_nb = MINUS_INF ;
Mikaël Salson's avatar
Mikaël Salson committed
667 668 669 670
  int best_best_i = (int) string::npos ;
  int best_best_j = (int) string::npos ;
  int best_first_i = (int) string::npos ;
  int best_first_j = (int) string::npos ;
671

Mikaël Salson's avatar
Mikaël Salson committed
672
  vector<pair<int, int> > score_r;
Mikaël Salson's avatar
Mikaël Salson committed
673 674 675

  DynProg::DynProgMode dpMode = DynProg::LocalEndWithSomeDeletions;
  if (local==true) dpMode = DynProg::Local;
676 677 678

  // With reverse_ref, the read is reversed to prevent calling revcomp on each reference sequence
  string sequence_or_rc = revcomp(read, reverse_ref);
Mikaël Salson's avatar
Mikaël Salson committed
679 680 681
  
  for (int r = 0 ; r < rep.size() ; r++)
    {
682
      DynProg dp = DynProg(sequence_or_rc, rep.sequence(r),
Mikaël Salson's avatar
Mikaël Salson committed
683 684 685
			   dpMode, // DynProg::SemiGlobalTrans, 
			   segment_cost, // DNA
			   reverse_both, reverse_both);
686 687 688

      bool onlyBottomTriangle = !local ;
      int score = dp.compute(onlyBottomTriangle, BOTTOM_TRIANGLE_SHIFT);
Mikaël Salson's avatar
Mikaël Salson committed
689
      
Mikaël Salson's avatar
Mikaël Salson committed
690 691 692 693 694 695 696 697 698 699 700
      if (local==true){ 
	dp.backtrack();
      }
      
      if (score > best_score)
	{
	  best_score = score ;
	  best_best_i = dp.best_i ;
	  best_best_j = dp.best_j ;
	  best_first_i = dp.first_i ;
	  best_first_j = dp.first_j ;
701 702
	  box->ref_nb = r ;
	  box->ref_label = rep.label(r) ;
Mikaël Salson's avatar
Mikaël Salson committed
703
	}
Mikaël Salson's avatar
Mikaël Salson committed
704 705
	
	score_r.push_back(make_pair(score, r));
Mathieu Giraud's avatar
Mathieu Giraud committed
706 707 708 709 710 711

	// #define DEBUG_SEGMENT      

#ifdef DEBUG_SEGMENT	
	cout << rep.label(r) << " " << score << " " << dp.best_i << endl ;
#endif
Mikaël Salson's avatar
Mikaël Salson committed
712 713

    }
Mikaël Salson's avatar
Mikaël Salson committed
714
    sort(score_r.begin(),score_r.end(),comp_pair);
Mikaël Salson's avatar
Mikaël Salson committed
715

716 717 718 719
  box->ref = rep.sequence(box->ref_nb);
  box->del_right = reverse_both ? best_best_j : box->ref.size() - best_best_j - 1;
  box->del_left = best_first_j;
  box->start = best_first_i;
Mikaël Salson's avatar
Mikaël Salson committed
720
  
721
  box->score = score_r;
Mathieu Giraud's avatar
Mathieu Giraud committed
722 723 724

#ifdef DEBUG_SEGMENT	
  cout << "best: " << best_labels << " " << best_score ;
725
  cout << "del/del2/begin:" << (box->del_right) << "/" << (box->del_left) << "/" << (box->start) << endl;
Mathieu Giraud's avatar
Mathieu Giraud committed
726 727
  cout << endl;
#endif
728 729 730 731

  if (reverse_ref)
    // Why -1 here and +1 in dynprog.cpp /// best_i = m - best_i + 1 ;
    best_best_i = read.length() - best_best_i - 1 ;
732 733

  box->end = best_best_i ;
Mikaël Salson's avatar
Mikaël Salson committed
734 735 736 737 738 739 740
}

string format_del(int deletions)
{
  return deletions ? *"(" + string_of_int(deletions) + " del)" : "" ;
}

741
FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c,  double threshold, int multiplier)
Mikaël Salson's avatar
Mikaël Salson committed
742
{
743 744 745 746
  box_V = new AlignBox();
  box_D = new AlignBox();
  box_J = new AlignBox();

747 748
  segmented = false;
  dSegmented = false;
749
  because = NOT_PROCESSED ;
750
  segmented_germline = germline ;
751
  info_extra = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
752 753 754
  label = seq.label ;
  sequence = seq.sequence ;
  segment_cost=segment_c;
755
  evalue = NO_LIMIT_VALUE;
756 757
  evalue_left = NO_LIMIT_VALUE;
  evalue_right = NO_LIMIT_VALUE;
Mikaël Salson's avatar
Mikaël Salson committed
758

759 760
  CDR3start = -1;
  CDR3end = -1;
761

762 763 764
  bool reverse_V = false ;
  bool reverse_J = false ;

765
  if ((germline->seg_method == SEG_METHOD_MAX12) || (germline->seg_method == SEG_METHOD_MAX1U))
766
    {
767
      // We check whether this sequence is segmented with MAX12 or MAX1U (with default e-value parameters)
768
      KmerSegmenter *kseg = new KmerSegmenter(seq, germline, THRESHOLD_NB_EXPECTED, 1);
769
      if (kseg->isSegmented())
770
        {
771 772 773 774 775
          reversed = kseg->isReverse();

          KmerAffect left = reversed ? KmerAffect(kseg->after, true) : kseg->before ;
          KmerAffect right = reversed ? KmerAffect(kseg->before, true) : kseg->after ;

776 777
          delete kseg ;

778 779 780
          reverse_V = (left.getStrand() == -1);
          reverse_J = (right.getStrand() == -1);

781
          code = "Unexpected ";
782

783 784 785 786
          code += left.toStringSigns() + germline->index->getLabel(left).name;
          code += "/";
          code += right.toStringSigns() + germline->index->getLabel(right).name;
          info_extra += " " + left.toString() + "/" + right.toString() + " (" + code + ")";
787 788 789 790 791

          if (germline->seg_method == SEG_METHOD_MAX1U)
            return ;

          germline->override_rep5_rep3_from_labels(left, right);
792
        }
793
      else
794 795 796 797
        {
          delete kseg ;
          return ;
        }
798
    }
799

800 801 802 803 804 805 806
  // Strand determination, with KmerSegmenter (with default e-value parameters)
  // Note that we use only the 'strand' component
  // When the KmerSegmenter fails, continue with positive strand
  // TODO: flag to force a strand / to test both strands ?

  KmerSegmenter *kseg = new KmerSegmenter(seq, germline, THRESHOLD_NB_EXPECTED, 1);
  reversed = kseg->isReverse();
807
  delete kseg ;
Mikaël Salson's avatar
Mikaël Salson committed
808
  
809 810
  string sequence_or_rc = revcomp(sequence, reversed); // sequence, possibly reversed

Mathieu Giraud's avatar
Mathieu Giraud committed
811 812

  /* Segmentation */
813
  align_against_collection(sequence_or_rc, germline->rep_5, reverse_V, reverse_V, false,
814
                                        box_V, segment_cost);
815

816
  align_against_collection(sequence_or_rc, germline->rep_3, reverse_J, !reverse_J, false,
817
                                          box_J, segment_cost);
818 819 820 821 822

  // J was run with '!reverseJ', we copy the box informations from right to left
  // Should this directly be handled in align_against_collection() ?
  box_J->start = box_J->end ;
  box_J->del_left = box_J->del_right;
823

824
  /* E-values */
825 826
  evalue_left  = multiplier * sequence.size() * germline->rep_5.totalSize() * segment_cost.toPValue(box_V->score[0].first);
  evalue_right = multiplier * sequence.size() * germline->rep_3.totalSize() * segment_cost.toPValue(box_J->score[0].first);
827
  evalue = evalue_left + evalue_right ;
828 829

  /* Unsegmentation causes */
830
  if (box_V->end == (int) string::npos)
831
    {
832
      evalue_left = BAD_EVALUE ;
833
    }
834
      
835
  if (box_J->start == (int) string::npos)
836
    {
837
      evalue_right = BAD_EVALUE ;
838 839
    }

840 841
  checkLeftRightEvaluesThreshold(threshold, reversed ? -1 : 1);

842 843 844
  if (because != NOT_PROCESSED)
    {
      segmented = false;
845
      info = " @" + string_of_int (box_V->end + FIRST_POS) + "  @" + string_of_int(box_J->start + FIRST_POS) ;
Mikaël Salson's avatar
Mikaël Salson committed
846 847
      return ;
    }
848 849 850 851

  /* The sequence is segmented */
  segmented = true ;
  because = reversed ? SEG_MINUS : SEG_PLUS ;
852

Mikaël Salson's avatar
Mikaël Salson committed
853
    //overlap VJ
854
  seg_N = check_and_resolve_overlap(sequence_or_rc, 0, sequence_or_rc.length(),
855
                                    box_V, box_J, segment_cost);
856

857
  // Why could this happen ?
858 859
      if (box_J->start>=(int) sequence.length())
	  box_J->start=sequence.length()-1;
Mikaël Salson's avatar
Mikaël Salson committed
860

861
  // seg_N will be recomputed in finishSegmentation()
Mikaël Salson's avatar
Mikaël Salson committed
862

863 864 865 866
  vector <AlignBox*> boxes ;
  boxes.push_back(box_V);
  boxes.push_back(box_J);
  code = codeFromBoxes(boxes, sequence);
Mikaël Salson's avatar
Mikaël Salson committed
867

868
  info = string_of_int(box_V->end + FIRST_POS) + " " + string_of_int(box_J->start + FIRST_POS) ;
Mikaël Salson's avatar
Mikaël Salson committed
869 870 871
  finishSegmentation();
}

872
bool FineSegmenter::FineSegmentD(Germline *germline,
873
                                 AlignBox *box_Y, AlignBox *box_DD, AlignBox *box_Z,
874
                                 double evalue_threshold, int multiplier){
875

Mikaël Salson's avatar
Mikaël Salson committed
876
    // Create a zone where to look for D, adding at most EXTEND_D_ZONE nucleotides at each side
877
    int l = box_Y->end - EXTEND_D_ZONE;
Mikaël Salson's avatar
Mikaël Salson committed
878 879 880
    if (l<0) 
      l=0 ;

881
    int r = box_Z->start + EXTEND_D_ZONE;
Mikaël Salson's avatar
Mikaël Salson committed
882

883 884 885 886
    string seq = getSequence().sequence; // segmented sequence, possibly rev-comped

    if (r > (int) seq.length())
      r = seq.length();
Mikaël Salson's avatar
Mikaël Salson committed
887
      
888
    string str = seq.substr(l, r-l);
Mikaël Salson's avatar
Mikaël Salson committed
889 890

    // Align
891
    align_against_collection(str, germline->rep_4, false, false, true,
892
                                           box_DD, segment_cost);
893

894 895 896 897
    box_DD->start += l ;
    box_DD->end += l ;

    float evalue_D = multiplier * (r-l) * germline->rep_4.totalSize() * segment_cost.toPValue(box_DD->score[0].first);
898

899
    if (evalue_D > evalue_threshold)
900
      return false;
Mikaël Salson's avatar
Mikaël Salson committed
901 902
    
    //overlap VD
903 904
    seg_N1 = check_and_resolve_overlap(seq, 0, box_DD->end,
                                       box_Y, box_DD, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
905 906
    
    //overlap DJ
907 908
    seg_N2 = check_and_resolve_overlap(seq, box_DD->start, seq.length(),
                                       box_DD, box_Z, segment_cost);
909 910 911 912 913 914 915 916 917

    return true;
}

void FineSegmenter::FineSegmentD(Germline *germline, double evalue_threshold, int multiplier){

  if (segmented){

    dSegmented = FineSegmentD(germline,
918
                              box_V, box_D, box_J,
919 920 921 922
                              evalue_threshold, multiplier);

    if (!dSegmented)
      return ;
923

924 925 926 927 928 929
    vector <AlignBox*> boxes ;
    boxes.push_back(box_V);
    boxes.push_back(box_D);
    boxes.push_back(box_J);
    code = codeFromBoxes(boxes, sequence);

Mikaël Salson's avatar
Mikaël Salson committed
930 931 932
    finishSegmentationD();
  }
}
Mikaël Salson's avatar
Mikaël Salson committed
933

Marc Duez's avatar
Marc Duez committed
934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952
void FineSegmenter::findCDR3(){
    string str = getSequence().sequence;
    
    list<string> codon_start;
    codon_start.push_back("TGT");
    codon_start.push_back("TGC");
    
    list<string> codon_end;
    codon_end.push_back("TTT");
    codon_end.push_back("TTC");
    codon_end.push_back("TGG");
    
    list<int> p_start;
    list<int> p_end;

    size_t loc;
    std::list<string>::const_iterator it;
    for (it = codon_start.begin(); it != codon_start.end(); ++it) {//filter 1 : start codon must be in V
        loc = 0;
953
        while ( loc != string::npos && loc < (size_t)box_V->end){
Marc Duez's avatar
Marc Duez committed
954
            loc = str.find(*it, loc+3);
955
            if (loc != string::npos && loc < (size_t)box_V->end) {
Marc Duez's avatar
Marc Duez committed
956 957 958 959 960 961
                p_start.push_front(loc);
            }
        }
    }

    for (it = codon_end.begin(); it != codon_end.end(); ++it) {//filter 2 : end codon must be in J
962
        loc = box_J->start;
Marc Duez's avatar
Marc Duez committed
963 964 965 966 967 968 969 970
        while ( loc != string::npos){
            loc = str.find(*it, loc+3);
            if (loc != string::npos) {
                p_end.push_back(loc);
            }
        }
    }

971 972
    CDR3start = -1;
    CDR3end = -1;
Marc Duez's avatar
Marc Duez committed
973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988